| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2017-06-03 15:20:36 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-06-03 15:20:36 +0000 |
| commit | d288ef4c1788d3a951a7558c68312c2d320612b1 (patch) | |
| tree | ece909a5200f95f85f0813599a9500620f4d9217 /lib | |
| parent | f382538d471e38a9b98f016c4caebd24c8d60b62 (diff) | |
| download | src-d288ef4c1788d3a951a7558c68312c2d320612b1.tar.gz, src-d288ef4c1788d3a951a7558c68312c2d320612b1.zip | |
Diffstat (limited to 'lib')
99 files changed, 4411 insertions, 1564 deletions
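The first hunk below (lib/Analysis/ConstantFolding.cpp) teaches ConstantFoldCompareInstOperands to fold an icmp whose right-hand side is a constant expression by swapping the operands together with the predicate and retrying. Here is a minimal standalone sketch of why that swap is sound; the Pred enum and helpers are illustrative stand-ins, not LLVM's API (the patch itself relies on ICmpInst::getSwappedPredicate):

```cpp
#include <cassert>
#include <initializer_list>

// Illustrative stand-ins for the icmp predicates; not LLVM's definitions.
enum Pred { EQ, NE, SLT, SGT };

// Mirrors the idea behind ICmpInst::getSwappedPredicate: return the
// predicate that yields the same result once the operands are exchanged.
Pred swappedPred(Pred P) {
  switch (P) {
  case SLT: return SGT; // (slt a, b) holds exactly when (sgt b, a) does
  case SGT: return SLT;
  default:  return P;   // eq and ne are symmetric
  }
}

bool icmp(Pred P, int A, int B) {
  switch (P) {
  case EQ:  return A == B;
  case NE:  return A != B;
  case SLT: return A < B;
  case SGT: return A > B;
  }
  return false;
}

int main() {
  // The fold added in the hunk: when only the RHS is a foldable constant
  // expression, evaluating the swapped comparison gives the same answer.
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      for (Pred P : {EQ, NE, SLT, SGT})
        assert(icmp(P, A, B) == icmp(swappedPred(P), B, A));
  return 0;
}
```

The check is exhaustive over a small grid, so the program exits silently; the point is that swapping the predicate along with the operands is an identity on the comparison's truth value, which is what makes the retry in the hunk safe.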
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 6a1af87450c9..a906770dbb34 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1170,7 +1170,9 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, const DataLayout &DL, const TargetLibraryInfo *TLI) { // fold: icmp (inttoptr x), null -> icmp x, 0 + // fold: icmp null, (inttoptr x) -> icmp 0, x // fold: icmp (ptrtoint x), 0 -> icmp x, null + // fold: icmp 0, (ptrtoint x) -> icmp null, x // fold: icmp (inttoptr x), (inttoptr y) -> icmp trunc/zext x, trunc/zext y // fold: icmp (ptrtoint x), (ptrtoint y) -> icmp x, y // @@ -1240,6 +1242,11 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL); } + } else if (isa<ConstantExpr>(Ops1)) { + // If RHS is a constant expression, but the left side isn't, swap the + // operands and try again. + Predicate = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)Predicate); + return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI); } return ConstantExpr::getCompare(Predicate, Ops0, Ops1); diff --git a/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/lib/Analysis/IndirectCallPromotionAnalysis.cpp index 3da33ac71421..ed233d201537 100644 --- a/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -43,7 +43,7 @@ static cl::opt<unsigned> // The percent threshold for the direct-call target (this call site vs the // total call count) for it to be considered as the promotion target. static cl::opt<unsigned> - ICPPercentThreshold("icp-percent-threshold", cl::init(33), cl::Hidden, + ICPPercentThreshold("icp-percent-threshold", cl::init(30), cl::Hidden, cl::ZeroOrMore, cl::desc("The percentage threshold for the promotion")); diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 4702569126c6..77c87928728a 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -54,11 +54,6 @@ static cl::opt<int> cl::init(45), cl::desc("Threshold for inlining cold callsites")); -static cl::opt<bool> - EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden, - cl::init(false), - cl::desc("Enable generic switch cost model")); - // We introduce this threshold to help performance of instrumentation based // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. @@ -1015,83 +1010,68 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { if (isa<ConstantInt>(V)) return true; - if (EnableGenericSwitchCost) { - // Assume the most general case where the swith is lowered into - // either a jump table, bit test, or a balanced binary tree consisting of - // case clusters without merging adjacent clusters with the same - // destination. We do not consider the switches that are lowered with a mix - // of jump table/bit test/binary search tree. The cost of the switch is - // proportional to the size of the tree or the size of jump table range. - - // Exit early for a large switch, assuming one case needs at least one - // instruction. - // FIXME: This is not true for a bit test, but ignore such case for now to - // save compile-time. 
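// Illustrative aside, not part of the patch (it assumes
// InlineConstants::InstrCost == 5, its value at the time): the lower-bound
// early exit that follows lets a 200-case switch be rejected outright, since
// 200 * 5 = 1000 dwarfs inline thresholds of a few hundred. For switches that
// survive, the closed form derived further down in this hunk predicts, e.g.,
// 3 * 8 / 2 - 1 = 11 compare-and-branch pairs for 8 case clusters.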
- int64_t CostLowerBound = - std::min((int64_t)INT_MAX, - (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); - - if (CostLowerBound > Threshold) { - Cost = CostLowerBound; - return false; - } + // Assume the most general case where the swith is lowered into + // either a jump table, bit test, or a balanced binary tree consisting of + // case clusters without merging adjacent clusters with the same + // destination. We do not consider the switches that are lowered with a mix + // of jump table/bit test/binary search tree. The cost of the switch is + // proportional to the size of the tree or the size of jump table range. + // + // NB: We convert large switches which are just used to initialize large phi + // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent + // inlining those. It will prevent inlining in cases where the optimization + // does not (yet) fire. - unsigned JumpTableSize = 0; - unsigned NumCaseCluster = - TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); + // Exit early for a large switch, assuming one case needs at least one + // instruction. + // FIXME: This is not true for a bit test, but ignore such case for now to + // save compile-time. + int64_t CostLowerBound = + std::min((int64_t)INT_MAX, + (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); - // If suitable for a jump table, consider the cost for the table size and - // branch to destination. - if (JumpTableSize) { - int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + - 4 * InlineConstants::InstrCost; - Cost = std::min((int64_t)INT_MAX, JTCost + Cost); - return false; - } + if (CostLowerBound > Threshold) { + Cost = CostLowerBound; + return false; + } - // Considering forming a binary search, we should find the number of nodes - // which is same as the number of comparisons when lowered. For a given - // number of clusters, n, we can define a recursive function, f(n), to find - // the number of nodes in the tree. The recursion is : - // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, - // and f(n) = n, when n <= 3. - // This will lead a binary tree where the leaf should be either f(2) or f(3) - // when n > 3. So, the number of comparisons from leaves should be n, while - // the number of non-leaf should be : - // 2^(log2(n) - 1) - 1 - // = 2^log2(n) * 2^-1 - 1 - // = n / 2 - 1. - // Considering comparisons from leaf and non-leaf nodes, we can estimate the - // number of comparisons in a simple closed form : - // n + n / 2 - 1 = n * 3 / 2 - 1 - if (NumCaseCluster <= 3) { - // Suppose a comparison includes one compare and one conditional branch. - Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; - return false; - } - int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1; - uint64_t SwitchCost = - ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost); + unsigned JumpTableSize = 0; + unsigned NumCaseCluster = + TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); + + // If suitable for a jump table, consider the cost for the table size and + // branch to destination. + if (JumpTableSize) { + int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + + 4 * InlineConstants::InstrCost; + Cost = std::min((int64_t)INT_MAX, JTCost + Cost); return false; } - // Use a simple switch cost model where we accumulate a cost proportional to - // the number of distinct successor blocks. 
This fan-out in the CFG cannot - // be represented for free even if we can represent the core switch as a - // jumptable that takes a single instruction. - /// - // NB: We convert large switches which are just used to initialize large phi - // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent - // inlining those. It will prevent inlining in cases where the optimization - // does not (yet) fire. - SmallPtrSet<BasicBlock *, 8> SuccessorBlocks; - SuccessorBlocks.insert(SI.getDefaultDest()); - for (auto Case : SI.cases()) - SuccessorBlocks.insert(Case.getCaseSuccessor()); - // Add cost corresponding to the number of distinct destinations. The first - // we model as free because of fallthrough. - Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost; + // Considering forming a binary search, we should find the number of nodes + // which is same as the number of comparisons when lowered. For a given + // number of clusters, n, we can define a recursive function, f(n), to find + // the number of nodes in the tree. The recursion is : + // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, + // and f(n) = n, when n <= 3. + // This will lead a binary tree where the leaf should be either f(2) or f(3) + // when n > 3. So, the number of comparisons from leaves should be n, while + // the number of non-leaf should be : + // 2^(log2(n) - 1) - 1 + // = 2^log2(n) * 2^-1 - 1 + // = n / 2 - 1. + // Considering comparisons from leaf and non-leaf nodes, we can estimate the + // number of comparisons in a simple closed form : + // n + n / 2 - 1 = n * 3 / 2 - 1 + if (NumCaseCluster <= 3) { + // Suppose a comparison includes one compare and one conditional branch. + Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; + return false; + } + int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1; + uint64_t SwitchCost = + ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; + Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost); return false; } diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index a2b9015a8a1d..6a9ae6440ace 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -662,13 +662,13 @@ namespace { bool solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB); bool solveBlockValueSelect(LVILatticeVal &BBLV, SelectInst *S, BasicBlock *BB); - bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, Instruction *BBI, + bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, BinaryOperator *BBI, BasicBlock *BB); - bool solveBlockValueCast(LVILatticeVal &BBLV, Instruction *BBI, + bool solveBlockValueCast(LVILatticeVal &BBLV, CastInst *CI, BasicBlock *BB); void intersectAssumeOrGuardBlockValueConstantRange(Value *Val, LVILatticeVal &BBLV, - Instruction *BBI); + Instruction *BBI); void solve(); @@ -849,12 +849,12 @@ bool LazyValueInfoImpl::solveBlockValueImpl(LVILatticeVal &Res, return true; } if (BBI->getType()->isIntegerTy()) { - if (isa<CastInst>(BBI)) - return solveBlockValueCast(Res, BBI, BB); - + if (auto *CI = dyn_cast<CastInst>(BBI)) + return solveBlockValueCast(Res, CI, BB); + BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI); if (BO && isa<ConstantInt>(BO->getOperand(1))) - return solveBlockValueBinaryOp(Res, BBI, BB); + return solveBlockValueBinaryOp(Res, BO, BB); } DEBUG(dbgs() << " compute BB '" << BB->getName() @@ -1168,9 +1168,9 @@ bool LazyValueInfoImpl::solveBlockValueSelect(LVILatticeVal &BBLV, } bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV, - Instruction 
*BBI, - BasicBlock *BB) { - if (!BBI->getOperand(0)->getType()->isSized()) { + CastInst *CI, + BasicBlock *BB) { + if (!CI->getOperand(0)->getType()->isSized()) { // Without knowing how wide the input is, we can't analyze it in any useful // way. BBLV = LVILatticeVal::getOverdefined(); @@ -1180,7 +1180,7 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV, // Filter out casts we don't know how to reason about before attempting to // recurse on our operand. This can cut a long search short if we know we're // not going to be able to get any useful information anways. - switch (BBI->getOpcode()) { + switch (CI->getOpcode()) { case Instruction::Trunc: case Instruction::SExt: case Instruction::ZExt: @@ -1197,44 +1197,43 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV, // Figure out the range of the LHS. If that fails, we still apply the // transfer rule on the full set since we may be able to locally infer // interesting facts. - if (!hasBlockValue(BBI->getOperand(0), BB)) - if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0)))) + if (!hasBlockValue(CI->getOperand(0), BB)) + if (pushBlockValue(std::make_pair(BB, CI->getOperand(0)))) // More work to do before applying this transfer rule. return false; const unsigned OperandBitWidth = - DL.getTypeSizeInBits(BBI->getOperand(0)->getType()); + DL.getTypeSizeInBits(CI->getOperand(0)->getType()); ConstantRange LHSRange = ConstantRange(OperandBitWidth); - if (hasBlockValue(BBI->getOperand(0), BB)) { - LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB); - intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal, - BBI); + if (hasBlockValue(CI->getOperand(0), BB)) { + LVILatticeVal LHSVal = getBlockValue(CI->getOperand(0), BB); + intersectAssumeOrGuardBlockValueConstantRange(CI->getOperand(0), LHSVal, + CI); if (LHSVal.isConstantRange()) LHSRange = LHSVal.getConstantRange(); } - const unsigned ResultBitWidth = - cast<IntegerType>(BBI->getType())->getBitWidth(); + const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth(); // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. - auto CastOp = (Instruction::CastOps) BBI->getOpcode(); - BBLV = LVILatticeVal::getRange(LHSRange.castOp(CastOp, ResultBitWidth)); + BBLV = LVILatticeVal::getRange(LHSRange.castOp(CI->getOpcode(), + ResultBitWidth)); return true; } bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV, - Instruction *BBI, + BinaryOperator *BO, BasicBlock *BB) { - assert(BBI->getOperand(0)->getType()->isSized() && + assert(BO->getOperand(0)->getType()->isSized() && "all operands to binary operators are sized"); // Filter out operators we don't know how to reason about before attempting to // recurse on our operand(s). This can cut a long search short if we know - // we're not going to be able to get any useful information anways. - switch (BBI->getOpcode()) { + // we're not going to be able to get any useful information anyways. + switch (BO->getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1256,29 +1255,29 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV, // Figure out the range of the LHS. If that fails, use a conservative range, // but apply the transfer rule anyways. 
This lets us pick up facts from // expressions like "and i32 (call i32 @foo()), 32" - if (!hasBlockValue(BBI->getOperand(0), BB)) - if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0)))) + if (!hasBlockValue(BO->getOperand(0), BB)) + if (pushBlockValue(std::make_pair(BB, BO->getOperand(0)))) // More work to do before applying this transfer rule. return false; const unsigned OperandBitWidth = - DL.getTypeSizeInBits(BBI->getOperand(0)->getType()); + DL.getTypeSizeInBits(BO->getOperand(0)->getType()); ConstantRange LHSRange = ConstantRange(OperandBitWidth); - if (hasBlockValue(BBI->getOperand(0), BB)) { - LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB); - intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal, - BBI); + if (hasBlockValue(BO->getOperand(0), BB)) { + LVILatticeVal LHSVal = getBlockValue(BO->getOperand(0), BB); + intersectAssumeOrGuardBlockValueConstantRange(BO->getOperand(0), LHSVal, + BO); if (LHSVal.isConstantRange()) LHSRange = LHSVal.getConstantRange(); } - ConstantInt *RHS = cast<ConstantInt>(BBI->getOperand(1)); + ConstantInt *RHS = cast<ConstantInt>(BO->getOperand(1)); ConstantRange RHSRange = ConstantRange(RHS->getValue()); // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. - auto BinOp = (Instruction::BinaryOps) BBI->getOpcode(); + Instruction::BinaryOps BinOp = BO->getOpcode(); BBLV = LVILatticeVal::getRange(LHSRange.binaryOp(BinOp, RHSRange)); return true; } diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 26706f5509ba..3253f27c010d 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -275,7 +275,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. 
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* LiveRoot = */ false); + /* Live = */ false); auto FuncSummary = llvm::make_unique<FunctionSummary>( Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(), TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(), @@ -295,7 +295,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* LiveRoot = */ false); + /* Live = */ false); auto GVarSummary = llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -308,7 +308,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet<GlobalValue::GUID> &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* LiveRoot = */ false); + /* Live = */ false); auto AS = llvm::make_unique<AliasSummary>(Flags, ArrayRef<ValueInfo>{}); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -323,7 +323,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) { if (ValueInfo VI = Index.getValueInfo(GlobalValue::getGUID(Name))) for (auto &Summary : VI.getSummaryList()) - Summary->setLiveRoot(); + Summary->setLive(true); } ModuleSummaryIndex llvm::buildModuleSummaryIndex( @@ -423,8 +423,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( return; assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, - /* NotEligibleToImport */ true, - /* LiveRoot */ true); + /* NotEligibleToImport = */ true, + /* Live = */ true); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (isa<Function>(GV)) { diff --git a/lib/Analysis/OrderedBasicBlock.cpp b/lib/Analysis/OrderedBasicBlock.cpp index 0f0016f22cc0..a04c0aef04be 100644 --- a/lib/Analysis/OrderedBasicBlock.cpp +++ b/lib/Analysis/OrderedBasicBlock.cpp @@ -55,7 +55,7 @@ bool OrderedBasicBlock::comesBefore(const Instruction *A, assert(II != IE && "Instruction not found?"); assert((Inst == A || Inst == B) && "Should find A or B"); LastInstFound = II; - return Inst == A; + return Inst != B; } /// \brief Find out whether \p A dominates \p B, meaning whether \p A diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp index 82107cb18025..b38e6225c840 100644 --- a/lib/Analysis/RegionPass.cpp +++ b/lib/Analysis/RegionPass.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/RegionPass.h" #include "llvm/Analysis/RegionIterator.h" +#include "llvm/IR/OptBisect.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -280,3 +281,18 @@ Pass *RegionPass::createPrinterPass(raw_ostream &O, const std::string &Banner) const { return new PrintRegionPass(Banner, O); } + +bool RegionPass::skipRegion(Region &R) const { + Function &F = *R.getEntry()->getParent(); + if (!F.getContext().getOptBisect().shouldRunPass(this, R)) + return true; + + if (F.hasFnAttribute(Attribute::OptimizeNone)) { + // Report this only once per function. 
+ if (R.getEntry() == &F.getEntryBlock()) + DEBUG(dbgs() << "Skipping pass '" << getPassName() + << "' on function " << F.getName() << "\n"); + return true; + } + return false; +} diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 686c94687669..fffa9045b2fd 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -865,11 +865,11 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits RawFlags = RawFlags >> 4; bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3; - // The LiveRoot flag wasn't introduced until version 3. For dead stripping + // The Live flag wasn't introduced until version 3. For dead stripping // to work correctly on earlier versions, we must conservatively treat all // values as live. - bool LiveRoot = (RawFlags & 0x2) || Version < 3; - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot); + bool Live = (RawFlags & 0x2) || Version < 3; + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index a402b4ddd462..9043b8c12d25 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -351,7 +351,8 @@ public: /// Calls the callback for each value GUID and summary to be written to /// bitcode. This hides the details of whether they are being pulled from the /// entire index or just those in a provided ModuleToSummariesForIndex map. - void forEachSummary(std::function<void(GVInfo)> Callback) { + template<typename Functor> + void forEachSummary(Functor Callback) { if (ModuleToSummariesForIndex) { for (auto &M : *ModuleToSummariesForIndex) for (auto &Summary : M.second) @@ -363,6 +364,29 @@ public: } } + /// Calls the callback for each entry in the modulePaths StringMap that + /// should be written to the module path string table. This hides the details + /// of whether they are being pulled from the entire index or just those in a + /// provided ModuleToSummariesForIndex map. + template <typename Functor> void forEachModule(Functor Callback) { + if (ModuleToSummariesForIndex) { + for (const auto &M : *ModuleToSummariesForIndex) { + const auto &MPI = Index.modulePaths().find(M.first); + if (MPI == Index.modulePaths().end()) { + // This should only happen if the bitcode file was empty, in which + // case we shouldn't be importing (the ModuleToSummariesForIndex + // would only include the module we are writing and index for). + assert(ModuleToSummariesForIndex->size() == 1); + continue; + } + Callback(*MPI); + } + } else { + for (const auto &MPSE : Index.modulePaths()) + Callback(MPSE); + } + } + /// Main entry point for writing a combined index to bitcode. void write(); @@ -370,14 +394,6 @@ private: void writeModStrings(); void writeCombinedGlobalValueSummary(); - /// Indicates whether the provided \p ModulePath should be written into - /// the module string table, e.g. if full index written or if it is in - /// the provided subset. 
- bool doIncludeModule(StringRef ModulePath) { - return !ModuleToSummariesForIndex || - ModuleToSummariesForIndex->count(ModulePath); - } - Optional<unsigned> getValueId(GlobalValue::GUID ValGUID) { auto VMI = GUIDToValueIdMap.find(ValGUID); if (VMI == GUIDToValueIdMap.end()) @@ -864,7 +880,7 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { uint64_t RawFlags = 0; RawFlags |= Flags.NotEligibleToImport; // bool - RawFlags |= (Flags.LiveRoot << 1); + RawFlags |= (Flags.Live << 1); // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. @@ -968,19 +984,18 @@ void ModuleBitcodeWriter::writeValueSymbolTableForwardDecl() { enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 }; /// Determine the encoding to use for the given string name and length. -static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) { +static StringEncoding getStringEncoding(StringRef Str) { bool isChar6 = true; - for (const char *C = Str, *E = C + StrLen; C != E; ++C) { + for (char C : Str) { if (isChar6) - isChar6 = BitCodeAbbrevOp::isChar6(*C); - if ((unsigned char)*C & 128) + isChar6 = BitCodeAbbrevOp::isChar6(C); + if ((unsigned char)C & 128) // don't bother scanning the rest. return SE_Fixed8; } if (isChar6) return SE_Char6; - else - return SE_Fixed7; + return SE_Fixed7; } /// Emit top-level description of module, including target triple, inline asm, @@ -1073,8 +1088,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { SmallVector<unsigned, 64> Vals; // Emit the module's source file name. { - StringEncoding Bits = getStringEncoding(M.getSourceFileName().data(), - M.getSourceFileName().size()); + StringEncoding Bits = getStringEncoding(M.getSourceFileName()); BitCodeAbbrevOp AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8); if (Bits == SE_Char6) AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Char6); @@ -2790,8 +2804,7 @@ void ModuleBitcodeWriter::writeFunctionLevelValueSymbolTable( for (const ValueName &Name : VST) { // Figure out the encoding to use for the name. - StringEncoding Bits = - getStringEncoding(Name.getKeyData(), Name.getKeyLength()); + StringEncoding Bits = getStringEncoding(Name.getKey()); unsigned AbbrevToUse = VST_ENTRY_8_ABBREV; NameVals.push_back(VE.getValueID(Name.getValue())); @@ -3149,41 +3162,33 @@ void IndexBitcodeWriter::writeModStrings() { unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv)); SmallVector<unsigned, 64> Vals; - for (const auto &MPSE : Index.modulePaths()) { - if (!doIncludeModule(MPSE.getKey())) - continue; - StringEncoding Bits = - getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size()); - unsigned AbbrevToUse = Abbrev8Bit; - if (Bits == SE_Char6) - AbbrevToUse = Abbrev6Bit; - else if (Bits == SE_Fixed7) - AbbrevToUse = Abbrev7Bit; - - Vals.push_back(MPSE.getValue().first); - - for (const auto P : MPSE.getKey()) - Vals.push_back((unsigned char)P); - - // Emit the finished record. - Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse); - - Vals.clear(); - // Emit an optional hash for the module now - auto &Hash = MPSE.getValue().second; - bool AllZero = true; // Detect if the hash is empty, and do not generate it - for (auto Val : Hash) { - if (Val) - AllZero = false; - Vals.push_back(Val); - } - if (!AllZero) { - // Emit the hash record. 
- Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash); - } + forEachModule( + [&](const StringMapEntry<std::pair<uint64_t, ModuleHash>> &MPSE) { + StringRef Key = MPSE.getKey(); + const auto &Value = MPSE.getValue(); + StringEncoding Bits = getStringEncoding(Key); + unsigned AbbrevToUse = Abbrev8Bit; + if (Bits == SE_Char6) + AbbrevToUse = Abbrev6Bit; + else if (Bits == SE_Fixed7) + AbbrevToUse = Abbrev7Bit; + + Vals.push_back(Value.first); + Vals.append(Key.begin(), Key.end()); + + // Emit the finished record. + Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse); + + // Emit an optional hash for the module now + const auto &Hash = Value.second; + if (llvm::any_of(Hash, [](uint32_t H) { return H; })) { + Vals.assign(Hash.begin(), Hash.end()); + // Emit the hash record. + Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash); + } - Vals.clear(); - } + Vals.clear(); + }); Stream.ExitBlock(); } diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp index 20e1467b30c3..c2ad9db81cfd 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp @@ -194,6 +194,10 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF, // some variables. for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg()) { + // Ignore call instructions that claim to clobber SP. The AArch64 + // backend does this for aggregate function arguments. + if (MI.isCall() && MO.getReg() == SP) + continue; // If this is a virtual register, only clobber it since it doesn't // have aliases. if (TRI->isVirtualRegister(MO.getReg())) diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 4d30c6574b12..256a0c95d365 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -77,6 +77,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePostRASchedulerPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeProcessImplicitDefsPass(Registry); + initializeRABasicPass(Registry); initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index 23812a2a2344..3603f9b7ed93 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -556,6 +556,10 @@ bool GlobalMerge::doInitialization(Module &M) { if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection()) continue; + // It's not safe to merge globals that may be preempted + if (TM && !TM->shouldAssumeDSOLocal(M, &GV)) + continue; + if (!(MergeExternalGlobals && GV.hasExternalLinkage()) && !GV.hasInternalLinkage()) continue; diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index 0dc1079b2ad4..cde6ccd29dfd 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -198,13 +198,12 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { } void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); if (!MBB.succ_empty()) { - const MachineFunction &MF = *MBB.getParent(); addPristines(*this, MF); addLiveOutsNoPristines(MBB); } else if (MBB.isReturnBlock()) { // For the return block: Add all callee saved registers. 
- const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.isCalleeSavedInfoValid()) addCalleeSavedRegs(*this, MF); diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp index dff555f49565..3746b74e0528 100644 --- a/lib/CodeGen/LiveRegUnits.cpp +++ b/lib/CodeGen/LiveRegUnits.cpp @@ -12,11 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LiveRegUnits.h" + #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -81,46 +83,50 @@ void LiveRegUnits::accumulateBackward(const MachineInstr &MI) { } /// Add live-in registers of basic block \p MBB to \p LiveUnits. -static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { +static void addBlockLiveIns(LiveRegUnits &LiveUnits, + const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask); } -static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addLiveIns(LiveUnits, *Succ); +/// Adds all callee saved registers to \p LiveUnits. +static void addCalleeSavedRegs(LiveRegUnits &LiveUnits, + const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) + LiveUnits.addReg(*CSR); } -/// Add pristine registers to the given \p LiveUnits. This function removes -/// actually saved callee save registers when \p InPrologueEpilogue is false. -static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF, - const MachineFrameInfo &MFI, - const TargetRegisterInfo &TRI) { +/// Adds pristine registers to the given \p LiveUnits. Pristine registers are +/// callee saved registers that are unused in the function. +static void addPristines(LiveRegUnits &LiveUnits, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveUnits, MF); + /// Remove the ones that are not saved/restored; they are pristine. for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveUnits.removeReg(Info.getReg()); } void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) - addReg(*I); - if (!MBB.isReturnBlock()) - removeSavedRegs(*this, MF, MFI, *TRI); + if (!MBB.succ_empty()) { + addPristines(*this, MF); + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*this, *Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. 
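// Illustrative aside, not part of the patch: this deliberately includes the
// pristine registers, i.e. callee-saved registers the function never spills.
// They still carry the caller's values and must be intact at the return, so
// the return block adds every callee-saved register rather than only the
// saved-and-restored ones.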
+ const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); } - ::addLiveOuts(*this, MBB); } void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) - addReg(*I); - if (&MBB != &MF.front()) - removeSavedRegs(*this, MF, MFI, *TRI); - } - ::addLiveIns(*this, MBB); + addPristines(*this, MF); + addBlockLiveIns(*this, MBB); } diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp index 71ad4e6aa7f5..2402ffdbbcb1 100644 --- a/lib/CodeGen/MachineRegionInfo.cpp +++ b/lib/CodeGen/MachineRegionInfo.cpp @@ -1,7 +1,19 @@ -#include "llvm/CodeGen/MachineRegionInfo.h" +//===- lib/Codegen/MachineRegionInfo.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegionInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #define DEBUG_TYPE "machine-region-info" @@ -11,36 +23,29 @@ STATISTIC(numMachineRegions, "The # of machine regions"); STATISTIC(numMachineSimpleRegions, "The # of simple machine regions"); namespace llvm { + template class RegionBase<RegionTraits<MachineFunction>>; template class RegionNodeBase<RegionTraits<MachineFunction>>; template class RegionInfoBase<RegionTraits<MachineFunction>>; -} + +} // end namespace llvm //===----------------------------------------------------------------------===// // MachineRegion implementation -// MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit, MachineRegionInfo* RI, MachineDominatorTree *DT, MachineRegion *Parent) : - RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) { + RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {} -} - -MachineRegion::~MachineRegion() { } +MachineRegion::~MachineRegion() = default; //===----------------------------------------------------------------------===// // MachineRegionInfo implementation -// - -MachineRegionInfo::MachineRegionInfo() : - RegionInfoBase<RegionTraits<MachineFunction>>() { - -} -MachineRegionInfo::~MachineRegionInfo() { +MachineRegionInfo::MachineRegionInfo() = default; -} +MachineRegionInfo::~MachineRegionInfo() = default; void MachineRegionInfo::updateStatistics(MachineRegion *R) { ++numMachineRegions; @@ -73,9 +78,7 @@ MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) { initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry()); } -MachineRegionInfoPass::~MachineRegionInfoPass() { - -} +MachineRegionInfoPass::~MachineRegionInfoPass() = default; bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { releaseMemory(); @@ -137,8 +140,9 @@ INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, // the link time optimization. 
namespace llvm { - FunctionPass *createMachineRegionInfoPass() { - return new MachineRegionInfoPass(); - } + +FunctionPass *createMachineRegionInfoPass() { + return new MachineRegionInfoPass(); } +} // end namespace llvm diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 265f93c363ca..f6dbf667cf02 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -36,6 +36,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" @@ -909,17 +910,43 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } } - // Generic loads and stores must have a single MachineMemOperand - // describing that access. - if ((MI->getOpcode() == TargetOpcode::G_LOAD || - MI->getOpcode() == TargetOpcode::G_STORE) && - !MI->hasOneMemOperand()) - report("Generic instruction accessing memory must have one mem operand", - MI); - StringRef ErrorInfo; if (!TII->verifyInstruction(*MI, ErrorInfo)) report(ErrorInfo.data(), MI); + + // Verify properties of various specific instruction types + switch(MI->getOpcode()) { + default: + break; + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: + // Generic loads and stores must have a single MachineMemOperand + // describing that access. + if (!MI->hasOneMemOperand()) + report("Generic instruction accessing memory must have one mem operand", + MI); + break; + case TargetOpcode::STATEPOINT: + if (!MI->getOperand(StatepointOpers::IDPos).isImm() || + !MI->getOperand(StatepointOpers::NBytesPos).isImm() || + !MI->getOperand(StatepointOpers::NCallArgsPos).isImm()) + report("meta operands to STATEPOINT not constant!", MI); + break; + + auto VerifyStackMapConstant = [&](unsigned Offset) { + if (!MI->getOperand(Offset).isImm() || + MI->getOperand(Offset).getImm() != StackMaps::ConstantOp || + !MI->getOperand(Offset + 1).isImm()) + report("stack map constant to STATEPOINT not well formed!", MI); + }; + const unsigned VarStart = StatepointOpers(MI).getVarIdx(); + VerifyStackMapConstant(VarStart + StatepointOpers::CCOffset); + VerifyStackMapConstant(VarStart + StatepointOpers::FlagsOffset); + VerifyStackMapConstant(VarStart + StatepointOpers::NumDeoptOperandsOffset); + + // TODO: verify we have properly encoded deopt arguments + + }; } void diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index a9813e534c5f..e9f8d43fe643 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -54,8 +54,6 @@ static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, const MBBVector &SaveBlocks, const MBBVector &RestoreBlocks); -static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS); - namespace { class PEI : public MachineFunctionPass { public: @@ -84,7 +82,7 @@ private: const MBBVector &SaveBlocks, const MBBVector &RestoreBlocks)> SpillCalleeSavedRegisters; - std::function<void(MachineFunction &MF, RegScavenger *RS)> + std::function<void(MachineFunction &MF, RegScavenger &RS)> ScavengeFrameVirtualRegs; bool UsesCalleeSaves = false; @@ -142,7 +140,6 @@ MachineFunctionPass *llvm::createPrologEpilogInserterPass() { return new PEI(); } -STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); STATISTIC(NumBytesStackSpace, "Number of bytes used for stack in all functions"); @@ -168,10 
+165,10 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, unsigned &, unsigned &, const MBBVector &, const MBBVector &) {}; - ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {}; + ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger &) {}; } else { SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; - ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs; + ScavengeFrameVirtualRegs = scavengeFrameVirtualRegs; UsesCalleeSaves = true; } } @@ -222,7 +219,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // post-pass, scavenge the virtual registers that frame index elimination // inserted. if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) { - ScavengeFrameVirtualRegs(Fn, RS); + ScavengeFrameVirtualRegs(Fn, *RS); // Clear any vregs created by virtual scavenging. Fn.getRegInfo().clearVirtRegs(); @@ -1153,92 +1150,3 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, RS->forward(MI); } } - -/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers -/// with physical registers. Use the register scavenger to find an -/// appropriate register to use. -/// -/// FIXME: Iterating over the instruction stream is unnecessary. We can simply -/// iterate over the vreg use list, which at this point only contains machine -/// operands for which eliminateFrameIndex need a new scratch reg. -static void -doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) { - // Run through the instructions and find any virtual registers. - MachineRegisterInfo &MRI = MF.getRegInfo(); - for (MachineBasicBlock &MBB : MF) { - RS->enterBasicBlock(MBB); - - int SPAdj = 0; - - // The instruction stream may change in the loop, so check MBB.end() - // directly. - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { - // We might end up here again with a NULL iterator if we scavenged a - // register for which we inserted spill code for definition by what was - // originally the first instruction in MBB. - if (I == MachineBasicBlock::iterator(nullptr)) - I = MBB.begin(); - - const MachineInstr &MI = *I; - MachineBasicBlock::iterator J = std::next(I); - MachineBasicBlock::iterator P = - I == MBB.begin() ? MachineBasicBlock::iterator(nullptr) - : std::prev(I); - - // RS should process this instruction before we might scavenge at this - // location. This is because we might be replacing a virtual register - // defined by this instruction, and if so, registers killed by this - // instruction are available, and defined registers are not. - RS->forward(I); - - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - // When we first encounter a new virtual register, it - // must be a definition. - assert(MO.isDef() && "frame index virtual missing def!"); - // Scavenge a new scratch register - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj); - - ++NumScavengedRegs; - - // Replace this reference to the virtual register with the - // scratch register. - assert(ScratchReg && "Missing scratch register!"); - MRI.replaceRegWith(Reg, ScratchReg); - - // Because this instruction was processed by the RS before this - // register was allocated, make sure that the RS now records the - // register as being used. 
- RS->setRegUsed(ScratchReg); - } - - // If the scavenger needed to use one of its spill slots, the - // spill code will have been inserted in between I and J. This is a - // problem because we need the spill code before I: Move I to just - // prior to J. - if (I != std::prev(J)) { - MBB.splice(J, &MBB, I); - - // Before we move I, we need to prepare the RS to visit I again. - // Specifically, RS will assert if it sees uses of registers that - // it believes are undefined. Because we have already processed - // register kills in I, when it visits I again, it will believe that - // those registers are undefined. To avoid this situation, unprocess - // the instruction I. - assert(RS->getCurrentPosition() == I && - "The register scavenger has an unexpected position"); - I = P; - RS->unprocess(P); - } else - ++I; - } - } - - MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); -} diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index a87fed3a687e..24be7ea98d82 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -58,8 +58,9 @@ namespace { /// whenever a register is unavailable. This is not practical in production but /// provides a useful baseline both for measuring other allocators and comparing /// the speed of the basic algorithm against other styles of allocators. -class RABasic : public MachineFunctionPass, public RegAllocBase -{ +class RABasic : public MachineFunctionPass, + public RegAllocBase, + private LiveRangeEdit::Delegate { // context MachineFunction *MF; @@ -72,6 +73,9 @@ class RABasic : public MachineFunctionPass, public RegAllocBase // selectOrSplit(). BitVector UsableRegs; + bool LRE_CanEraseVirtReg(unsigned) override; + void LRE_WillShrinkVirtReg(unsigned) override; + public: RABasic(); @@ -121,17 +125,46 @@ char RABasic::ID = 0; } // end anonymous namespace +char &llvm::RABasicID = RABasic::ID; + +INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) +INITIALIZE_PASS_DEPENDENCY(MachineScheduler) +INITIALIZE_PASS_DEPENDENCY(LiveStacks) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false, + false) + +bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) { + if (VRM->hasPhys(VirtReg)) { + LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + aboutToRemoveInterval(LI); + return true; + } + // Unassigned virtreg is probably in the priority queue. + // RegAllocBase will erase it after dequeueing. + return false; +} + +void RABasic::LRE_WillShrinkVirtReg(unsigned VirtReg) { + if (!VRM->hasPhys(VirtReg)) + return; + + // Register is assigned, put it back on the queue for reassignment. 
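// Illustrative aside, not part of the patch: these callbacks only fire
// because the LiveRangeEdit constructions below now pass "this" as the
// LiveRangeEdit::Delegate instead of nullptr; the edit machinery notifies
// the delegate before erasing or shrinking a virtual register, giving the
// allocator a chance to unassign it from the LiveRegMatrix first.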
+ LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + enqueue(&LI); +} + RABasic::RABasic(): MachineFunctionPass(ID) { - initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry()); - initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); - initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); - initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry()); - initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); - initializeLiveStacksPass(*PassRegistry::getPassRegistry()); - initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); - initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); - initializeVirtRegMapPass(*PassRegistry::getPassRegistry()); - initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry()); } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -200,7 +233,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, Matrix->unassign(Spill); // Spill the extracted interval. - LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats); + LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); } return true; @@ -259,7 +292,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); if (!VirtReg.isSpillable()) return ~0u; - LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats); + LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); // The live virtual register requesting allocation was spilled, so tell diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 3b5964eef55e..b2dfef91add5 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -49,9 +49,11 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PBQP/Graph.h" +#include "llvm/CodeGen/PBQP/Math.h" #include "llvm/CodeGen/PBQP/Solution.h" #include "llvm/CodeGen/PBQPRAConstraint.h" #include "llvm/CodeGen/RegAllocPBQP.h" @@ -139,13 +141,13 @@ public: } private: - typedef std::map<const LiveInterval*, unsigned> LI2NodeMap; - typedef std::vector<const LiveInterval*> Node2LIMap; - typedef std::vector<unsigned> AllowedSet; - typedef std::vector<AllowedSet> AllowedSetMap; - typedef std::pair<unsigned, unsigned> RegPair; - typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap; - typedef std::set<unsigned> RegSet; + using LI2NodeMap = std::map<const LiveInterval *, unsigned>; + using Node2LIMap = std::vector<const LiveInterval *>; + using AllowedSet = std::vector<unsigned>; + using AllowedSetMap = std::vector<AllowedSet>; + using RegPair = std::pair<unsigned, unsigned>; + using CoalesceMap = std::map<RegPair, PBQP::PBQPNum>; + using RegSet = std::set<unsigned>; char *customPassID; @@ -212,12 +214,12 @@ public: /// @brief Add interference edges between overlapping vregs. 
class Interference : public PBQPRAConstraint { private: - typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr; - typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey; - typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache; - typedef DenseSet<IKey> DisjointAllowedRegsCache; - typedef std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId> IEdgeKey; - typedef DenseSet<IEdgeKey> IEdgeCache; + using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *; + using IKey = std::pair<AllowedRegVecPtr, AllowedRegVecPtr>; + using IMatrixCache = DenseMap<IKey, PBQPRAGraph::MatrixPtr>; + using DisjointAllowedRegsCache = DenseSet<IKey>; + using IEdgeKey = std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId>; + using IEdgeCache = DenseSet<IEdgeKey>; bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId, @@ -252,8 +254,8 @@ private: // for the fast interference graph construction algorithm. The last is there // to save us from looking up node ids via the VRegToNode map in the graph // metadata. - typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId> - IntervalInfo; + using IntervalInfo = + std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>; static SlotIndex getStartPoint(const IntervalInfo &I) { return std::get<0>(I)->segments[std::get<1>(I)].start; @@ -320,9 +322,10 @@ public: // Cache known disjoint allowed registers pairs DisjointAllowedRegsCache D; - typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet; - typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>, - decltype(&lowestStartPoint)> IntervalQueue; + using IntervalSet = std::set<IntervalInfo, decltype(&lowestEndPoint)>; + using IntervalQueue = + std::priority_queue<IntervalInfo, std::vector<IntervalInfo>, + decltype(&lowestStartPoint)>; IntervalSet Active(lowestEndPoint); IntervalQueue Inactive(lowestStartPoint); @@ -658,7 +661,6 @@ void RegAllocPBQP::spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals, MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM, Spiller &VRegSpiller) { - VRegsToAlloc.erase(VReg); LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM, nullptr, &DeadRemats); diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 0635e5c0a63c..1aed58c36e17 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -15,18 +15,23 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/RegisterScavenging.h" + #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/PassSupport.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -39,6 +44,8 @@ using namespace llvm; #define DEBUG_TYPE "reg-scavenging" +STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); + void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask 
LaneMask) { LiveUnits.addRegMasked(Reg, LaneMask); } @@ -469,3 +476,120 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, return SReg; } + +void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) { + // FIXME: Iterating over the instruction stream is unnecessary. We can simply + // iterate over the vreg use list, which at this point only contains machine + // operands for which eliminateFrameIndex need a new scratch reg. + + // Run through the instructions and find any virtual registers. + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineBasicBlock &MBB : MF) { + RS.enterBasicBlock(MBB); + + int SPAdj = 0; + + // The instruction stream may change in the loop, so check MBB.end() + // directly. + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + // We might end up here again with a NULL iterator if we scavenged a + // register for which we inserted spill code for definition by what was + // originally the first instruction in MBB. + if (I == MachineBasicBlock::iterator(nullptr)) + I = MBB.begin(); + + const MachineInstr &MI = *I; + MachineBasicBlock::iterator J = std::next(I); + MachineBasicBlock::iterator P = + I == MBB.begin() ? MachineBasicBlock::iterator(nullptr) + : std::prev(I); + + // RS should process this instruction before we might scavenge at this + // location. This is because we might be replacing a virtual register + // defined by this instruction, and if so, registers killed by this + // instruction are available, and defined registers are not. + RS.forward(I); + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // When we first encounter a new virtual register, it + // must be a definition. + assert(MO.isDef() && "frame index virtual missing def!"); + // Scavenge a new scratch register + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + unsigned ScratchReg = RS.scavengeRegister(RC, J, SPAdj); + + ++NumScavengedRegs; + + // Replace this reference to the virtual register with the + // scratch register. + assert(ScratchReg && "Missing scratch register!"); + MRI.replaceRegWith(Reg, ScratchReg); + + // Because this instruction was processed by the RS before this + // register was allocated, make sure that the RS now records the + // register as being used. + RS.setRegUsed(ScratchReg); + } + + // If the scavenger needed to use one of its spill slots, the + // spill code will have been inserted in between I and J. This is a + // problem because we need the spill code before I: Move I to just + // prior to J. + if (I != std::prev(J)) { + MBB.splice(J, &MBB, I); + + // Before we move I, we need to prepare the RS to visit I again. + // Specifically, RS will assert if it sees uses of registers that + // it believes are undefined. Because we have already processed + // register kills in I, when it visits I again, it will believe that + // those registers are undefined. To avoid this situation, unprocess + // the instruction I. + assert(RS.getCurrentPosition() == I && + "The register scavenger has an unexpected position"); + I = P; + RS.unprocess(P); + } else + ++I; + } + } + + MRI.clearVirtRegs(); + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); +} + +namespace { +/// This class runs register scavenging independ of the PrologEpilogInserter. +/// This is used in for testing. 
+class ScavengerTest : public MachineFunctionPass { +public: + static char ID; + ScavengerTest() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetFrameLowering &TFL = *STI.getFrameLowering(); + + RegScavenger RS; + // Let's hope that calling those outside of PrologEpilogueInserter works + // well enough to initialize the scavenger with some emergency spillslots + // for the target. + BitVector SavedRegs; + TFL.determineCalleeSaves(MF, SavedRegs, &RS); + TFL.processFunctionBeforeFrameFinalized(MF, &RS); + + // Let's scavenge the current function + scavengeFrameVirtualRegs(MF, RS); + return true; + } +}; +char ScavengerTest::ID; + +} // end anonymous namespace + +INITIALIZE_PASS(ScavengerTest, "scavenger-test", + "Scavenge virtual registers inside basic blocks", false, false) diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 8035ea80364b..3fdbd2459361 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -12,32 +12,54 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleDFS.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <string> +#include <utility> +#include <vector> using namespace llvm; @@ -90,11 +112,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, bool RemoveKillFlags) : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), - RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), - TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr), + 
RemoveKillFlags(RemoveKillFlags), UnknownValue(UndefValue::get( - Type::getVoidTy(mf.getFunction()->getContext()))), - FirstDbgValue(nullptr) { + Type::getVoidTy(mf.getFunction()->getContext()))) { DbgValues.clear(); const TargetSubtargetInfo &ST = mf.getSubtarget(); @@ -126,7 +146,7 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) { return V; } assert(V->getType()->isIntegerTy() && "Unexpected operand type!"); - } while (1); + } while (true); } /// This is a wrapper around GetUnderlyingObjects and adds support for basic @@ -563,7 +583,7 @@ void ScheduleDAGInstrs::initSUnits() { // which is contained within a basic block. SUnits.reserve(NumRegionInstrs); - for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) { + for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) { if (MI.isDebugValue()) continue; @@ -606,13 +626,13 @@ void ScheduleDAGInstrs::initSUnits() { class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> { /// Current total number of SUs in map. - unsigned NumNodes; + unsigned NumNodes = 0; /// 1 for loads, 0 for stores. (see comment in SUList) unsigned TrueMemOrderLatency; public: - Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {} + Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {} /// To keep NumNodes up to date, insert() is used instead of /// this operator w/ push_back(). @@ -630,7 +650,7 @@ public: void inline clearList(ValueType V) { iterator Itr = find(V); if (Itr != end()) { - assert (NumNodes >= Itr->second.size()); + assert(NumNodes >= Itr->second.size()); NumNodes -= Itr->second.size(); Itr->second.clear(); @@ -646,7 +666,7 @@ public: unsigned inline size() const { return NumNodes; } /// Counts the number of SUs in this map after a reduction. - void reComputeSize(void) { + void reComputeSize() { NumNodes = 0; for (auto &I : *this) NumNodes += I.second.size(); @@ -676,7 +696,7 @@ void ScheduleDAGInstrs::addChainDependencies(SUnit *SU, } void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) { - assert (BarrierChain != nullptr); + assert(BarrierChain != nullptr); for (auto &I : map) { SUList &sus = I.second; @@ -687,7 +707,7 @@ void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) { } void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { - assert (BarrierChain != nullptr); + assert(BarrierChain != nullptr); // Go through all lists of SUs. for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) { @@ -1028,7 +1048,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, // The N last elements in NodeNums will be removed, and the SU with // the lowest NodeNum of them will become the new BarrierChain to // let the not yet seen SUs have a dependency to the removed SUs. - assert (N <= NodeNums.size()); + assert(N <= NodeNums.size()); SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)]; if (BarrierChain) { // The aliasing and non-aliasing maps reduce independently of each @@ -1156,6 +1176,7 @@ std::string ScheduleDAGInstrs::getDAGName() const { //===----------------------------------------------------------------------===// namespace llvm { + /// Internal state used to compute SchedDFSResult. class SchedDFSImpl { SchedDFSResult &R; @@ -1163,16 +1184,16 @@ class SchedDFSImpl { /// Join DAG nodes into equivalence classes by their subtree. IntEqClasses SubtreeClasses; /// List PredSU, SuccSU pairs that represent data edges between subtrees. 
- std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs; + std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs; struct RootData { unsigned NodeID; unsigned ParentNodeID; ///< Parent node (member of the parent subtree). - unsigned SubInstrCount; ///< Instr count in this tree only, not children. + unsigned SubInstrCount = 0; ///< Instr count in this tree only, not + /// children. RootData(unsigned id): NodeID(id), - ParentNodeID(SchedDFSResult::InvalidSubtreeID), - SubInstrCount(0) {} + ParentNodeID(SchedDFSResult::InvalidSubtreeID) {} unsigned getSparseSetIndex() const { return NodeID; } }; @@ -1340,12 +1361,15 @@ protected: } while (FromTree != SchedDFSResult::InvalidSubtreeID); } }; + } // end namespace llvm namespace { + /// Manage the stack used by a reverse depth-first search over the DAG. class SchedDAGReverseDFS { - std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack; + std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack; + public: bool isComplete() const { return DFSStack.empty(); } @@ -1367,7 +1391,8 @@ public: return getCurr()->Preds.end(); } }; -} // anonymous + +} // end anonymous namespace static bool hasDataSucc(const SUnit *SU) { for (const SDep &SuccDep : SU->Succs) { @@ -1392,7 +1417,7 @@ void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { SchedDAGReverseDFS DFS; Impl.visitPreorder(&SU); DFS.follow(&SU); - for (;;) { + while (true) { // Traverse the leftmost path as far as possible. while (DFS.getPred() != DFS.getPredEnd()) { const SDep &PredDep = *DFS.getPred(); @@ -1457,4 +1482,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { } } // end namespace llvm + #endif diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 5f167f8de1cf..9355dbe77f94 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -225,6 +225,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { } return TranslateLegalizeResults(Op, Lowered); } + LLVM_FALLTHROUGH; case TargetLowering::Expand: Changed = true; return LegalizeOp(ExpandLoad(Op)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 177898e1e950..80a03ea4eea0 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1,4 +1,4 @@ -//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===// +//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===// // // The LLVM Compiler Infrastructure // @@ -11,29 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include 
"llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -41,16 +58,20 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> -#include <cmath> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <limits> +#include <set> +#include <string> #include <utility> +#include <vector> using namespace llvm; @@ -269,7 +290,6 @@ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) { return ISD::CondCode(Operation); } - /// For an integer comparison, return 1 if the comparison is a signed operation /// and 2 if the result is an unsigned comparison. Return zero if the operation /// does not depend on the sign of the input (setne and seteq). @@ -338,7 +358,6 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, //===----------------------------------------------------------------------===// /// AddNodeIDOpcode - Add the node opcode to the NodeID data. -/// static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) { ID.AddInteger(OpC); } @@ -350,7 +369,6 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. -/// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDValue> Ops) { for (auto& Op : Ops) { @@ -360,7 +378,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID, } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. -/// static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDUse> Ops) { for (auto& Op : Ops) { @@ -392,10 +409,9 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { break; } case ISD::TargetConstantFP: - case ISD::ConstantFP: { + case ISD::ConstantFP: ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue()); break; - } case ISD::TargetGlobalAddress: case ISD::GlobalAddress: case ISD::TargetGlobalTLSAddress: @@ -770,7 +786,6 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { /// maps and modified in place. Add it back to the CSE maps, unless an identical /// node already exists, in which case transfer all its users to the existing /// node. 
This transfer can potentially trigger recursive merging. -/// void SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { // For node types that aren't CSE'd, just act as if no identical node @@ -835,7 +850,6 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, return Node; } - /// FindModifiedNodeSlot - Find a slot for the specified node if its operands /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a @@ -864,10 +878,9 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) - : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL), + : TM(tm), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), - Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), - UpdateListeners(nullptr) { + Root(getEntryNode()) { InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } @@ -1038,7 +1051,6 @@ SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, } /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). -/// SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) { EVT EltVT = VT.getScalarType(); SDValue NegOne = @@ -1317,7 +1329,6 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, return SDValue(N, 0); } - SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, unsigned Alignment, int Offset, bool isTarget, @@ -1451,7 +1462,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, // Validate that all indices in Mask are within the range of the elements // input to the shuffle. int NElts = Mask.size(); - assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) && + assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2); }) && "Index out of range"); // Copy the mask so we can do any needed cleanup. @@ -2918,7 +2929,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, else DemandedRHS.setBit((unsigned)M % NumElts); } - Tmp = UINT_MAX; + Tmp = std::numeric_limits<unsigned>::max(); if (!!DemandedLHS) Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); if (!!DemandedRHS) { @@ -3122,7 +3133,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, unsigned EltIdx = CEltNo->getZExtValue(); // If we demand the inserted element then get its sign bits. - Tmp = UINT_MAX; + Tmp = std::numeric_limits<unsigned>::max(); if (DemandedElts[EltIdx]) { // TODO - handle implicit truncation of inserted elements. if (InVal.getScalarValueSizeInBits() != VTBits) @@ -3188,7 +3199,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::CONCAT_VECTORS: // Determine the minimum number of sign bits across all demanded // elts of the input vectors. Early out if the result is already 1. 
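The UINT_MAX-to-numeric_limits rewrites around this point all serve one min-reduction idiom: start Tmp at the "no information yet" sentinel and take the minimum over whatever lanes are actually demanded, so an empty demanded set leaves the sentinel untouched. A minimal sketch of the idiom, with toy types rather than the SelectionDAG interfaces:

#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

unsigned minOverDemanded(const std::vector<unsigned> &LaneSignBits,
                         const std::vector<bool> &Demanded) {
  unsigned Tmp = std::numeric_limits<unsigned>::max(); // sentinel
  for (std::size_t I = 0; I < LaneSignBits.size(); ++I)
    if (Demanded[I])
      Tmp = std::min(Tmp, LaneSignBits[I]);
  return Tmp; // still the sentinel if nothing was demanded
}

std::numeric_limits<unsigned>::max() also avoids pulling in the <climits> macro, which is presumably why the diff prefers it over UINT_MAX.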
- Tmp = UINT_MAX; + Tmp = std::numeric_limits<unsigned>::max(); EVT SubVectorVT = Op.getOperand(0).getValueType(); unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); unsigned NumSubVectors = Op.getNumOperands(); @@ -3327,7 +3338,7 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops, - llvm::SelectionDAG &DAG) { + SelectionDAG &DAG) { assert(!Ops.empty() && "Can't concatenate an empty list of vectors!"); assert(llvm::all_of(Ops, [Ops](SDValue Op) { @@ -3836,8 +3847,9 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { return true; return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && - any_of(Divisor->op_values(), - [](SDValue V) { return V.isUndef() || isNullConstant(V); }); + llvm::any_of(Divisor->op_values(), + [](SDValue V) { return V.isUndef() || + isNullConstant(V); }); // TODO: Handle signed overflow. } // TODO: Handle oversized shifts. @@ -3948,8 +3960,8 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, // All operands must be vector types with the same number of elements as // the result type and must be either UNDEF or a build vector of constant // or UNDEF scalars. - if (!all_of(Ops, IsConstantBuildVectorOrUndef) || - !all_of(Ops, IsScalarOrSameVectorSize)) + if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) || + !llvm::all_of(Ops, IsScalarOrSameVectorSize)) return SDValue(); // If we are comparing vectors, then the result needs to be a i1 boolean @@ -5550,7 +5562,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, Opcode == ISD::PREFETCH || Opcode == ISD::LIFETIME_START || Opcode == ISD::LIFETIME_END || - (Opcode <= INT_MAX && + ((int)Opcode <= std::numeric_limits<int>::max() && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); @@ -5884,7 +5896,6 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::LoadExtType ExtTy, bool isExpanding) { - SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; FoldingSetNodeID ID; @@ -6038,13 +6049,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, switch (Opcode) { default: break; - case ISD::CONCAT_VECTORS: { + case ISD::CONCAT_VECTORS: // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) return V; break; - } - case ISD::SELECT_CC: { + case ISD::SELECT_CC: assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && "LHS and RHS of condition must have same type!"); @@ -6053,14 +6063,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Ops[2].getValueType() == VT && "select_cc node must be of same type as true and false value!"); break; - } - case ISD::BR_CC: { + case ISD::BR_CC: assert(NumOps == 5 && "BR_CC takes 5 operands!"); assert(Ops[2].getValueType() == Ops[3].getValueType() && "LHS/RHS of comparison should match types!"); break; } - } // Memoize nodes. SDNode *N; @@ -6599,7 +6607,6 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { return Res; } - /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. 
/// @@ -6812,7 +6819,7 @@ public: : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {} }; -} +} // end anonymous namespace /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead. /// This can cause recursive merging of nodes in the DAG. @@ -6858,7 +6865,6 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) { AddModifiedNodeToCSEMaps(User); } - // If we just RAUW'd the root, take note. if (FromN == getRoot()) setRoot(To); @@ -7028,6 +7034,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ } namespace { + /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith /// to record information about a use. struct UseMemo { @@ -7040,7 +7047,8 @@ namespace { bool operator<(const UseMemo &L, const UseMemo &R) { return (intptr_t)L.User < (intptr_t)R.User; } -} + +} // end anonymous namespace /// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving /// uses of other values produced by From.getNode() alone. The same value @@ -7106,7 +7114,6 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, /// based on their topological order. It returns the maximum id and a vector /// of the SDNodes* in assigned order by reference. unsigned SelectionDAG::AssignTopologicalOrder() { - unsigned DAGSize = 0; // SortedPos tracks the progress of the algorithm. Nodes before it are @@ -7333,6 +7340,7 @@ void SDNode::Profile(FoldingSetNodeID &ID) const { } namespace { + struct EVTArray { std::vector<EVT> VTs; @@ -7342,11 +7350,12 @@ namespace { VTs.push_back(MVT((MVT::SimpleValueType)i)); } }; -} -static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs; +} // end anonymous namespace + +static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs; static ManagedStatic<EVTArray> SimpleVTArray; -static ManagedStatic<sys::SmartMutex<true> > VTMutex; +static ManagedStatic<sys::SmartMutex<true>> VTMutex; /// getValueTypeList - Return a pointer to the specified value type. /// @@ -7380,7 +7389,6 @@ bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const { return NUses == 0; } - /// hasAnyUseOfValue - Return true if there are any use of the indicated /// value. This method ignores uses of other values defined by this operation. bool SDNode::hasAnyUseOfValue(unsigned Value) const { @@ -7393,9 +7401,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const { return false; } - /// isOnlyUserOf - Return true if this node is the only use of N. -/// bool SDNode::isOnlyUserOf(const SDNode *N) const { bool Seen = false; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { @@ -7425,7 +7431,6 @@ bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) { } /// isOperand - Return true if this node is an operand of N. -/// bool SDValue::isOperandOf(const SDNode *N) const { for (const SDValue &Op : N->op_values()) if (*this == Op) @@ -7475,7 +7480,7 @@ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, } // Next, try a deep search: check whether every operand of the TokenFactor // reaches Dest. - return all_of((*this)->ops(), [=](SDValue Op) { + return llvm::all_of((*this)->ops(), [=](SDValue Op) { return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); }); } @@ -7627,7 +7632,6 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, return false; } - /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if /// it cannot be inferred. 
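UseMemo above records uses before any rewriting happens; mutating a use list while walking it is the classic hazard this avoids. A minimal sketch of that snapshot-then-apply shape, with toy nodes instead of the SelectionDAG classes (the real code also defines an ordering over the memos by user, omitted here):

#include <cstddef>
#include <utility>
#include <vector>

struct Node { std::vector<Node *> Ops; };

void replaceAllUsesOf(Node *From, Node *To, std::vector<Node *> &Users) {
  // Pass 1: snapshot every (user, operand index) that references From.
  std::vector<std::pair<Node *, std::size_t>> Memo;
  for (Node *U : Users)
    for (std::size_t I = 0; I < U->Ops.size(); ++I)
      if (U->Ops[I] == From)
        Memo.emplace_back(U, I);
  // Pass 2: apply the replacement from the stable snapshot.
  for (auto &M : Memo)
    M.first->Ops[M.second] = To;
}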
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { @@ -7718,7 +7722,6 @@ unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); } - Type *ConstantPoolSDNode::getType() const { if (isMachineConstantPoolEntry()) return Val.MachineCPVal->getType(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 687b882c5e4d..b5ccd64ee76c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2022,7 +2022,7 @@ static SDNode *findGlueUse(SDNode *N) { } /// findNonImmUse - Return true if "Use" is a non-immediate use of "Def". -/// This function recursively traverses up the operand chain, ignoring +/// This function iteratively traverses up the operand chain, ignoring /// certain nodes. static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited, @@ -2035,30 +2035,36 @@ static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, // The Use may be -1 (unassigned) if it is a newly allocated node. This can // happen because we scan down to newly selected nodes in the case of glue // uses. - if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)) - return false; + std::vector<SDNode *> WorkList; + WorkList.push_back(Use); - // Don't revisit nodes if we already scanned it and didn't fail, we know we - // won't fail if we scan it again. - if (!Visited.insert(Use).second) - return false; + while (!WorkList.empty()) { + Use = WorkList.back(); + WorkList.pop_back(); + if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) + continue; - for (const SDValue &Op : Use->op_values()) { - // Ignore chain uses, they are validated by HandleMergeInputChains. - if (Op.getValueType() == MVT::Other && IgnoreChains) + // Don't revisit nodes if we already scanned it and didn't fail, we know we + // won't fail if we scan it again. + if (!Visited.insert(Use).second) continue; - SDNode *N = Op.getNode(); - if (N == Def) { - if (Use == ImmedUse || Use == Root) - continue; // We are not looking for immediate use. - assert(N != Root); - return true; - } + for (const SDValue &Op : Use->op_values()) { + // Ignore chain uses, they are validated by HandleMergeInputChains. + if (Op.getValueType() == MVT::Other && IgnoreChains) + continue; - // Traverse up the operand chain. - if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains)) - return true; + SDNode *N = Op.getNode(); + if (N == Def) { + if (Use == ImmedUse || Use == Root) + continue; // We are not looking for immediate use. + assert(N != Root); + return true; + } + + // Traverse up the operand chain. 
+ WorkList.push_back(N); + } } return false; } diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 1c66649cae01..eed667dbe7e0 100644 --- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -818,7 +818,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP, SI.GCTransitionArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end()); SI.ID = ISP.getID(); - SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end()); + SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end()); SI.StatepointFlags = ISP.getFlags(); SI.NumPatchBytes = ISP.getNumPatchBytes(); SI.EHPadBB = EHPadBB; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 0dffffee9976..adb2b188265b 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1493,8 +1493,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } - // Ensure that the constant occurs on the RHS, and fold constant - // comparisons. + // Ensure that the constant occurs on the RHS and fold constant comparisons. ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond); if (isa<ConstantSDNode>(N0.getNode()) && (DCI.isBeforeLegalizeOps() || @@ -1638,14 +1637,13 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0), TopSetCC.getOperand(1), InvCond); - } } } - // If the LHS is '(and load, const)', the RHS is 0, - // the test is for equality or unsigned, and all 1 bits of the const are - // in the same partial word, see if we can shorten the load. + // If the LHS is '(and load, const)', the RHS is 0, the test is for + // equality or unsigned, and all 1 bits of the const are in the same + // partial word, see if we can shorten the load. if (DCI.isBeforeLegalize() && !ISD::isSignedIntSetCC(Cond) && N0.getOpcode() == ISD::AND && C1 == 0 && @@ -1669,10 +1667,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, APInt newMask = APInt::getLowBitsSet(maskWidth, width); for (unsigned offset=0; offset<origWidth/width; offset++) { if ((newMask & Mask) == Mask) { - if (!DAG.getDataLayout().isLittleEndian()) - bestOffset = (origWidth/width - offset - 1) * (width/8); - else + if (DAG.getDataLayout().isLittleEndian()) bestOffset = (uint64_t)offset * (width/8); + else + bestOffset = (origWidth/width - offset - 1) * (width/8); bestMask = Mask.lshr(offset * (width/8) * 8); bestWidth = width; break; @@ -1713,10 +1711,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, switch (Cond) { case ISD::SETUGT: case ISD::SETUGE: - case ISD::SETEQ: return DAG.getConstant(0, dl, VT); + case ISD::SETEQ: + return DAG.getConstant(0, dl, VT); case ISD::SETULT: case ISD::SETULE: - case ISD::SETNE: return DAG.getConstant(1, dl, VT); + case ISD::SETNE: + return DAG.getConstant(1, dl, VT); case ISD::SETGT: case ISD::SETGE: // True if the sign bit of C1 is set. @@ -1816,9 +1816,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, BitWidth-1))) { // Okay, get the un-inverted input value. 
SDValue Val; - if (N0.getOpcode() == ISD::XOR) + if (N0.getOpcode() == ISD::XOR) { Val = N0.getOperand(0); - else { + } else { assert(N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::XOR); // ((X^1)&1)^1 -> X & 1 @@ -1883,7 +1883,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Canonicalize GE/LE comparisons to use GT/LT comparisons. if (Cond == ISD::SETGE || Cond == ISD::SETUGE) { - if (C1 == MinVal) return DAG.getConstant(1, dl, VT); // X >= MIN --> true + // X >= MIN --> true + if (C1 == MinVal) + return DAG.getConstant(1, dl, VT); + // X >= C0 --> X > (C0 - 1) APInt C = C1 - 1; ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT; @@ -1898,7 +1901,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (Cond == ISD::SETLE || Cond == ISD::SETULE) { - if (C1 == MaxVal) return DAG.getConstant(1, dl, VT); // X <= MAX --> true + // X <= MAX --> true + if (C1 == MaxVal) + return DAG.getConstant(1, dl, VT); + // X <= C0 --> X < (C0 + 1) APInt C = C1 + 1; ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 900c0318b179..c43a5e18ad23 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -1456,6 +1456,7 @@ void TargetLoweringBase::computeRegisterProperties( } if (IsLegalWiderType) break; + LLVM_FALLTHROUGH; } case TypeWidenVector: { // Try to widen the vector. @@ -1473,6 +1474,7 @@ void TargetLoweringBase::computeRegisterProperties( } if (IsLegalWiderType) break; + LLVM_FALLTHROUGH; } case TypeSplitVector: case TypeScalarizeVector: { diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 282e3103adc9..711144fc2faa 100644 --- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -27,6 +27,14 @@ Error CodeViewRecordIO::beginRecord(Optional<uint32_t> MaxLength) { Error CodeViewRecordIO::endRecord() { assert(!Limits.empty() && "Not in a record!"); Limits.pop_back(); + // We would like to assert that we actually read / wrote all the bytes that we + // expected to for this record, but unfortunately we can't do this. Some + // producers such as MASM over-allocate for certain types of records and + // commit the extraneous data, so when reading we can't be sure every byte + // will have been read. And when writing we over-allocate temporarily since + // we don't know how big the record is until we're finished writing it, so + // even though we don't commit the extraneous data, we still can't guarantee + // we're at the end of the allocated data. 
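The padToAlignment hook added just below is plain offset arithmetic: advance to the next multiple of the alignment, reading or writing pad bytes as needed. A writer-side sketch of the arithmetic, assuming a power-of-two alignment and zero fill (a hypothetical helper, not the BinaryStreamWriter API):

#include <cstdint>
#include <vector>

void padBufferToAlignment(std::vector<uint8_t> &Buf, uint32_t Align) {
  // The mask trick requires Align to be a power of two.
  uint32_t Off = static_cast<uint32_t>(Buf.size());
  uint32_t Aligned = (Off + Align - 1) & ~(Align - 1);
  Buf.resize(Aligned, 0); // zero-fill up to the boundary
}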
return Error::success(); } @@ -49,6 +57,12 @@ uint32_t CodeViewRecordIO::maxFieldLength() const { return *Min; } +Error CodeViewRecordIO::padToAlignment(uint32_t Align) { + if (isReading()) + return Reader->padToAlignment(Align); + return Writer->padToAlignment(Align); +} + Error CodeViewRecordIO::skipPadding() { assert(!isWriting() && "Cannot skip padding while writing!"); diff --git a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp index b8741eb0b675..2e72242181b0 100644 --- a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp @@ -72,7 +72,7 @@ Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const { uint32_t DebugStringTableSubsection::size() const { return Strings.size(); } uint32_t DebugStringTableSubsection::getStringId(StringRef S) const { - auto P = Strings.find(S); - assert(P != Strings.end()); - return P->second; + auto Iter = Strings.find(S); + assert(Iter != Strings.end()); + return Iter->second; } diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp index 511f36d0020a..cfd1c5d3ab0c 100644 --- a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp +++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp @@ -16,14 +16,17 @@ using namespace llvm; using namespace llvm::codeview; DebugSubsectionRecord::DebugSubsectionRecord() - : Kind(DebugSubsectionKind::None) {} + : Container(CodeViewContainer::ObjectFile), + Kind(DebugSubsectionKind::None) {} DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind, - BinaryStreamRef Data) - : Kind(Kind), Data(Data) {} + BinaryStreamRef Data, + CodeViewContainer Container) + : Container(Container), Kind(Kind), Data(Data) {} Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream, - DebugSubsectionRecord &Info) { + DebugSubsectionRecord &Info, + CodeViewContainer Container) { const DebugSubsectionHeader *Header; BinaryStreamReader Reader(Stream); if (auto EC = Reader.readObject(Header)) @@ -41,13 +44,14 @@ Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream, } if (auto EC = Reader.readStreamRef(Info.Data, Header->Length)) return EC; + Info.Container = Container; Info.Kind = Kind; return Error::success(); } uint32_t DebugSubsectionRecord::getRecordLength() const { uint32_t Result = sizeof(DebugSubsectionHeader) + Data.getLength(); - assert(Result % 4 == 0); + assert(Result % alignOf(Container) == 0); return Result; } @@ -56,25 +60,29 @@ DebugSubsectionKind DebugSubsectionRecord::kind() const { return Kind; } BinaryStreamRef DebugSubsectionRecord::getRecordData() const { return Data; } DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder( - DebugSubsectionKind Kind, DebugSubsection &Frag) - : Kind(Kind), Frag(Frag) {} + std::unique_ptr<DebugSubsection> Subsection, CodeViewContainer Container) + : Subsection(std::move(Subsection)), Container(Container) {} uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() { - uint32_t Size = sizeof(DebugSubsectionHeader) + - alignTo(Frag.calculateSerializedSize(), 4); + uint32_t Size = + sizeof(DebugSubsectionHeader) + + alignTo(Subsection->calculateSerializedSize(), alignOf(Container)); return Size; } Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) { + assert(Writer.getOffset() % alignOf(Container) == 0 && + "Debug Subsection not properly aligned"); + DebugSubsectionHeader Header; - Header.Kind = uint32_t(Kind); + Header.Kind = 
uint32_t(Subsection->kind()); Header.Length = calculateSerializedLength() - sizeof(DebugSubsectionHeader); if (auto EC = Writer.writeObject(Header)) return EC; - if (auto EC = Frag.commit(Writer)) + if (auto EC = Subsection->commit(Writer)) return EC; - if (auto EC = Writer.padToAlignment(4)) + if (auto EC = Writer.padToAlignment(alignOf(Container))) return EC; return Error::success(); diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 3d49a7198d1a..66045933ce9b 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -668,7 +668,7 @@ Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) { Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { SymbolVisitorCallbackPipeline Pipeline; - SymbolDeserializer Deserializer(ObjDelegate.get()); + SymbolDeserializer Deserializer(ObjDelegate.get(), Container); CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); @@ -679,7 +679,7 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) { SymbolVisitorCallbackPipeline Pipeline; - SymbolDeserializer Deserializer(ObjDelegate.get()); + SymbolDeserializer Deserializer(ObjDelegate.get(), Container); CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp index bb1731465495..ea46841a70f6 100644 --- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp +++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp @@ -40,6 +40,7 @@ Error SymbolRecordMapping::visitSymbolBegin(CVSymbol &Record) { } Error SymbolRecordMapping::visitSymbolEnd(CVSymbol &Record) { + error(IO.padToAlignment(alignOf(Container))); error(IO.endRecord()); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp index 251cc431f52b..9f2d619d1a1c 100644 --- a/lib/DebugInfo/CodeView/SymbolSerializer.cpp +++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp @@ -12,9 +12,11 @@ using namespace llvm; using namespace llvm::codeview; -SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator) - : Storage(Allocator), RecordBuffer(MaxRecordLength), Stream(RecordBuffer, llvm::support::little), - Writer(Stream), Mapping(Writer) { } +SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator, + CodeViewContainer Container) + : Storage(Allocator), RecordBuffer(MaxRecordLength), + Stream(RecordBuffer, llvm::support::little), Writer(Stream), + Mapping(Writer, Container) {} Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) { assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!"); diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp index dfdeb8414212..faf2442bc94b 100644 --- a/lib/DebugInfo/MSF/MappedBlockStream.cpp +++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp @@ -47,42 +47,46 @@ static Interval intersect(const Interval &I1, const Interval &I2) { MappedBlockStream::MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &Layout, - BinaryStreamRef MsfData) - : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData) {} - -std::unique_ptr<MappedBlockStream> -MappedBlockStream::createStream(uint32_t BlockSize, - const MSFStreamLayout &Layout, - BinaryStreamRef MsfData) { + BinaryStreamRef MsfData, + BumpPtrAllocator 
&Allocator) + : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData), + Allocator(Allocator) {} + +std::unique_ptr<MappedBlockStream> MappedBlockStream::createStream( + uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>( - BlockSize, Layout, MsfData); + BlockSize, Layout, MsfData, Allocator); } std::unique_ptr<MappedBlockStream> MappedBlockStream::createIndexedStream( - const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex) { + const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex, + BumpPtrAllocator &Allocator) { assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index"); MSFStreamLayout SL; SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>( - Layout.SB->BlockSize, SL, MsfData); + Layout.SB->BlockSize, SL, MsfData, Allocator); } std::unique_ptr<MappedBlockStream> MappedBlockStream::createDirectoryStream(const MSFLayout &Layout, - BinaryStreamRef MsfData) { + BinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator); } std::unique_ptr<MappedBlockStream> MappedBlockStream::createFpmStream(const MSFLayout &Layout, - BinaryStreamRef MsfData) { + BinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator); } Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, @@ -148,7 +152,7 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, // into it, and return an ArrayRef to that. Do not touch existing pool // allocations, as existing clients may be holding a pointer which must // not be invalidated. 
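The comment above states the invariant that motivates threading a shared BumpPtrAllocator through every MappedBlockStream here: a bump allocator only hands out fresh memory and never moves or frees earlier allocations, so cached pointers held by clients stay valid for the allocator's whole lifetime. A single-slab toy sketch of that guarantee (llvm::BumpPtrAllocator additionally chains new slabs when one fills up):

#include <cstddef>
#include <memory>

class Arena {
  static constexpr std::size_t SlabSize = 1 << 20;
  std::unique_ptr<char[]> Slab = std::make_unique<char[]>(SlabSize);
  std::size_t Offset = 0;

public:
  void *allocate(std::size_t Size, std::size_t Align) {
    std::size_t Aligned = (Offset + Align - 1) & ~(Align - 1);
    if (Aligned + Size > SlabSize)
      return nullptr; // a real allocator would start a new slab here
    Offset = Aligned + Size;
    return Slab.get() + Aligned; // earlier allocations are never disturbed
  }
};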
- uint8_t *WriteBuffer = static_cast<uint8_t *>(Pool.Allocate(Size, 8)); + uint8_t *WriteBuffer = static_cast<uint8_t *>(Allocator.Allocate(Size, 8)); if (auto EC = readBytes(Offset, MutableArrayRef<uint8_t>(WriteBuffer, Size))) return EC; @@ -269,10 +273,6 @@ Error MappedBlockStream::readBytes(uint32_t Offset, return Error::success(); } -uint32_t MappedBlockStream::getNumBytesCopied() const { - return static_cast<uint32_t>(Pool.getBytesAllocated()); -} - void MappedBlockStream::invalidateCache() { CacheMap.shrink_and_clear(); } void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset, @@ -313,43 +313,48 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset, WritableMappedBlockStream::WritableMappedBlockStream( uint32_t BlockSize, const MSFStreamLayout &Layout, - WritableBinaryStreamRef MsfData) - : ReadInterface(BlockSize, Layout, MsfData), WriteInterface(MsfData) {} + WritableBinaryStreamRef MsfData, BumpPtrAllocator &Allocator) + : ReadInterface(BlockSize, Layout, MsfData, Allocator), + WriteInterface(MsfData) {} std::unique_ptr<WritableMappedBlockStream> WritableMappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, - WritableBinaryStreamRef MsfData) { + WritableBinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { return llvm::make_unique<MappedBlockStreamImpl<WritableMappedBlockStream>>( - BlockSize, Layout, MsfData); + BlockSize, Layout, MsfData, Allocator); } std::unique_ptr<WritableMappedBlockStream> WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData, - uint32_t StreamIndex) { + uint32_t StreamIndex, + BumpPtrAllocator &Allocator) { assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index"); MSFStreamLayout SL; SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; - return createStream(Layout.SB->BlockSize, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator); } std::unique_ptr<WritableMappedBlockStream> WritableMappedBlockStream::createDirectoryStream( - const MSFLayout &Layout, WritableBinaryStreamRef MsfData) { + const MSFLayout &Layout, WritableBinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator); } std::unique_ptr<WritableMappedBlockStream> WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout, - WritableBinaryStreamRef MsfData) { + WritableBinaryStreamRef MsfData, + BumpPtrAllocator &Allocator) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator); } Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp index b28ec2ff33ac..22c2ef31bd71 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -66,7 +66,11 @@ void DbiModuleDescriptorBuilder::setObjFileName(StringRef Name) { void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) { Symbols.push_back(Symbol); - SymbolByteSize += Symbol.data().size(); + // Symbols written to a PDB file are required to be 4 byte aligned. The same + // is not true of object files. 
+ assert(Symbol.length() % alignOf(CodeViewContainer::Pdb) == 0 && + "Invalid Symbol alignment!"); + SymbolByteSize += Symbol.length(); } void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) { @@ -140,7 +144,7 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter, if (Layout.ModDiStream != kInvalidStreamIndex) { auto NS = WritableMappedBlockStream::createIndexedStream( - MsfLayout, MsfBuffer, Layout.ModDiStream); + MsfLayout, MsfBuffer, Layout.ModDiStream, MSF.getAllocator()); WritableBinaryStreamRef Ref(*NS); BinaryStreamWriter SymbolWriter(Ref); // Write the symbols. @@ -153,7 +157,8 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter, if (auto EC = SymbolWriter.writeStreamRef(RecordsRef)) return EC; // TODO: Write C11 Line data - + assert(SymbolWriter.getOffset() % alignOf(CodeViewContainer::Pdb) == 0 && + "Invalid debug section alignment!"); for (const auto &Builder : C13Builders) { assert(Builder && "Empty C13 Fragment Builder!"); if (auto EC = Builder->commit(SymbolWriter)) @@ -169,42 +174,9 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter, return Error::success(); } -void DbiModuleDescriptorBuilder::addC13Fragment( - std::unique_ptr<DebugLinesSubsection> Lines) { - DebugLinesSubsection &Frag = *Lines; - - // File Checksums have to come first, so push an empty entry on if this - // is the first. - if (C13Builders.empty()) - C13Builders.push_back(nullptr); - - this->LineInfo.push_back(std::move(Lines)); - C13Builders.push_back( - llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag)); -} - -void DbiModuleDescriptorBuilder::addC13Fragment( - std::unique_ptr<codeview::DebugInlineeLinesSubsection> Inlinees) { - DebugInlineeLinesSubsection &Frag = *Inlinees; - - // File Checksums have to come first, so push an empty entry on if this - // is the first. 
- if (C13Builders.empty()) - C13Builders.push_back(nullptr); - - this->Inlinees.push_back(std::move(Inlinees)); - C13Builders.push_back( - llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag)); -} - -void DbiModuleDescriptorBuilder::setC13FileChecksums( - std::unique_ptr<DebugChecksumsSubsection> Checksums) { - assert(!ChecksumInfo && "Can't have more than one checksum info!"); - - if (C13Builders.empty()) - C13Builders.push_back(nullptr); - - ChecksumInfo = std::move(Checksums); - C13Builders[0] = llvm::make_unique<DebugSubsectionRecordBuilder>( - ChecksumInfo->kind(), *ChecksumInfo); +void DbiModuleDescriptorBuilder::addDebugSubsection( + std::unique_ptr<DebugSubsection> Subsection) { + assert(Subsection); + C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>( + std::move(Subsection), CodeViewContainer::Pdb)); } diff --git a/lib/DebugInfo/PDB/Native/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp index 2f4fb6cc295d..320b11dc5cab 100644 --- a/lib/DebugInfo/PDB/Native/DbiStream.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp @@ -252,7 +252,7 @@ Error DbiStream::initializeSectionHeadersData() { return make_error<RawError>(raw_error_code::no_stream); auto SHS = MappedBlockStream::createIndexedStream( - Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum); + Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator()); size_t StreamLen = SHS->getLength(); if (StreamLen % sizeof(object::coff_section)) @@ -284,7 +284,7 @@ Error DbiStream::initializeFpoRecords() { return make_error<RawError>(raw_error_code::no_stream); auto FS = MappedBlockStream::createIndexedStream( - Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum); + Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator()); size_t StreamLen = FS->getLength(); if (StreamLen % sizeof(object::FpoData)) diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index 23c7456d7772..55c20fdb9af6 100644 --- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -357,8 +357,8 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout, if (auto EC = finalize()) return EC; - auto DbiS = WritableMappedBlockStream::createIndexedStream(Layout, MsfBuffer, - StreamDBI); + auto DbiS = WritableMappedBlockStream::createIndexedStream( + Layout, MsfBuffer, StreamDBI, Allocator); BinaryStreamWriter Writer(*DbiS); if (auto EC = Writer.writeObject(*Header)) @@ -396,7 +396,7 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout, if (Stream.StreamNumber == kInvalidStreamIndex) continue; auto WritableStream = WritableMappedBlockStream::createIndexedStream( - Layout, MsfBuffer, Stream.StreamNumber); + Layout, MsfBuffer, Stream.StreamNumber, Allocator); BinaryStreamWriter DbgStreamWriter(*WritableStream); if (auto EC = DbgStreamWriter.writeArray(Stream.Data)) return EC; diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp index f019d410328a..707128f7efd4 100644 --- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp @@ -50,8 +50,8 @@ Error InfoStreamBuilder::finalizeMsfLayout() { Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef Buffer) const { - auto InfoS = - WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamPDB); + auto InfoS = WritableMappedBlockStream::createIndexedStream( + Layout, Buffer, StreamPDB, Msf.getAllocator()); 
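The DbiModuleDescriptorBuilder hunk above collapses three type-specific entry points (lines, inlinee lines, checksums) into a single addDebugSubsection that takes ownership through the common DebugSubsection base. A minimal sketch of that shape with toy types, not the LLVM hierarchy:

#include <memory>
#include <utility>
#include <vector>

struct Subsection {
  virtual ~Subsection() = default;
  virtual unsigned kind() const = 0;
};

class ModuleBuilder {
  std::vector<std::unique_ptr<Subsection>> Builders;

public:
  // One entry point for every subsection kind; ownership moves in.
  void addDebugSubsection(std::unique_ptr<Subsection> S) {
    Builders.push_back(std::move(S));
  }
};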
BinaryStreamWriter Writer(*InfoS); InfoStreamHeader H; diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp index d7a203746a0d..c4ff30011a17 100644 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp @@ -55,9 +55,9 @@ Error ModuleDebugStreamRef::reload() { if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size)) return EC; - BinaryStreamReader LineReader(C13LinesSubstream); - if (auto EC = - LineReader.readArray(LinesAndChecksums, LineReader.bytesRemaining())) + BinaryStreamReader SubsectionsReader(C13LinesSubstream); + if (auto EC = SubsectionsReader.readArray(Subsections, + SubsectionsReader.bytesRemaining())) return EC; uint32_t GlobalRefsSize; @@ -77,13 +77,27 @@ ModuleDebugStreamRef::symbols(bool *HadError) const { return make_range(SymbolsSubstream.begin(HadError), SymbolsSubstream.end()); } -llvm::iterator_range<ModuleDebugStreamRef::LinesAndChecksumsIterator> -ModuleDebugStreamRef::linesAndChecksums() const { - return make_range(LinesAndChecksums.begin(), LinesAndChecksums.end()); +llvm::iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> +ModuleDebugStreamRef::subsections() const { + return make_range(Subsections.begin(), Subsections.end()); } -bool ModuleDebugStreamRef::hasLineInfo() const { +bool ModuleDebugStreamRef::hasDebugSubsections() const { return C13LinesSubstream.getLength() > 0; } Error ModuleDebugStreamRef::commit() { return Error::success(); } + +Expected<codeview::DebugChecksumsSubsectionRef> +ModuleDebugStreamRef::findChecksumsSubsection() const { + for (const auto &SS : subsections()) { + if (SS.kind() != DebugSubsectionKind::FileChecksums) + continue; + + codeview::DebugChecksumsSubsectionRef Result; + if (auto EC = Result.initialize(SS.getRecordData())) + return std::move(EC); + return Result; + } + return make_error<RawError>(raw_error_code::no_entry); +} diff --git a/lib/DebugInfo/PDB/Native/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp index 859295d2c7d3..1254e23c73eb 100644 --- a/lib/DebugInfo/PDB/Native/PDBFile.cpp +++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp @@ -146,7 +146,8 @@ Error PDBFile::parseFileHeaders() { // at getBlockSize() intervals, so we have to be compatible. // See the function fpmPn() for more information: // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L489 - auto FpmStream = MappedBlockStream::createFpmStream(ContainerLayout, *Buffer); + auto FpmStream = + MappedBlockStream::createFpmStream(ContainerLayout, *Buffer, Allocator); BinaryStreamReader FpmReader(*FpmStream); ArrayRef<uint8_t> FpmBytes; if (auto EC = FpmReader.readBytes(FpmBytes, @@ -184,7 +185,8 @@ Error PDBFile::parseStreamData() { // is exactly what we are attempting to parse. By specifying a custom // subclass of IPDBStreamData which only accesses the fields that have already // been parsed, we can avoid this and reuse MappedBlockStream. 
- auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer); + auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer, + Allocator); BinaryStreamReader Reader(*DS); if (auto EC = Reader.readInteger(NumStreams)) return EC; @@ -407,5 +409,6 @@ PDBFile::safelyCreateIndexedStream(const MSFLayout &Layout, uint32_t StreamIndex) const { if (StreamIndex >= getNumStreams()) return make_error<RawError>(raw_error_code::no_stream); - return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex); + return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex, + Allocator); } diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp index c6568029ec55..2c6465e6fb2a 100644 --- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp @@ -140,8 +140,8 @@ Error PDBFileBuilder::commit(StringRef Filename) { if (auto EC = Writer.writeArray(Layout.DirectoryBlocks)) return EC; - auto DirStream = - WritableMappedBlockStream::createDirectoryStream(Layout, Buffer); + auto DirStream = WritableMappedBlockStream::createDirectoryStream( + Layout, Buffer, Allocator); BinaryStreamWriter DW(*DirStream); if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size())) return EC; @@ -158,8 +158,8 @@ Error PDBFileBuilder::commit(StringRef Filename) { if (!ExpectedSN) return ExpectedSN.takeError(); - auto NS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer, - *ExpectedSN); + auto NS = WritableMappedBlockStream::createIndexedStream( + Layout, Buffer, *ExpectedSN, Allocator); BinaryStreamWriter NSWriter(*NS); if (auto EC = Strings.commit(NSWriter)) return EC; diff --git a/lib/DebugInfo/PDB/Native/PDBStringTable.cpp b/lib/DebugInfo/PDB/Native/PDBStringTable.cpp index e84573fe07b8..6013c342cf02 100644 --- a/lib/DebugInfo/PDB/Native/PDBStringTable.cpp +++ b/lib/DebugInfo/PDB/Native/PDBStringTable.cpp @@ -56,6 +56,10 @@ Error PDBStringTable::readStrings(BinaryStreamReader &Reader) { return Error::success(); } +codeview::DebugStringTableSubsectionRef PDBStringTable::getStringTable() const { + return Strings; +} + Error PDBStringTable::readHashTable(BinaryStreamReader &Reader) { const support::ulittle32_t *HashCount; if (auto EC = Reader.readObject(HashCount)) diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index 623afb371b50..67c803d3124e 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -32,8 +32,7 @@ using namespace llvm::support; using namespace llvm::msf; using namespace llvm::pdb; -TpiStream::TpiStream(const PDBFile &File, - std::unique_ptr<MappedBlockStream> Stream) +TpiStream::TpiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream) : Pdb(File), Stream(std::move(Stream)) {} TpiStream::~TpiStream() = default; @@ -77,7 +76,8 @@ Error TpiStream::reload() { "Invalid TPI hash stream index."); auto HS = MappedBlockStream::createIndexedStream( - Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex); + Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex, + Pdb.getAllocator()); BinaryStreamReader HSR(*HS); // There should be a hash value for every type record, or no hashes at all. 
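findChecksumsSubsection, added a little earlier in ModuleDebugStream.cpp, is a straightforward scan: walk the tagged subsections, initialize a typed view over the first record of the wanted kind, and report no_entry otherwise. A toy sketch of the lookup, with std::optional standing in for the Expected<> plumbing:

#include <optional>
#include <vector>

struct Record {
  int Kind;
  // payload elided
};

std::optional<Record> findFirstOfKind(const std::vector<Record> &Subsections,
                                      int WantedKind) {
  for (const Record &R : Subsections)
    if (R.Kind == WantedKind)
      return R; // the caller then parses this record's payload
  return std::nullopt; // the real code returns raw_error_code::no_entry
}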
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp index 20456cc97823..9e943c7f114d 100644 --- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -147,8 +147,8 @@ Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout, if (auto EC = finalize()) return EC; - auto InfoS = - WritableMappedBlockStream::createIndexedStream(Layout, Buffer, Idx); + auto InfoS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer, + Idx, Allocator); BinaryStreamWriter Writer(*InfoS); if (auto EC = Writer.writeObject(*Header)) @@ -159,8 +159,8 @@ Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout, return EC; if (HashStreamIndex != kInvalidStreamIndex) { - auto HVS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer, - HashStreamIndex); + auto HVS = WritableMappedBlockStream::createIndexedStream( + Layout, Buffer, HashStreamIndex, Allocator); BinaryStreamWriter HW(*HVS); if (HashValueStream) { if (auto EC = HW.writeStreamRef(*HashValueStream)) diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 7e6f9a7804b9..7754ac03b43d 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -39,6 +39,21 @@ void DIBuilder::trackIfUnresolved(MDNode *N) { UnresolvedNodes.emplace_back(N); } +void DIBuilder::finalizeSubprogram(DISubprogram *SP) { + MDTuple *Temp = SP->getVariables().get(); + if (!Temp || !Temp->isTemporary()) + return; + + SmallVector<Metadata *, 4> Variables; + + auto PV = PreservedVariables.find(SP); + if (PV != PreservedVariables.end()) + Variables.append(PV->second.begin(), PV->second.end()); + + DINodeArray AV = getOrCreateArray(Variables); + TempMDTuple(Temp)->replaceAllUsesWith(AV.get()); +} + void DIBuilder::finalize() { if (!CUNode) { assert(!AllowUnresolvedNodes && @@ -62,25 +77,11 @@ void DIBuilder::finalize() { CUNode->replaceRetainedTypes(MDTuple::get(VMContext, RetainValues)); DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms); - auto resolveVariables = [&](DISubprogram *SP) { - MDTuple *Temp = SP->getVariables().get(); - if (!Temp) - return; - - SmallVector<Metadata *, 4> Variables; - - auto PV = PreservedVariables.find(SP); - if (PV != PreservedVariables.end()) - Variables.append(PV->second.begin(), PV->second.end()); - - DINodeArray AV = getOrCreateArray(Variables); - TempMDTuple(Temp)->replaceAllUsesWith(AV.get()); - }; for (auto *SP : SPs) - resolveVariables(SP); + finalizeSubprogram(SP); for (auto *N : RetainValues) if (auto *SP = dyn_cast<DISubprogram>(N)) - resolveVariables(SP); + finalizeSubprogram(SP); if (!AllGVs.empty()) CUNode->replaceGlobalVariables(MDTuple::get(VMContext, AllGVs)); diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index b7e3f0c6779e..0485fece7c42 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -99,87 +99,6 @@ DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt, return Last; } -/// Reparent \c Scope from \c OrigSP to \c NewSP. -static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope, - DISubprogram *OrigSP, DISubprogram *NewSP, - DenseMap<const MDNode *, MDNode *> &Cache) { - SmallVector<DIScope *, 3> ScopeChain; - DIScope *Last = NewSP; - DIScope *CurScope = Scope; - do { - if (auto *SP = dyn_cast<DISubprogram>(CurScope)) { - // Don't rewrite this scope chain if it doesn't lead to the replaced SP. 
- if (SP != OrigSP) - return Scope; - Cache.insert({OrigSP, NewSP}); - break; - } - if (auto *Found = Cache[CurScope]) { - Last = cast<DIScope>(Found); - break; - } - ScopeChain.push_back(CurScope); - } while ((CurScope = CurScope->getScope().resolve())); - - // Starting from the top, rebuild the nodes to point to the new inlined-at - // location (then rebuilding the rest of the chain behind it) and update the - // map of already-constructed inlined-at nodes. - for (const DIScope *MD : reverse(ScopeChain)) { - if (auto *LB = dyn_cast<DILexicalBlock>(MD)) - Cache[MD] = Last = DILexicalBlock::getDistinct( - Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn()); - else if (auto *LB = dyn_cast<DILexicalBlockFile>(MD)) - Cache[MD] = Last = DILexicalBlockFile::getDistinct( - Ctx, Last, LB->getFile(), LB->getDiscriminator()); - else - llvm_unreachable("illegal parent scope"); - } - return Last; -} - -void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, - DISubprogram *NewSP, - DenseMap<const MDNode *, MDNode *> &Cache) { - auto DL = I.getDebugLoc(); - if (!OrigSP || !NewSP || OrigSP == NewSP || !DL) - return; - - // Reparent the debug location. - auto &Ctx = I.getContext(); - DILocation *InlinedAt = DL->getInlinedAt(); - if (InlinedAt) { - while (auto *IA = InlinedAt->getInlinedAt()) - InlinedAt = IA; - auto NewScope = - reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache); - InlinedAt = - DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope); - } - I.setDebugLoc( - DebugLoc::get(DL.getLine(), DL.getCol(), - reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache), - DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache, - ReplaceLastInlinedAt))); - - // Fix up debug variables to point to NewSP. - auto reparentVar = [&](DILocalVariable *Var) { - return DILocalVariable::get( - Ctx, - cast<DILocalScope>( - reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)), - Var->getName(), Var->getFile(), Var->getLine(), Var->getType(), - Var->getArg(), Var->getFlags(), Var->getAlignInBits()); - }; - if (auto *DbgValue = dyn_cast<DbgValueInst>(&I)) { - auto *Var = DbgValue->getVariable(); - I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var))); - } else if (auto *DbgDeclare = dyn_cast<DbgDeclareInst>(&I)) { - auto *Var = DbgDeclare->getVariable(); - I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var))); - } -} - - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void DebugLoc::dump() const { if (!Loc) diff --git a/lib/IR/OptBisect.cpp b/lib/IR/OptBisect.cpp index b670c817569a..a03a6fb62237 100644 --- a/lib/IR/OptBisect.cpp +++ b/lib/IR/OptBisect.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/RegionInfo.h" #include "llvm/IR/Module.h" #include "llvm/IR/OptBisect.h" #include "llvm/Pass.h" @@ -53,13 +54,20 @@ static std::string getDescription(const BasicBlock &BB) { } static std::string getDescription(const Loop &L) { - // FIXME: I'd like to be able to provide a better description here, but - // calling L->getHeader() would introduce a new dependency on the - // LLVMCore library. + // FIXME: Move into LoopInfo so we can get a better description + // (and avoid a circular dependency between IR and Analysis). return "loop"; } +static std::string getDescription(const Region &R) { + // FIXME: Move into RegionInfo so we can get a better description + // (and avoid a circular dependency between IR and Analysis). 
+ return "region"; +} + static std::string getDescription(const CallGraphSCC &SCC) { + // FIXME: Move into CallGraphSCCPass to avoid circular dependency between + // IR and Analysis. std::string Desc = "SCC ("; bool First = true; for (CallGraphNode *CGN : SCC) { @@ -83,6 +91,7 @@ template bool OptBisect::shouldRunPass(const Pass *, const Function &); template bool OptBisect::shouldRunPass(const Pass *, const BasicBlock &); template bool OptBisect::shouldRunPass(const Pass *, const Loop &); template bool OptBisect::shouldRunPass(const Pass *, const CallGraphSCC &); +template bool OptBisect::shouldRunPass(const Pass *, const Region &); template <class UnitT> bool OptBisect::shouldRunPass(const Pass *P, const UnitT &U) { diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 9efc095f9fcf..92145aaf667a 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -122,6 +122,7 @@ static void computeCacheKey( AddUnsigned(Conf.CGOptLevel); AddUnsigned(Conf.CGFileType); AddUnsigned(Conf.OptLevel); + AddUnsigned(Conf.UseNewPM); AddString(Conf.OptPipeline); AddString(Conf.AAPipeline); AddString(Conf.OverrideTriple); @@ -621,6 +622,19 @@ unsigned LTO::getMaxTasks() const { } Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) { + // Compute "dead" symbols, we don't want to import/export these! + DenseSet<GlobalValue::GUID> GUIDPreservedSymbols; + for (auto &Res : GlobalResolutions) { + if (Res.second.VisibleOutsideThinLTO && + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to preserve any ThinLTO copies. + !Res.second.IRName.empty()) + GUIDPreservedSymbols.insert(GlobalValue::getGUID( + GlobalValue::dropLLVMManglingEscape(Res.second.IRName))); + } + + computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols); + // Save the status of having a regularLTO combined module, as // this is needed for generating the ThinLTO Task ID, and // the CombinedModule will be moved at the end of runRegularLTO. @@ -930,6 +944,17 @@ ThinBackend lto::createWriteIndexesThinBackend(std::string OldPrefix, }; } +static bool IsLiveByGUID(const ModuleSummaryIndex &Index, + GlobalValue::GUID GUID) { + auto VI = Index.getValueInfo(GUID); + if (!VI) + return false; + for (auto &I : VI.getSummaryList()) + if (Index.isGlobalValueLive(I.get())) + return true; + return false; +} + Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, bool HasRegularLTO) { if (ThinLTO.ModuleMap.empty()) @@ -962,22 +987,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR; if (Conf.OptLevel > 0) { - // Compute "dead" symbols, we don't want to import/export these! - DenseSet<GlobalValue::GUID> GUIDPreservedSymbols; - for (auto &Res : GlobalResolutions) { - if (Res.second.VisibleOutsideThinLTO && - // IRName will be defined if we have seen the prevailing copy of - // this value. If not, no need to preserve any ThinLTO copies. 
- !Res.second.IRName.empty()) - GUIDPreservedSymbols.insert(GlobalValue::getGUID( - GlobalValue::dropLLVMManglingEscape(Res.second.IRName))); - } - - auto DeadSymbols = - computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols); - ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - ImportLists, ExportLists, &DeadSymbols); + ImportLists, ExportLists); std::set<GlobalValue::GUID> ExportedGUIDs; for (auto &Res : GlobalResolutions) { @@ -992,7 +1003,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, auto GUID = GlobalValue::getGUID( GlobalValue::dropLLVMManglingEscape(Res.second.IRName)); // Mark exported unless index-based analysis determined it to be dead. - if (!DeadSymbols.count(GUID)) + if (IsLiveByGUID(ThinLTO.CombinedIndex, GUID)) ExportedGUIDs.insert(GUID); } diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index f9c41f5c9744..3f72e446cdf2 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -42,11 +42,6 @@ using namespace llvm; using namespace lto; -static cl::opt<bool> - LTOUseNewPM("lto-use-new-pm", - cl::desc("Run LTO passes using the new pass manager"), - cl::init(false), cl::Hidden); - LLVM_ATTRIBUTE_NORETURN static void reportOpenError(StringRef Path, Twine Msg) { errs() << "failed to open " << Path << ": " << Msg << '\n'; errs().flush(); @@ -266,7 +261,7 @@ bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, if (!Conf.OptPipeline.empty()) runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline, Conf.DisableVerify); - else if (LTOUseNewPM) + else if (Conf.UseNewPM) runNewPMPasses(Mod, TM, Conf.OptLevel, IsThinLTO); else runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index ca3fc60f9501..6b221a347c17 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -628,13 +628,13 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, PreservedSymbols, Triple(TheModule.getTargetTriple())); // Compute "dead" symbols, we don't want to import/export these! - auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + computeDeadSymbols(Index, GUIDPreservedSymbols); // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists, &DeadSymbols); + ExportLists); // Resolve LinkOnce/Weak symbols. StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR; @@ -673,13 +673,13 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, PreservedSymbols, Triple(TheModule.getTargetTriple())); // Compute "dead" symbols, we don't want to import/export these! 
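The LTO.cpp hunks are one interface change seen from both sides: computeDeadSymbols no longer returns a DenseSet of dead GUIDs, it marks liveness on the summaries stored in the combined index, and consumers such as the new IsLiveByGUID helper query the index instead of a side table. A reduced sketch of the query side, with stand-in types in place of ModuleSummaryIndex:

#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

using GUID = std::uint64_t;

struct GlobalValueSummary {
  bool Live = false; // set by the analysis instead of being returned
};

struct SummaryIndex {
  std::unordered_map<GUID, std::vector<std::unique_ptr<GlobalValueSummary>>>
      Summaries;

  // A GUID is live if any of its summaries (one per defining module,
  // e.g. for linkonce_odr) was marked live.
  bool isLive(GUID G) const {
    auto It = Summaries.find(G);
    if (It == Summaries.end())
      return false;
    for (const auto &S : It->second)
      if (S->Live)
        return true;
    return false;
  }
};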
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + computeDeadSymbols(Index, GUIDPreservedSymbols); // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists, &DeadSymbols); + ExportLists); auto &ImportList = ImportLists[TheModule.getModuleIdentifier()]; crossImportIntoModule(TheModule, Index, ModuleMap, ImportList); @@ -750,13 +750,13 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); // Compute "dead" symbols, we don't want to import/export these! - auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + computeDeadSymbols(Index, GUIDPreservedSymbols); // Generate import/export list StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists, &DeadSymbols); + ExportLists); auto &ExportList = ExportLists[ModuleIdentifier]; // Be friendly and don't nuke totally the module when the client didn't @@ -902,14 +902,14 @@ void ThinLTOCodeGenerator::run() { computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); // Compute "dead" symbols, we don't want to import/export these! - auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols); + computeDeadSymbols(*Index, GUIDPreservedSymbols); // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount); StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount); ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists, &DeadSymbols); + ExportLists); // We use a std::map here to be able to have a defined ordering when // producing a hash for the cache entry. diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 8c3df36cfb48..9b2031f05043 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -40,6 +40,7 @@ using namespace llvm; #define DEBUG_TYPE "reloc-info" namespace { + // For patching purposes, we need to remember where each section starts, both // for patching up the section size field, and for patching up references to // locations within the section. @@ -50,6 +51,82 @@ struct SectionBookkeeping { uint64_t ContentsOffset; }; +// The signature of a wasm function, in a struct capable of being used as a +// DenseMap key. +struct WasmFunctionType { + // Support empty and tombstone instances, needed by DenseMap. + enum { Plain, Empty, Tombstone } State; + + // The return types of the function. + SmallVector<wasm::ValType, 1> Returns; + + // The parameter types of the function. + SmallVector<wasm::ValType, 4> Params; + + WasmFunctionType() : State(Plain) {} + + bool operator==(const WasmFunctionType &Other) const { + return State == Other.State && Returns == Other.Returns && + Params == Other.Params; + } +}; + +// Traits for using WasmFunctionType in a DenseMap. 
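The WasmFunctionType struct above reserves Empty and Tombstone sentinel states because DenseMap needs two keys that can never equal a real key, plus hashing and equality for all three. The general recipe, shown here by specializing llvm::DenseMapInfo for a hypothetical two-field key (the wasm writer instead keeps its traits in a local struct and passes it as DenseMap's third template parameter):

#include "llvm/ADT/DenseMap.h"

// Hypothetical key with sentinel states, mirroring the scheme used by
// WasmFunctionType: Empty and Tombstone can never collide with a real key.
struct Key {
  enum { Plain, Empty, Tombstone } State = Plain;
  int A = 0, B = 0;
  bool operator==(const Key &O) const {
    return State == O.State && A == O.A && B == O.B;
  }
};

namespace llvm {
template <> struct DenseMapInfo<Key> {
  static Key getEmptyKey() { Key K; K.State = Key::Empty; return K; }
  static Key getTombstoneKey() { Key K; K.State = Key::Tombstone; return K; }
  static unsigned getHashValue(const Key &K) {
    return unsigned(K.State) * 37u +
           DenseMapInfo<int>::getHashValue(K.A) * 31u +
           DenseMapInfo<int>::getHashValue(K.B);
  }
  static bool isEqual(const Key &L, const Key &R) { return L == R; }
};
} // end namespace llvm

// Usage: llvm::DenseMap<Key, int> M; // picks up the specialization above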
+struct WasmFunctionTypeDenseMapInfo { + static WasmFunctionType getEmptyKey() { + WasmFunctionType FuncTy; + FuncTy.State = WasmFunctionType::Empty; + return FuncTy; + } + static WasmFunctionType getTombstoneKey() { + WasmFunctionType FuncTy; + FuncTy.State = WasmFunctionType::Tombstone; + return FuncTy; + } + static unsigned getHashValue(const WasmFunctionType &FuncTy) { + uintptr_t Value = FuncTy.State; + for (wasm::ValType Ret : FuncTy.Returns) + Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret)); + for (wasm::ValType Param : FuncTy.Params) + Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param)); + return Value; + } + static bool isEqual(const WasmFunctionType &LHS, + const WasmFunctionType &RHS) { + return LHS == RHS; + } +}; + +// A wasm import to be written into the import section. +struct WasmImport { + StringRef ModuleName; + StringRef FieldName; + unsigned Kind; + int32_t Type; +}; + +// A wasm function to be written into the function section. +struct WasmFunction { + int32_t Type; + const MCSymbolWasm *Sym; +}; + +// A wasm export to be written into the export section. +struct WasmExport { + StringRef FieldName; + unsigned Kind; + uint32_t Index; +}; + +// A wasm global to be written into the global section. +struct WasmGlobal { + wasm::ValType Type; + bool IsMutable; + bool HasImport; + uint64_t InitialValue; + uint32_t ImportIndex; +}; + class WasmObjectWriter : public MCObjectWriter { /// Helper struct for containing some precomputed information on symbols. struct WasmSymbolData { @@ -91,18 +168,10 @@ public: : MCObjectWriter(OS, /*IsLittleEndian=*/true), TargetObjectWriter(MOTW) {} private: - void reset() override { - MCObjectWriter::reset(); - } - ~WasmObjectWriter() override; void writeHeader(const MCAssembler &Asm); - void writeValueType(wasm::ValType Ty) { - encodeSLEB128(int32_t(Ty), getStream()); - } - void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, bool &IsPCRel, @@ -112,7 +181,37 @@ private: const MCAsmLayout &Layout) override; void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + + void writeValueType(wasm::ValType Ty) { + encodeSLEB128(int32_t(Ty), getStream()); + } + + void writeTypeSection(const SmallVector<WasmFunctionType, 4> &FunctionTypes); + void writeImportSection(const SmallVector<WasmImport, 4> &Imports); + void writeFunctionSection(const SmallVector<WasmFunction, 4> &Functions); + void writeTableSection(const SmallVector<uint32_t, 4> &TableElems); + void writeMemorySection(const SmallVector<char, 0> &DataBytes); + void writeGlobalSection(const SmallVector<WasmGlobal, 4> &Globals); + void writeExportSection(const SmallVector<WasmExport, 4> &Exports); + void writeElemSection(const SmallVector<uint32_t, 4> &TableElems); + void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout, + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices, + const SmallVector<WasmFunction, 4> &Functions); + uint64_t + writeDataSection(const SmallVector<char, 0> &DataBytes, + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices); + void writeNameSection(const SmallVector<WasmFunction, 4> &Functions, + const SmallVector<WasmImport, 4> &Imports, + uint32_t NumFuncImports); + void writeCodeRelocSection( + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices); + void writeDataRelocSection( + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices, + uint64_t DataSectionHeaderSize); + void writeLinkingMetaDataSection(bool HasStackPointer, + 
uint32_t StackPointerGlobal); }; + } // end anonymous namespace WasmObjectWriter::~WasmObjectWriter() {} @@ -278,86 +377,6 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, DataRelocations.push_back(Rec); } -namespace { - -// The signature of a wasm function, in a struct capable of being used as a -// DenseMap key. -struct WasmFunctionType { - // Support empty and tombstone instances, needed by DenseMap. - enum { Plain, Empty, Tombstone } State; - - // The return types of the function. - SmallVector<wasm::ValType, 1> Returns; - - // The parameter types of the function. - SmallVector<wasm::ValType, 4> Params; - - WasmFunctionType() : State(Plain) {} - - bool operator==(const WasmFunctionType &Other) const { - return State == Other.State && Returns == Other.Returns && - Params == Other.Params; - } -}; - -// Traits for using WasmFunctionType in a DenseMap. -struct WasmFunctionTypeDenseMapInfo { - static WasmFunctionType getEmptyKey() { - WasmFunctionType FuncTy; - FuncTy.State = WasmFunctionType::Empty; - return FuncTy; - } - static WasmFunctionType getTombstoneKey() { - WasmFunctionType FuncTy; - FuncTy.State = WasmFunctionType::Tombstone; - return FuncTy; - } - static unsigned getHashValue(const WasmFunctionType &FuncTy) { - uintptr_t Value = FuncTy.State; - for (wasm::ValType Ret : FuncTy.Returns) - Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret)); - for (wasm::ValType Param : FuncTy.Params) - Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param)); - return Value; - } - static bool isEqual(const WasmFunctionType &LHS, - const WasmFunctionType &RHS) { - return LHS == RHS; - } -}; - -// A wasm import to be written into the import section. -struct WasmImport { - StringRef ModuleName; - StringRef FieldName; - unsigned Kind; - int32_t Type; -}; - -// A wasm function to be written into the function section. -struct WasmFunction { - int32_t Type; - const MCSymbolWasm *Sym; -}; - -// A wasm export to be written into the export section. -struct WasmExport { - StringRef FieldName; - unsigned Kind; - uint32_t Index; -}; - -// A wasm global to be written into the global section. -struct WasmGlobal { - wasm::ValType Type; - bool IsMutable; - bool HasImport; - uint64_t InitialValue; - uint32_t ImportIndex; -}; - -} // end anonymous namespace - // Write X as an (unsigned) LEB value at offset Offset in Stream, padded // to allow patching. 
static void @@ -529,6 +548,367 @@ static void WriteTypeRelocations( } } +void WasmObjectWriter::writeTypeSection( + const SmallVector<WasmFunctionType, 4> &FunctionTypes) { + if (FunctionTypes.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_TYPE); + + encodeULEB128(FunctionTypes.size(), getStream()); + + for (const WasmFunctionType &FuncTy : FunctionTypes) { + encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream()); + encodeULEB128(FuncTy.Params.size(), getStream()); + for (wasm::ValType Ty : FuncTy.Params) + writeValueType(Ty); + encodeULEB128(FuncTy.Returns.size(), getStream()); + for (wasm::ValType Ty : FuncTy.Returns) + writeValueType(Ty); + } + + endSection(Section); +} + +void WasmObjectWriter::writeImportSection( + const SmallVector<WasmImport, 4> &Imports) { + if (Imports.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_IMPORT); + + encodeULEB128(Imports.size(), getStream()); + for (const WasmImport &Import : Imports) { + StringRef ModuleName = Import.ModuleName; + encodeULEB128(ModuleName.size(), getStream()); + writeBytes(ModuleName); + + StringRef FieldName = Import.FieldName; + encodeULEB128(FieldName.size(), getStream()); + writeBytes(FieldName); + + encodeULEB128(Import.Kind, getStream()); + + switch (Import.Kind) { + case wasm::WASM_EXTERNAL_FUNCTION: + encodeULEB128(Import.Type, getStream()); + break; + case wasm::WASM_EXTERNAL_GLOBAL: + encodeSLEB128(int32_t(Import.Type), getStream()); + encodeULEB128(0, getStream()); // mutability + break; + default: + llvm_unreachable("unsupported import kind"); + } + } + + endSection(Section); +} + +void WasmObjectWriter::writeFunctionSection( + const SmallVector<WasmFunction, 4> &Functions) { + if (Functions.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_FUNCTION); + + encodeULEB128(Functions.size(), getStream()); + for (const WasmFunction &Func : Functions) + encodeULEB128(Func.Type, getStream()); + + endSection(Section); +} + +void WasmObjectWriter::writeTableSection( + const SmallVector<uint32_t, 4> &TableElems) { + // For now, always emit the table section, since indirect calls are not + // valid without it. In the future, we could perhaps be more clever and omit + // it if there are no indirect calls. + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_TABLE); + + // The number of tables, fixed to 1 for now. + encodeULEB128(1, getStream()); + + encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream()); + + encodeULEB128(0, getStream()); // flags + encodeULEB128(TableElems.size(), getStream()); // initial + + endSection(Section); +} + +void WasmObjectWriter::writeMemorySection( + const SmallVector<char, 0> &DataBytes) { + // For now, always emit the memory section, since loads and stores are not + // valid without it. In the future, we could perhaps be more clever and omit + // it if there are no loads or stores. 
+ SectionBookkeeping Section; + uint32_t NumPages = + (DataBytes.size() + wasm::WasmPageSize - 1) / wasm::WasmPageSize; + + startSection(Section, wasm::WASM_SEC_MEMORY); + encodeULEB128(1, getStream()); // number of memory spaces + + encodeULEB128(0, getStream()); // flags + encodeULEB128(NumPages, getStream()); // initial + + endSection(Section); +} + +void WasmObjectWriter::writeGlobalSection( + const SmallVector<WasmGlobal, 4> &Globals) { + if (Globals.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_GLOBAL); + + encodeULEB128(Globals.size(), getStream()); + for (const WasmGlobal &Global : Globals) { + writeValueType(Global.Type); + write8(Global.IsMutable); + + if (Global.HasImport) { + assert(Global.InitialValue == 0); + write8(wasm::WASM_OPCODE_GET_GLOBAL); + encodeULEB128(Global.ImportIndex, getStream()); + } else { + assert(Global.ImportIndex == 0); + write8(wasm::WASM_OPCODE_I32_CONST); + encodeSLEB128(Global.InitialValue, getStream()); // offset + } + write8(wasm::WASM_OPCODE_END); + } + + endSection(Section); +} + +void WasmObjectWriter::writeExportSection( + const SmallVector<WasmExport, 4> &Exports) { + if (Exports.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_EXPORT); + + encodeULEB128(Exports.size(), getStream()); + for (const WasmExport &Export : Exports) { + encodeULEB128(Export.FieldName.size(), getStream()); + writeBytes(Export.FieldName); + + encodeSLEB128(Export.Kind, getStream()); + + encodeULEB128(Export.Index, getStream()); + } + + endSection(Section); +} + +void WasmObjectWriter::writeElemSection( + const SmallVector<uint32_t, 4> &TableElems) { + if (TableElems.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_ELEM); + + encodeULEB128(1, getStream()); // number of "segments" + encodeULEB128(0, getStream()); // the table index + + // init expr for starting offset + write8(wasm::WASM_OPCODE_I32_CONST); + encodeSLEB128(0, getStream()); + write8(wasm::WASM_OPCODE_END); + + encodeULEB128(TableElems.size(), getStream()); + for (uint32_t Elem : TableElems) + encodeULEB128(Elem, getStream()); + + endSection(Section); +} + +void WasmObjectWriter::writeCodeSection( + const MCAssembler &Asm, const MCAsmLayout &Layout, + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices, + const SmallVector<WasmFunction, 4> &Functions) { + if (Functions.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_CODE); + + encodeULEB128(Functions.size(), getStream()); + + for (const WasmFunction &Func : Functions) { + MCSectionWasm &FuncSection = + static_cast<MCSectionWasm &>(Func.Sym->getSection()); + + if (Func.Sym->isVariable()) + report_fatal_error("weak symbols not supported yet"); + + if (Func.Sym->getOffset() != 0) + report_fatal_error("function sections must contain one function each"); + + if (!Func.Sym->getSize()) + report_fatal_error("function symbols must have a size set with .size"); + + int64_t Size = 0; + if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout)) + report_fatal_error(".size expression must be evaluatable"); + + encodeULEB128(Size, getStream()); + + FuncSection.setSectionOffset(getStream().tell() - + Section.ContentsOffset); + + Asm.writeSectionData(&FuncSection, Layout); + } + + // Apply the type index fixups for call_indirect etc. instructions. 
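Every section body in the writers above is length-prefixed in LEB128: unsigned LEB for counts, sizes, and indices, signed LEB for type codes such as WASM_TYPE_FUNC, and memory is sized in whole 64 KiB pages via the usual round-up division. A self-contained sketch of the unsigned encoding and the page rounding (stand-ins, not the llvm::encodeULEB128 helper):

#include <cstdint>
#include <vector>

// Minimal unsigned LEB128 encoder: 7 payload bits per byte, high bit set
// on every byte except the last.
static void encodeULEB(std::uint64_t Value, std::vector<std::uint8_t> &Out) {
  do {
    std::uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
}

// Rounding a data size up to whole 64 KiB wasm pages, as in
// writeMemorySection above.
static std::uint64_t pagesFor(std::uint64_t Bytes) {
  const std::uint64_t PageSize = 65536;
  return (Bytes + PageSize - 1) / PageSize;
}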
+ for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) { + uint32_t Type = TypeIndexFixupTypes[i]; + unsigned Padding = PaddingFor5ByteULEB128(Type); + + const WasmRelocationEntry &Fixup = TypeIndexFixups[i]; + assert(Fixup.Addend == 0); + assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB); + uint64_t Offset = Fixup.Offset + + Fixup.FixupSection->getSectionOffset(); + + uint8_t Buffer[16]; + unsigned SizeLen = encodeULEB128(Type, Buffer, Padding); + assert(SizeLen == 5); + getStream().pwrite((char *)Buffer, SizeLen, + Section.ContentsOffset + Offset); + } + + // Apply fixups. + ApplyRelocations(CodeRelocations, getStream(), SymbolIndices, + Section.ContentsOffset); + + endSection(Section); +} + +uint64_t WasmObjectWriter::writeDataSection( + const SmallVector<char, 0> &DataBytes, + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices) { + if (DataBytes.empty()) + return 0; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_DATA); + + encodeULEB128(1, getStream()); // count + encodeULEB128(0, getStream()); // memory index + write8(wasm::WASM_OPCODE_I32_CONST); + encodeSLEB128(0, getStream()); // offset + write8(wasm::WASM_OPCODE_END); + encodeULEB128(DataBytes.size(), getStream()); // size + uint32_t HeaderSize = getStream().tell() - Section.ContentsOffset; + writeBytes(DataBytes); // data + + // Apply fixups. + ApplyRelocations(DataRelocations, getStream(), SymbolIndices, + Section.ContentsOffset + HeaderSize); + + endSection(Section); + return HeaderSize; +} + +void WasmObjectWriter::writeNameSection( + const SmallVector<WasmFunction, 4> &Functions, + const SmallVector<WasmImport, 4> &Imports, + unsigned NumFuncImports) { + uint32_t TotalFunctions = NumFuncImports + Functions.size(); + if (TotalFunctions == 0) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_CUSTOM, "name"); + SectionBookkeeping SubSection; + startSection(SubSection, wasm::WASM_NAMES_FUNCTION); + + encodeULEB128(TotalFunctions, getStream()); + uint32_t Index = 0; + for (const WasmImport &Import : Imports) { + if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) { + encodeULEB128(Index, getStream()); + encodeULEB128(Import.FieldName.size(), getStream()); + writeBytes(Import.FieldName); + ++Index; + } + } + for (const WasmFunction &Func : Functions) { + encodeULEB128(Index, getStream()); + encodeULEB128(Func.Sym->getName().size(), getStream()); + writeBytes(Func.Sym->getName()); + ++Index; + } + + endSection(SubSection); + endSection(Section); +} + +void WasmObjectWriter::writeCodeRelocSection( + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices) { + // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md + // for descriptions of the reloc sections. + + if (CodeRelocations.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE"); + + encodeULEB128(wasm::WASM_SEC_CODE, getStream()); + encodeULEB128(CodeRelocations.size() + TypeIndexFixups.size(), getStream()); + + WriteRelocations(CodeRelocations, getStream(), SymbolIndices, 0); + WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream()); + + endSection(Section); +} + +void WasmObjectWriter::writeDataRelocSection( + DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices, + uint64_t DataSectionHeaderSize) { + // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md + // for descriptions of the reloc sections. 
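The type-index fixup loop in writeCodeSection works because each call_indirect immediate was emitted at a fixed five-byte ULEB128 width: forcing the continuation bit on the leading bytes reserves a constant-size field, so the final value can be pwrite'n over the placeholder without shifting anything after it. A sketch of such a fixed-width encoder (an assumed helper, not the in-tree PaddingFor5ByteULEB128/encodeULEB128 pair):

#include <cassert>
#include <cstdint>

// Emit Value as exactly five ULEB128 bytes: the continuation bit is
// forced on the first four, so the field has a fixed width and can be
// overwritten in place once the real value is known.
static void writePaddedULEB128(std::uint32_t Value, std::uint8_t Out[5]) {
  for (int I = 0; I != 4; ++I) {
    Out[I] = std::uint8_t(Value & 0x7f) | 0x80;
    Value >>= 7;
  }
  assert(Value <= 0x7f && "a uint32_t always fits: 4*7 bits consumed");
  Out[4] = std::uint8_t(Value); // last byte: continuation bit clear
}

Decoders accept the non-minimal form, since the wasm spec only caps a u32 LEB at five bytes.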
+ + if (DataRelocations.empty()) + return; + + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA"); + + encodeULEB128(wasm::WASM_SEC_DATA, getStream()); + encodeULEB128(DataRelocations.size(), getStream()); + + WriteRelocations(DataRelocations, getStream(), SymbolIndices, + DataSectionHeaderSize); + + endSection(Section); +} + +void WasmObjectWriter::writeLinkingMetaDataSection( + bool HasStackPointer, uint32_t StackPointerGlobal) { + if (!HasStackPointer) + return; + SectionBookkeeping Section; + startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); + + encodeULEB128(1, getStream()); // count + + encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type + encodeULEB128(StackPointerGlobal, getStream()); // id + + endSection(Section); +} + void WasmObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { MCContext &Ctx = Asm.getContext(); @@ -730,16 +1110,21 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, if (IsAddressTaken.count(&WS)) TableElems.push_back(Index); } else { - if (WS.getOffset() != 0) - report_fatal_error("data sections must contain one variable each"); - if (!WS.getSize()) - report_fatal_error("data symbols must have a size set with .size"); - - int64_t Size = 0; - if (!WS.getSize()->evaluateAsAbsolute(Size, Layout)) - report_fatal_error(".size expression must be evaluatable"); + if (WS.isTemporary() && !WS.getSize()) + continue; if (WS.isDefined(false)) { + if (WS.getOffset() != 0) + report_fatal_error("data sections must contain one variable each: " + + WS.getName()); + if (!WS.getSize()) + report_fatal_error("data symbols must have a size set with .size: " + + WS.getName()); + + int64_t Size = 0; + if (!WS.getSize()->evaluateAsAbsolute(Size, Layout)) + report_fatal_error(".size expression must be evaluatable"); + MCSectionWasm &DataSection = static_cast<MCSectionWasm &>(WS.getSection()); @@ -827,322 +1212,23 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, // Write out the Wasm header. 
writeHeader(Asm); - SectionBookkeeping Section; - - // === Type Section ========================================================= - if (!FunctionTypes.empty()) { - startSection(Section, wasm::WASM_SEC_TYPE); - - encodeULEB128(FunctionTypes.size(), getStream()); - - for (WasmFunctionType &FuncTy : FunctionTypes) { - encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream()); - encodeULEB128(FuncTy.Params.size(), getStream()); - for (wasm::ValType Ty : FuncTy.Params) - writeValueType(Ty); - encodeULEB128(FuncTy.Returns.size(), getStream()); - for (wasm::ValType Ty : FuncTy.Returns) - writeValueType(Ty); - } - - endSection(Section); - } - - // === Import Section ======================================================== - if (!Imports.empty()) { - startSection(Section, wasm::WASM_SEC_IMPORT); - - encodeULEB128(Imports.size(), getStream()); - for (const WasmImport &Import : Imports) { - StringRef ModuleName = Import.ModuleName; - encodeULEB128(ModuleName.size(), getStream()); - writeBytes(ModuleName); - - StringRef FieldName = Import.FieldName; - encodeULEB128(FieldName.size(), getStream()); - writeBytes(FieldName); - - encodeULEB128(Import.Kind, getStream()); - - switch (Import.Kind) { - case wasm::WASM_EXTERNAL_FUNCTION: - encodeULEB128(Import.Type, getStream()); - break; - case wasm::WASM_EXTERNAL_GLOBAL: - encodeSLEB128(int32_t(Import.Type), getStream()); - encodeULEB128(0, getStream()); // mutability - break; - default: - llvm_unreachable("unsupported import kind"); - } - } - - endSection(Section); - } - - // === Function Section ====================================================== - if (!Functions.empty()) { - startSection(Section, wasm::WASM_SEC_FUNCTION); - - encodeULEB128(Functions.size(), getStream()); - for (const WasmFunction &Func : Functions) - encodeULEB128(Func.Type, getStream()); - - endSection(Section); - } - - // === Table Section ========================================================= - // For now, always emit the table section, since indirect calls are not - // valid without it. In the future, we could perhaps be more clever and omit - // it if there are no indirect calls. - startSection(Section, wasm::WASM_SEC_TABLE); - - // The number of tables, fixed to 1 for now. - encodeULEB128(1, getStream()); - - encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream()); - - encodeULEB128(0, getStream()); // flags - encodeULEB128(TableElems.size(), getStream()); // initial - - endSection(Section); - - // === Memory Section ======================================================== - // For now, always emit the memory section, since loads and stores are not - // valid without it. In the future, we could perhaps be more clever and omit - // it if there are no loads or stores. 
- uint32_t NumPages = - (DataBytes.size() + wasm::WasmPageSize - 1) / wasm::WasmPageSize; - - startSection(Section, wasm::WASM_SEC_MEMORY); - encodeULEB128(1, getStream()); // number of memory spaces - - encodeULEB128(0, getStream()); // flags - encodeULEB128(NumPages, getStream()); // initial - - endSection(Section); - - // === Global Section ======================================================== - if (!Globals.empty()) { - startSection(Section, wasm::WASM_SEC_GLOBAL); - - encodeULEB128(Globals.size(), getStream()); - for (const WasmGlobal &Global : Globals) { - writeValueType(Global.Type); - write8(Global.IsMutable); - - if (Global.HasImport) { - assert(Global.InitialValue == 0); - write8(wasm::WASM_OPCODE_GET_GLOBAL); - encodeULEB128(Global.ImportIndex, getStream()); - } else { - assert(Global.ImportIndex == 0); - write8(wasm::WASM_OPCODE_I32_CONST); - encodeSLEB128(Global.InitialValue, getStream()); // offset - } - write8(wasm::WASM_OPCODE_END); - } - - endSection(Section); - } - - // === Export Section ======================================================== - if (!Exports.empty()) { - startSection(Section, wasm::WASM_SEC_EXPORT); - - encodeULEB128(Exports.size(), getStream()); - for (const WasmExport &Export : Exports) { - encodeULEB128(Export.FieldName.size(), getStream()); - writeBytes(Export.FieldName); - - encodeSLEB128(Export.Kind, getStream()); - - encodeULEB128(Export.Index, getStream()); - } - - endSection(Section); - } - -#if 0 // TODO: Start Section - if (HaveStartFunction) { - // === Start Section ========================================================= - startSection(Section, wasm::WASM_SEC_START); - - encodeSLEB128(StartFunction, getStream()); - - endSection(Section); - } -#endif - - // === Elem Section ========================================================== - if (!TableElems.empty()) { - startSection(Section, wasm::WASM_SEC_ELEM); - - encodeULEB128(1, getStream()); // number of "segments" - encodeULEB128(0, getStream()); // the table index - - // init expr for starting offset - write8(wasm::WASM_OPCODE_I32_CONST); - encodeSLEB128(0, getStream()); - write8(wasm::WASM_OPCODE_END); - - encodeULEB128(TableElems.size(), getStream()); - for (uint32_t Elem : TableElems) - encodeULEB128(Elem, getStream()); - - endSection(Section); - } - - // === Code Section ========================================================== - if (!Functions.empty()) { - startSection(Section, wasm::WASM_SEC_CODE); - - encodeULEB128(Functions.size(), getStream()); - - for (const WasmFunction &Func : Functions) { - MCSectionWasm &FuncSection = - static_cast<MCSectionWasm &>(Func.Sym->getSection()); - - if (Func.Sym->isVariable()) - report_fatal_error("weak symbols not supported yet"); - - if (Func.Sym->getOffset() != 0) - report_fatal_error("function sections must contain one function each"); - - if (!Func.Sym->getSize()) - report_fatal_error("function symbols must have a size set with .size"); - - int64_t Size = 0; - if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout)) - report_fatal_error(".size expression must be evaluatable"); - - encodeULEB128(Size, getStream()); - - FuncSection.setSectionOffset(getStream().tell() - - Section.ContentsOffset); - - Asm.writeSectionData(&FuncSection, Layout); - } - - // Apply the type index fixups for call_indirect etc. instructions. 
- for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) { - uint32_t Type = TypeIndexFixupTypes[i]; - unsigned Padding = PaddingFor5ByteULEB128(Type); - - const WasmRelocationEntry &Fixup = TypeIndexFixups[i]; - assert(Fixup.Addend == 0); - assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB); - uint64_t Offset = Fixup.Offset + - Fixup.FixupSection->getSectionOffset(); - - uint8_t Buffer[16]; - unsigned SizeLen = encodeULEB128(Type, Buffer, Padding); - assert(SizeLen == 5); - getStream().pwrite((char *)Buffer, SizeLen, - Section.ContentsOffset + Offset); - } - - // Apply fixups. - ApplyRelocations(CodeRelocations, getStream(), SymbolIndices, - Section.ContentsOffset); - - endSection(Section); - } - - // === Data Section ========================================================== - uint32_t DataSectionHeaderSize = 0; - if (!DataBytes.empty()) { - startSection(Section, wasm::WASM_SEC_DATA); - - encodeULEB128(1, getStream()); // count - encodeULEB128(0, getStream()); // memory index - write8(wasm::WASM_OPCODE_I32_CONST); - encodeSLEB128(0, getStream()); // offset - write8(wasm::WASM_OPCODE_END); - encodeULEB128(DataBytes.size(), getStream()); // size - DataSectionHeaderSize = getStream().tell() - Section.ContentsOffset; - writeBytes(DataBytes); // data - - // Apply fixups. - ApplyRelocations(DataRelocations, getStream(), SymbolIndices, - Section.ContentsOffset + DataSectionHeaderSize); - - endSection(Section); - } - - // === Name Section ========================================================== - uint32_t TotalFunctions = NumFuncImports + Functions.size(); - if (TotalFunctions != 0) { - startSection(Section, wasm::WASM_SEC_CUSTOM, "name"); - SectionBookkeeping SubSection; - startSection(SubSection, wasm::WASM_NAMES_FUNCTION); - - encodeULEB128(TotalFunctions, getStream()); - uint32_t Index = 0; - for (const WasmImport &Import : Imports) { - if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) { - encodeULEB128(Index, getStream()); - encodeULEB128(Import.FieldName.size(), getStream()); - writeBytes(Import.FieldName); - ++Index; - } - } - for (const WasmFunction &Func : Functions) { - encodeULEB128(Index, getStream()); - encodeULEB128(Func.Sym->getName().size(), getStream()); - writeBytes(Func.Sym->getName()); - ++Index; - } - - endSection(SubSection); - endSection(Section); - } - - // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md - // for descriptions of the reloc sections. 
- - // === Code Reloc Section ==================================================== - if (!CodeRelocations.empty()) { - startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE"); - - encodeULEB128(wasm::WASM_SEC_CODE, getStream()); - - encodeULEB128(CodeRelocations.size() + TypeIndexFixups.size(), getStream()); - - WriteRelocations(CodeRelocations, getStream(), SymbolIndices, 0); - WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream()); - - endSection(Section); - } - - // === Data Reloc Section ==================================================== - if (!DataRelocations.empty()) { - startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA"); - - encodeULEB128(wasm::WASM_SEC_DATA, getStream()); - - encodeULEB128(DataRelocations.size(), getStream()); - - WriteRelocations(DataRelocations, getStream(), SymbolIndices, - DataSectionHeaderSize); - - endSection(Section); - } - - // === Linking Metadata Section ============================================== - if (HasStackPointer) { - startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); - - encodeULEB128(1, getStream()); // count - - encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type - encodeULEB128(StackPointerGlobal, getStream()); // id - - endSection(Section); - } + writeTypeSection(FunctionTypes); + writeImportSection(Imports); + writeFunctionSection(Functions); + writeTableSection(TableElems); + writeMemorySection(DataBytes); + writeGlobalSection(Globals); + writeExportSection(Exports); + // TODO: Start Section + writeElemSection(TableElems); + writeCodeSection(Asm, Layout, SymbolIndices, Functions); + uint64_t DataSectionHeaderSize = writeDataSection(DataBytes, SymbolIndices); + writeNameSection(Functions, Imports, NumFuncImports); + writeCodeRelocSection(SymbolIndices); + writeDataRelocSection(SymbolIndices, DataSectionHeaderSize); + writeLinkingMetaDataSection(HasStackPointer, StackPointerGlobal); // TODO: Translate the .comment section to the output. - // TODO: Translate debug sections to the output. 
} diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp index f652ff57f30d..21d29835624e 100644 --- a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp +++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp @@ -17,6 +17,11 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" @@ -36,16 +41,80 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false) +LLVM_YAML_DECLARE_ENUM_TRAITS(DebugSubsectionKind) LLVM_YAML_DECLARE_ENUM_TRAITS(FileChecksumKind) LLVM_YAML_DECLARE_BITSET_TRAITS(LineFlags) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineEntry) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceColumnEntry) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileChecksumEntry) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineInfo) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineBlock) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeInfo) -LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeSite) +LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceColumnEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceFileChecksumEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineBlock) +LLVM_YAML_DECLARE_MAPPING_TRAITS(InlineeSite) + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct YAMLSubsectionBase { + explicit YAMLSubsectionBase(DebugSubsectionKind Kind) : Kind(Kind) {} + DebugSubsectionKind Kind; + virtual ~YAMLSubsectionBase() {} + + virtual void map(IO &IO) = 0; + virtual std::unique_ptr<DebugSubsection> + toCodeViewSubsection(DebugStringTableSubsection *UseStrings, + DebugChecksumsSubsection *UseChecksums) const = 0; +}; +} +} +} + +namespace { +struct YAMLChecksumsSubsection : public YAMLSubsectionBase { + YAMLChecksumsSubsection() + : YAMLSubsectionBase(DebugSubsectionKind::FileChecksums) {} + + void map(IO &IO) override; + std::unique_ptr<DebugSubsection> + toCodeViewSubsection(DebugStringTableSubsection *Strings, + DebugChecksumsSubsection *Checksums) const override; + static Expected<std::shared_ptr<YAMLChecksumsSubsection>> + fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &FC); + + std::vector<SourceFileChecksumEntry> Checksums; +}; + +struct YAMLLinesSubsection : public YAMLSubsectionBase { + YAMLLinesSubsection() : YAMLSubsectionBase(DebugSubsectionKind::Lines) {} + + void map(IO &IO) override; + std::unique_ptr<DebugSubsection> + toCodeViewSubsection(DebugStringTableSubsection *Strings, + DebugChecksumsSubsection *Checksums) const override; + static Expected<std::shared_ptr<YAMLLinesSubsection>> + fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, + const DebugLinesSubsectionRef &Lines); + + SourceLineInfo Lines; +}; + +struct YAMLInlineeLinesSubsection : public YAMLSubsectionBase { + YAMLInlineeLinesSubsection() + : YAMLSubsectionBase(DebugSubsectionKind::InlineeLines) 
{} + + void map(IO &IO) override; + std::unique_ptr<DebugSubsection> + toCodeViewSubsection(DebugStringTableSubsection *Strings, + DebugChecksumsSubsection *Checksums) const override; + static Expected<std::shared_ptr<YAMLInlineeLinesSubsection>> + fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, + const DebugInlineeLinesSubsectionRef &Lines); + + InlineeInfo InlineeLines; +}; +} void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) { io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns); @@ -99,21 +168,6 @@ void MappingTraits<SourceFileChecksumEntry>::mapping( IO.mapRequired("Checksum", Obj.ChecksumBytes); } -void MappingTraits<SourceLineInfo>::mapping(IO &IO, SourceLineInfo &Obj) { - IO.mapRequired("CodeSize", Obj.CodeSize); - - IO.mapRequired("Flags", Obj.Flags); - IO.mapRequired("RelocOffset", Obj.RelocOffset); - IO.mapRequired("RelocSegment", Obj.RelocSegment); - IO.mapRequired("Blocks", Obj.Blocks); -} - -void MappingTraits<SourceFileInfo>::mapping(IO &IO, SourceFileInfo &Obj) { - IO.mapOptional("Checksums", Obj.FileChecksums); - IO.mapOptional("Lines", Obj.LineFragments); - IO.mapOptional("InlineeLines", Obj.Inlinees); -} - void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) { IO.mapRequired("FileName", Obj.FileName); IO.mapRequired("LineNum", Obj.SourceLineNum); @@ -121,7 +175,310 @@ void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) { IO.mapOptional("ExtraFiles", Obj.ExtraFiles); } -void MappingTraits<InlineeInfo>::mapping(IO &IO, InlineeInfo &Obj) { - IO.mapRequired("HasExtraFiles", Obj.HasExtraFiles); - IO.mapRequired("Sites", Obj.Sites); +void YAMLChecksumsSubsection::map(IO &IO) { + IO.mapTag("!FileChecksums", true); + IO.mapRequired("Checksums", Checksums); +} + +void YAMLLinesSubsection::map(IO &IO) { + IO.mapTag("!Lines", true); + IO.mapRequired("CodeSize", Lines.CodeSize); + + IO.mapRequired("Flags", Lines.Flags); + IO.mapRequired("RelocOffset", Lines.RelocOffset); + IO.mapRequired("RelocSegment", Lines.RelocSegment); + IO.mapRequired("Blocks", Lines.Blocks); +} + +void YAMLInlineeLinesSubsection::map(IO &IO) { + IO.mapTag("!InlineeLines", true); + IO.mapRequired("HasExtraFiles", InlineeLines.HasExtraFiles); + IO.mapRequired("Sites", InlineeLines.Sites); +} + +void MappingTraits<YAMLDebugSubsection>::mapping( + IO &IO, YAMLDebugSubsection &Subsection) { + if (!IO.outputting()) { + if (IO.mapTag("!FileChecksums")) { + auto SS = std::make_shared<YAMLChecksumsSubsection>(); + Subsection.Subsection = SS; + } else if (IO.mapTag("!Lines")) { + Subsection.Subsection = std::make_shared<YAMLLinesSubsection>(); + } else if (IO.mapTag("!InlineeLines")) { + Subsection.Subsection = std::make_shared<YAMLInlineeLinesSubsection>(); + } else { + llvm_unreachable("Unexpected subsection tag!"); + } + } + Subsection.Subsection->map(IO); +} + +static Expected<const YAMLChecksumsSubsection &> +findChecksums(ArrayRef<YAMLDebugSubsection> Subsections) { + for (const auto &SS : Subsections) { + if (SS.Subsection->Kind == DebugSubsectionKind::FileChecksums) { + return static_cast<const YAMLChecksumsSubsection &>(*SS.Subsection); + } + } + return make_error<CodeViewError>(cv_error_code::no_records); +} + +std::unique_ptr<DebugSubsection> YAMLChecksumsSubsection::toCodeViewSubsection( + DebugStringTableSubsection *UseStrings, + DebugChecksumsSubsection *UseChecksums) const { + assert(UseStrings && !UseChecksums); + auto Result = llvm::make_unique<DebugChecksumsSubsection>(*UseStrings); + 
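The mapping traits above get polymorphism out of plain YAML by probing tags: on input, IO.mapTag selects which concrete subsection to allocate before map(IO) fills its fields, while on output each map() declares its own tag. Roughly this shape, with illustrative names outside the YAML I/O framework:

#include <memory>
#include <string>

struct Subsection {
  virtual ~Subsection() = default;
  virtual void map() = 0; // fills fields once the concrete type is known
};
struct ChecksumsSS : Subsection { void map() override {} };
struct LinesSS     : Subsection { void map() override {} };

// Input-side dispatch: the node's tag chooses the concrete type first,
// then the generic mapping code calls map() on it.
static std::unique_ptr<Subsection> makeFromTag(const std::string &Tag) {
  if (Tag == "!FileChecksums")
    return std::make_unique<ChecksumsSS>();
  if (Tag == "!Lines")
    return std::make_unique<LinesSS>();
  return nullptr; // unknown tag: caller reports the error
}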
for (const auto &CS : Checksums) { + Result->addChecksum(CS.FileName, CS.Kind, CS.ChecksumBytes.Bytes); + } + return std::move(Result); +} + +std::unique_ptr<DebugSubsection> YAMLLinesSubsection::toCodeViewSubsection( + DebugStringTableSubsection *UseStrings, + DebugChecksumsSubsection *UseChecksums) const { + assert(UseStrings && UseChecksums); + auto Result = + llvm::make_unique<DebugLinesSubsection>(*UseChecksums, *UseStrings); + Result->setCodeSize(Lines.CodeSize); + Result->setRelocationAddress(Lines.RelocSegment, Lines.RelocOffset); + Result->setFlags(Lines.Flags); + for (const auto &LC : Lines.Blocks) { + Result->createBlock(LC.FileName); + if (Result->hasColumnInfo()) { + for (const auto &Item : zip(LC.Lines, LC.Columns)) { + auto &L = std::get<0>(Item); + auto &C = std::get<1>(Item); + uint32_t LE = L.LineStart + L.EndDelta; + Result->addLineAndColumnInfo(L.Offset, + LineInfo(L.LineStart, LE, L.IsStatement), + C.StartColumn, C.EndColumn); + } + } else { + for (const auto &L : LC.Lines) { + uint32_t LE = L.LineStart + L.EndDelta; + Result->addLineInfo(L.Offset, LineInfo(L.LineStart, LE, L.IsStatement)); + } + } + } + return llvm::cast<DebugSubsection>(std::move(Result)); +} + +std::unique_ptr<DebugSubsection> +YAMLInlineeLinesSubsection::toCodeViewSubsection( + DebugStringTableSubsection *UseStrings, + DebugChecksumsSubsection *UseChecksums) const { + assert(UseChecksums); + auto Result = llvm::make_unique<DebugInlineeLinesSubsection>( + *UseChecksums, InlineeLines.HasExtraFiles); + + for (const auto &Site : InlineeLines.Sites) { + Result->addInlineSite(TypeIndex(Site.Inlinee), Site.FileName, + Site.SourceLineNum); + if (!InlineeLines.HasExtraFiles) + continue; + + for (auto EF : Site.ExtraFiles) { + Result->addExtraFile(EF); + } + } + return llvm::cast<DebugSubsection>(std::move(Result)); +} + +static Expected<SourceFileChecksumEntry> +convertOneChecksum(const DebugStringTableSubsectionRef &Strings, + const FileChecksumEntry &CS) { + auto ExpectedString = Strings.getString(CS.FileNameOffset); + if (!ExpectedString) + return ExpectedString.takeError(); + + SourceFileChecksumEntry Result; + Result.ChecksumBytes.Bytes = CS.Checksum; + Result.Kind = CS.Kind; + Result.FileName = *ExpectedString; + return Result; +} + +static Expected<StringRef> +getFileName(const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, uint32_t FileID) { + auto Iter = Checksums.getArray().at(FileID); + if (Iter == Checksums.getArray().end()) + return make_error<CodeViewError>(cv_error_code::no_records); + uint32_t Offset = Iter->FileNameOffset; + return Strings.getString(Offset); +} + +Expected<std::shared_ptr<YAMLChecksumsSubsection>> +YAMLChecksumsSubsection::fromCodeViewSubsection( + const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &FC) { + auto Result = std::make_shared<YAMLChecksumsSubsection>(); + + for (const auto &CS : FC) { + auto ConvertedCS = convertOneChecksum(Strings, CS); + if (!ConvertedCS) + return ConvertedCS.takeError(); + Result->Checksums.push_back(*ConvertedCS); + } + return Result; +} + +Expected<std::shared_ptr<YAMLLinesSubsection>> +YAMLLinesSubsection::fromCodeViewSubsection( + const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, + const DebugLinesSubsectionRef &Lines) { + auto Result = std::make_shared<YAMLLinesSubsection>(); + Result->Lines.CodeSize = Lines.header()->CodeSize; + Result->Lines.RelocOffset = Lines.header()->RelocOffset; + Result->Lines.RelocSegment = 
Lines.header()->RelocSegment; + Result->Lines.Flags = static_cast<LineFlags>(uint16_t(Lines.header()->Flags)); + for (const auto &L : Lines) { + SourceLineBlock Block; + auto EF = getFileName(Strings, Checksums, L.NameIndex); + if (!EF) + return EF.takeError(); + Block.FileName = *EF; + if (Lines.hasColumnInfo()) { + for (const auto &C : L.Columns) { + SourceColumnEntry SCE; + SCE.EndColumn = C.EndColumn; + SCE.StartColumn = C.StartColumn; + Block.Columns.push_back(SCE); + } + } + for (const auto &LN : L.LineNumbers) { + SourceLineEntry SLE; + LineInfo LI(LN.Flags); + SLE.Offset = LN.Offset; + SLE.LineStart = LI.getStartLine(); + SLE.EndDelta = LI.getLineDelta(); + SLE.IsStatement = LI.isStatement(); + Block.Lines.push_back(SLE); + } + Result->Lines.Blocks.push_back(Block); + } + return Result; +} + +Expected<std::shared_ptr<YAMLInlineeLinesSubsection>> +YAMLInlineeLinesSubsection::fromCodeViewSubsection( + const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, + const DebugInlineeLinesSubsectionRef &Lines) { + auto Result = std::make_shared<YAMLInlineeLinesSubsection>(); + + Result->InlineeLines.HasExtraFiles = Lines.hasExtraFiles(); + for (const auto &IL : Lines) { + InlineeSite Site; + auto ExpF = getFileName(Strings, Checksums, IL.Header->FileID); + if (!ExpF) + return ExpF.takeError(); + Site.FileName = *ExpF; + Site.Inlinee = IL.Header->Inlinee.getIndex(); + Site.SourceLineNum = IL.Header->SourceLineNum; + if (Lines.hasExtraFiles()) { + for (const auto EF : IL.ExtraFiles) { + auto ExpF2 = getFileName(Strings, Checksums, EF); + if (!ExpF2) + return ExpF2.takeError(); + Site.ExtraFiles.push_back(*ExpF2); + } + } + Result->InlineeLines.Sites.push_back(Site); + } + return Result; +} + +Expected<std::vector<std::unique_ptr<DebugSubsection>>> +llvm::CodeViewYAML::convertSubsectionList( + ArrayRef<YAMLDebugSubsection> Subsections, + DebugStringTableSubsection &Strings) { + std::vector<std::unique_ptr<DebugSubsection>> Result; + if (Subsections.empty()) + return std::move(Result); + + auto Checksums = findChecksums(Subsections); + if (!Checksums) + return Checksums.takeError(); + auto ChecksumsBase = Checksums->toCodeViewSubsection(&Strings, nullptr); + DebugChecksumsSubsection &CS = + llvm::cast<DebugChecksumsSubsection>(*ChecksumsBase); + for (const auto &SS : Subsections) { + // We've already converted the checksums subsection, don't do it + // twice. 
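convertSubsectionList below has an ordering constraint worth spelling out: the checksums subsection is converted first because lines and inlinee-lines subsections hold offsets into the checksums table, and the already-converted object is reused when the loop reaches the checksums entry itself. A reduced model of that flow (stand-in types, not the CodeView classes):

#include <cassert>
#include <memory>
#include <vector>

struct Converted { virtual ~Converted() = default; };
struct ConvertedChecksums : Converted {};

struct YamlSubsection {
  bool IsChecksums = false;
  std::unique_ptr<ConvertedChecksums> convertChecksums() const {
    return std::make_unique<ConvertedChecksums>();
  }
  // Every other kind needs the shared checksums table while converting.
  std::unique_ptr<Converted> convert(ConvertedChecksums &CS) const {
    (void)CS;
    return std::make_unique<Converted>();
  }
};

static std::vector<std::unique_ptr<Converted>>
convertAll(const std::vector<YamlSubsection> &SS) {
  const YamlSubsection *CSYaml = nullptr;
  for (const auto &S : SS)
    if (S.IsChecksums) { CSYaml = &S; break; }
  assert(CSYaml && "a checksums subsection is a hard requirement");

  // Convert the shared dependency once, up front.
  std::unique_ptr<ConvertedChecksums> CS = CSYaml->convertChecksums();
  ConvertedChecksums &CSRef = *CS; // stays valid after the move below

  std::vector<std::unique_ptr<Converted>> Out;
  for (const auto &S : SS) {
    if (S.IsChecksums)
      Out.push_back(std::move(CS)); // reuse, never convert twice
    else
      Out.push_back(S.convert(CSRef));
  }
  return Out;
}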
+ std::unique_ptr<DebugSubsection> CVS; + if (SS.Subsection->Kind == DebugSubsectionKind::FileChecksums) + CVS = std::move(ChecksumsBase); + else + CVS = SS.Subsection->toCodeViewSubsection(&Strings, &CS); + Result.push_back(std::move(CVS)); + } + return std::move(Result); +} + +namespace { +struct SubsectionConversionVisitor : public DebugSubsectionVisitor { + explicit SubsectionConversionVisitor( + const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums) + : Strings(Strings), Checksums(Checksums) {} + + Error visitUnknown(DebugUnknownSubsectionRef &Unknown) override; + Error visitLines(DebugLinesSubsectionRef &Lines) override; + Error visitFileChecksums(DebugChecksumsSubsectionRef &Checksums) override; + Error visitInlineeLines(DebugInlineeLinesSubsectionRef &Inlinees) override; + + YAMLDebugSubsection Subsection; + +private: + const DebugStringTableSubsectionRef &Strings; + const DebugChecksumsSubsectionRef &Checksums; +}; + +Error SubsectionConversionVisitor::visitUnknown( + DebugUnknownSubsectionRef &Unknown) { + return make_error<CodeViewError>(cv_error_code::operation_unsupported); +} + +Error SubsectionConversionVisitor::visitLines(DebugLinesSubsectionRef &Lines) { + auto Result = + YAMLLinesSubsection::fromCodeViewSubsection(Strings, Checksums, Lines); + if (!Result) + return Result.takeError(); + Subsection.Subsection = *Result; + return Error::success(); +} + +Error SubsectionConversionVisitor::visitFileChecksums( + DebugChecksumsSubsectionRef &Checksums) { + auto Result = + YAMLChecksumsSubsection::fromCodeViewSubsection(Strings, Checksums); + if (!Result) + return Result.takeError(); + Subsection.Subsection = *Result; + return Error::success(); +} + +Error SubsectionConversionVisitor::visitInlineeLines( + DebugInlineeLinesSubsectionRef &Inlinees) { + auto Result = YAMLInlineeLinesSubsection::fromCodeViewSubsection( + Strings, Checksums, Inlinees); + if (!Result) + return Result.takeError(); + Subsection.Subsection = *Result; + return Error::success(); +} +} + +Expected<YAMLDebugSubsection> YAMLDebugSubsection::fromCodeViewSubection( + const DebugStringTableSubsectionRef &Strings, + const DebugChecksumsSubsectionRef &Checksums, + const DebugSubsectionRecord &SS) { + SubsectionConversionVisitor V(Strings, Checksums); + if (auto EC = visitDebugSubsection(SS, V)) + return std::move(EC); + + return V.Subsection; } diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 6e8bb5c7372c..bd97af3a9323 100644 --- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -148,7 +148,8 @@ struct SymbolRecordBase { virtual ~SymbolRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual codeview::CVSymbol - toCodeViewSymbol(BumpPtrAllocator &Allocator) const = 0; + toCodeViewSymbol(BumpPtrAllocator &Allocator, + CodeViewContainer Container) const = 0; virtual Error fromCodeViewSymbol(codeview::CVSymbol Type) = 0; }; @@ -159,8 +160,9 @@ template <typename T> struct SymbolRecordImpl : public SymbolRecordBase { void map(yaml::IO &io) override; codeview::CVSymbol - toCodeViewSymbol(BumpPtrAllocator &Allocator) const override { - return SymbolSerializer::writeOneSymbol(Symbol, Allocator); + toCodeViewSymbol(BumpPtrAllocator &Allocator, + CodeViewContainer Container) const override { + return SymbolSerializer::writeOneSymbol(Symbol, Allocator, Container); } Error fromCodeViewSymbol(codeview::CVSymbol CVS) override { return SymbolDeserializer::deserializeAs<T>(CVS, Symbol); @@ -429,8 
+431,8 @@ template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) { } CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( - BumpPtrAllocator &Allocator) const { - return Symbol->toCodeViewSymbol(Allocator); + BumpPtrAllocator &Allocator, CodeViewContainer Container) const { + return Symbol->toCodeViewSymbol(Allocator, Container); } namespace llvm { diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index eb81e58b9b0e..17c60348633c 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -310,6 +310,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Catch trivial redundancies FPM.addPass(EarlyCSEPass()); + // Hoisting of scalars and load expressions. + if (EnableGVNHoist) + FPM.addPass(GVNHoistPass()); + // Speculative execution if the target has divergent branches; otherwise nop. FPM.addPass(SpeculativeExecutionPass()); @@ -473,8 +477,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); - if (EnableGVNHoist) - EarlyFPM.addPass(GVNHoistPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); // Interprocedural constant propagation now that basic cleanup has occured diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 318e21da999d..f7b7ad89e959 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -649,12 +649,10 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::tce: case Triple::tcele: case Triple::thumbeb: - case Triple::xcore: - return Triple::ELF; - case Triple::wasm32: case Triple::wasm64: - return Triple::Wasm; + case Triple::xcore: + return Triple::ELF; case Triple::ppc: case Triple::ppc64: diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h index 4f656f94ea12..b99c1d1d6b3e 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.h +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -1,4 +1,4 @@ -//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===// +//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -15,6 +15,8 @@ namespace llvm { +class TargetRegisterInfo; + /// Add the accumulator chaining constraint to a PBQP graph class A57ChainingConstraint : public PBQPRAConstraint { public: @@ -33,6 +35,7 @@ private: // Add constraints between existing chains void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); }; -} + +} // end namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index d098cf7a5a37..7402bcf1346c 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -56,12 +56,14 @@ def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; } def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ 
let Latency = 0; } def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } +def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; } def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } @@ -76,6 +78,7 @@ def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } +def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; } def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } @@ -83,6 +86,10 @@ def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } //===----------------------------------------------------------------------===// // Define 2 micro-op types +def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 0; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 1; let NumMicroOps = 2; @@ -476,17 +483,19 @@ def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr // SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast // ----------------------------------------------------------------------------- def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; -def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || +def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + MI->getOperand(1).getReg() == AArch64::XZR}]>; def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; def FalkorWr_FMOV : SchedWriteVariant<[ - SchedVar<FalkorFMOVZrReg, [FalkorWr_1none_0cyc]>, + SchedVar<FalkorOp1ZrReg, [FalkorWr_1none_0cyc]>, SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; def FalkorWr_MOVZ : SchedWriteVariant<[ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>; + SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd + def FalkorWr_ADDSUBsx : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>, @@ -500,6 +509,10 @@ def FalkorWr_LDRSro : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>, SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; +def FalkorWr_ORRi : SchedWriteVariant<[ + SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>; + def FalkorWr_PRFMro : SchedWriteVariant<[ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>, SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; @@ -810,7 +823,8 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex 
"^ORR(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; @@ -825,7 +839,7 @@ def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; @@ -849,7 +863,7 @@ def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; +def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; @@ -1036,13 +1050,13 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFM // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>; +def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd // FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr -def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; +def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; @@ -1107,11 +1121,12 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], // Move and Shift Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>; -def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; -def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc], 
(instrs MOVi32imm, MOVi64imm)>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation) def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 78ff3bbe3d1a..55d18c3f3646 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -55,6 +55,8 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass(); void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); extern char &AMDGPUMachineCFGStructurizerID; +void initializeAMDGPUAlwaysInlinePass(PassRegistry&); + ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index e7ebb37a9d62..b50e8d1d659e 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -365,6 +365,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "Force to generate flat instruction for global" >; +def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < + "auto-waitcnt-before-barrier", + "AutoWaitcntBeforeBarrier", + "true", + "Hardware automatically inserts waitcnt before barrier" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 1d03714874e2..8084d368c80f 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -22,18 +22,22 @@ using namespace llvm; namespace { class AMDGPUAlwaysInline : public ModulePass { - static char ID; - bool GlobalOpt; public: - AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { } + static char ID; + + AMDGPUAlwaysInline(bool GlobalOpt = false) : + ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } }; } // End anonymous namespace +INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", + "AMDGPU Inline All Functions", false, false) + char AMDGPUAlwaysInline::ID = 0; bool AMDGPUAlwaysInline::runOnModule(Module &M) { diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 57905be18813..267f4807a788 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -28,11 +28,16 @@ using namespace llvm; AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; + const LLT S1= LLT::scalar(1); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); + // FIXME: i1 operands to intrinsics should always be legal, but other i1 + // values may not be legal. We need to figure out how to distinguish + // between these two scenarios. 
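+ // For now, treat s1 constants as legal so that boolean intrinsic
+ // operands can be selected.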
+ setAction({G_CONSTANT, S1}, Legal); setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 6e301b4ad527..8d157e2f98f2 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -91,6 +91,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FPExceptions(false), DX10Clamp(false), FlatForGlobal(false), + AutoWaitcntBeforeBarrier(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 0582ce95693a..ed9cbb994fad 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -110,6 +110,7 @@ protected: bool FPExceptions; bool DX10Clamp; bool FlatForGlobal; + bool AutoWaitcntBeforeBarrier; bool UnalignedScratchAccess; bool UnalignedBufferAccess; bool HasApertureRegs; @@ -195,7 +196,8 @@ public: } bool isOpenCLEnv() const { - return TargetTriple.getEnvironment() == Triple::OpenCL; + return TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getEnvironmentName() == "amdgizcl"; } Generation getGeneration() const { @@ -363,6 +365,10 @@ public: return FlatForGlobal; } + bool hasAutoWaitcntBeforeBarrier() const { + return AutoWaitcntBeforeBarrier; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } @@ -727,12 +733,6 @@ public: /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// \returns True if waitcnt instruction is needed before barrier instruction, - /// false otherwise. - bool needWaitcntBeforeBarrier() const { - return true; - } - /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. 
bool flatScratchIsPointer() const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 596f02ae4a64..404598ff4738 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -116,7 +116,7 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, static cl::opt<bool> EnableSIInsertWaitcntsPass( "enable-si-insert-waitcnts", cl::desc("Use new waitcnt insertion pass"), - cl::init(false)); + cl::init(true)); // Option to run late CFG structurizer static cl::opt<bool> LateCFGStructurize( @@ -139,6 +139,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index f13629a3185f..dfac068d1f69 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -35,9 +35,12 @@ struct FoldCandidate { }; unsigned char UseOpNo; MachineOperand::MachineOperandType Kind; + bool Commuted; - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) { + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, + bool Commuted_ = false) : + UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()), + Commuted(Commuted_) { if (FoldOp->isImm()) { ImmToFold = FoldOp->getImm(); } else if (FoldOp->isFI()) { @@ -59,6 +62,10 @@ struct FoldCandidate { bool isReg() const { return Kind == MachineOperand::MO_Register; } + + bool isCommuted() const { + return Commuted; + } }; class SIFoldOperands : public MachineFunctionPass { @@ -237,8 +244,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { + TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true)); + return true; } FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); @@ -699,6 +711,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); tryFoldInst(TII, Fold.UseMI); + } else if (Fold.isCommuted()) { + // Restoring instruction's original operand order if fold has failed. 
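+ // The candidate was created by commuting in tryAddToFoldList, so undo
+ // that commute here.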
+ TII->commuteInstruction(*Fold.UseMI, false); } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 76c2644867aa..b48b23911105 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3571,7 +3571,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - isMemOpHasNoClobberedMemOperand(Load)) + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index e22166d03e9a..c10badba88f3 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1009,7 +1009,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // occurs before the instruction. Doing it here prevents any additional // S_WAITCNTs from being emitted if the instruction was marked as // requiring a WAITCNT beforehand. - if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) { + if (MI.getOpcode() == AMDGPU::S_BARRIER && + !ST->hasAutoWaitcntBeforeBarrier()) { EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); EmitSwaitcnt |= ScoreBrackets->updateByWait( diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 9f32ecfa52ff..bc86515d8b1f 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -630,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if ((I->getOpcode() == AMDGPU::S_BARRIER && - ST->needWaitcntBeforeBarrier()) || + !ST->hasAutoWaitcntBeforeBarrier()) || I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT) Required = LastIssued; diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 5b840a14dbc3..73dd8b7daa4e 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -229,6 +229,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && + !Ld->isVolatile() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 001fc960b228..77fc9551cff9 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -245,9 +245,10 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; let SubtargetPredicate = Has16BitInsts in { +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; + let isCommutable = 1 in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; def V_FMA_F16 : VOP3Inst <"v_fma_f16", 
VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 46fd1f70ee99..ca68f5d42c32 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -205,6 +205,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; +/// Disable +1 predication cost for instructions updating CPSR. +/// Enabled for Cortex-A57. +def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", + "CheapPredicableCPSRDef", + "true", + "Disable +1 predication cost for instructions updating CPSR">; + def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", "AvoidMOVsShifterOperand", "true", "Avoid movs instructions with shifter operand">; @@ -788,12 +795,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 5c9d589e2625..f8b65573f9cd 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -558,13 +558,68 @@ bool ARMBaseInstrInfo::DefinesPredicate( return Found; } -static bool isCPSRDefined(const MachineInstr *MI) { - for (const auto &MO : MI->operands()) +bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) { + for (const auto &MO : MI.operands()) if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) return true; return false; } +bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Offset = MI.getOperand(Op + 1); + return Offset.getReg() != 0; +} + +// Load with negative register offset requires additional 1cyc and +I unit +// for Cortex A57 +bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Offset = MI.getOperand(Op + 1); + const MachineOperand &Opc = MI.getOperand(Op + 2); + assert(Opc.isImm()); + assert(Offset.isReg()); + int64_t OpcImm = Opc.getImm(); + + bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub; + return (isSub && Offset.getReg() != 0); +} + +bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Opc = MI.getOperand(Op + 2); + unsigned OffImm = Opc.getImm(); + return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift; +} + +// Load, scaled register offset, not plus LSL2 +bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, + unsigned Op) const { + const MachineOperand &Opc = MI.getOperand(Op + 2); + unsigned OffImm = Opc.getImm(); + + bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm); + if (ShiftOpc == ARM_AM::no_shift) return false; // not scaled + bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2); + return !SimpleScaled; +} + +// Minus 
reg for ldstso addr mode +bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI, + unsigned Op) const { + unsigned OffImm = MI.getOperand(Op + 2).getImm(); + return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; +} + +// Load, scaled register offset +bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI, + unsigned Op) const { + unsigned OffImm = MI.getOperand(Op + 2).getImm(); + return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift; +} + static bool isEligibleForITBlock(const MachineInstr *MI) { switch (MI->getOpcode()) { default: return true; @@ -590,7 +645,7 @@ static bool isEligibleForITBlock(const MachineInstr *MI) { case ARM::tSUBi3: // SUB (immediate) T1 case ARM::tSUBi8: // SUB (immediate) T2 case ARM::tSUBrr: // SUB (register) T1 - return !isCPSRDefined(MI); + return !ARMBaseInstrInfo::isCPSRDefined(*MI); } } @@ -3349,6 +3404,22 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, return DefCycle; } +bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const { + unsigned BaseReg = MI.getOperand(0).getReg(); + for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) { + const auto &Op = MI.getOperand(i); + if (Op.isReg() && Op.getReg() == BaseReg) + return true; + } + return false; +} +unsigned +ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const { + // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops + // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops) + return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands(); +} + int ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData, const MCInstrDesc &DefMCID, @@ -4119,7 +4190,8 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const { const MCInstrDesc &MCID = MI.getDesc(); - if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) { + if (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) && + !Subtarget.cheapPredicableCPSRDef())) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. return 1; @@ -4148,7 +4220,8 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, } const MCInstrDesc &MCID = MI.getDesc(); - if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) { + if (PredCost && (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) && + !Subtarget.cheapPredicableCPSRDef()))) { // When predicated, CPSR is an additional source operand for CPSR updating // instructions, this apparently increases their latencies. 
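// On cores with FeatureCheapPredicableCPSR (e.g. Cortex-A57), the cheapPredicableCPSRDef() check above now skips this extra cycle.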
*PredCost = 1; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index dd7fe871345a..c52e572786d4 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -159,6 +159,24 @@ public: bool isPredicable(const MachineInstr &MI) const override; + // CPSR defined in instruction + static bool isCPSRDefined(const MachineInstr &MI); + bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const; + bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const; + + // Load, scaled register offset + bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const; + // Load, scaled register offset, not plus LSL2 + bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const; + // Minus reg for ldstso addr mode + bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const; + // Scaled register offset in address mode 2 + bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const; + // Load multiple, base reg in list + bool isLDMBaseRegInList(const MachineInstr &MI) const; + // get LDM variable defs size + unsigned getLDMVariableDefsSize(const MachineInstr &MI) const; + /// GetInstSize - Returns the size of the specified MachineInstr. /// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 31a2f499a9a7..a33d025d114e 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -34,7 +34,7 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { - if (T->isArrayTy()) + if (T->isArrayTy() || T->isStructTy()) return true; EVT VT = TLI.getValueType(DL, T, true); @@ -167,8 +167,11 @@ void ARMCallLowering::splitToValueTypes( if (SplitVTs.size() == 1) { // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). 
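// The replacement below additionally records the original ABI type alignment on the argument flags via setOrigAlign.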
- SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + auto Flags = OrigArg.Flags; + unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty); + Flags.setOrigAlign(OriginalAlignment); + SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags, + OrigArg.IsFixed); return; } @@ -177,6 +180,10 @@ void ARMCallLowering::splitToValueTypes( EVT SplitVT = SplitVTs[i]; Type *SplitTy = SplitVT.getTypeForEVT(Ctx); auto Flags = OrigArg.Flags; + + unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy); + Flags.setOrigAlign(OriginalAlignment); + bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( SplitTy, F->getCallingConv(), F->isVarArg()); @@ -185,6 +192,7 @@ void ARMCallLowering::splitToValueTypes( if (i == e - 1) Flags.setInConsecutiveRegsLast(); } + SplitArgs.push_back( ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), SplitTy, Flags, OrigArg.IsFixed}); diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index ec5b97cba8cd..1c7902520f2d 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -147,6 +147,9 @@ def : PredicateProlog<[{ const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo()); (void)TII; + const ARMSubtarget *STI = + static_cast<const ARMSubtarget*>(SchedModel->getSubtargetInfo()); + (void)STI; }]>; def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>; @@ -420,3 +423,4 @@ include "ARMScheduleA8.td" include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" +include "ARMScheduleA57.td" diff --git a/lib/Target/ARM/ARMScheduleA57.td b/lib/Target/ARM/ARMScheduleA57.td new file mode 100644 index 000000000000..525079d12d51 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleA57.td @@ -0,0 +1,1471 @@ +//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for ARM Cortex-A57 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// *** Common description and scheduling model parameters taken from AArch64 *** +// The Cortex-A57 is a traditional superscalar microprocessor with a +// conservative 3-wide in-order stage for decode and dispatch. Combined with the +// much wider out-of-order issue stage, this produced a need to carefully +// schedule micro-ops so that all three decoded each cycle are successfully +// issued as the reservation station(s) simply don't stay occupied for long. +// Therefore, IssueWidth is set to the narrower of the two at three, while still +// modeling the machine as out-of-order. + +def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>; +def IsCPSRDefinedAndPredicatedPred : + SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>; + +// Cortex A57 rev. 
r1p0 or later (false = r0px) +def IsR1P0AndLaterPred : SchedPredicate<[{false}]>; + +// If Addrmode3 contains register offset (not immediate) +def IsLdrAm3RegOffPred : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>; +// The same predicate with operand offset 2 and 3: +def IsLdrAm3RegOffPredX2 : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>; +def IsLdrAm3RegOffPredX3 : + SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>; + +// If Addrmode3 contains "minus register" +def IsLdrAm3NegRegOffPred : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>; +// The same predicate with operand offset 2 and 3: +def IsLdrAm3NegRegOffPredX2 : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>; +def IsLdrAm3NegRegOffPredX3 : + SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>; + +// Load, scaled register offset, not plus LSL2 +def IsLdstsoScaledNotOptimalPredX0 : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>; +def IsLdstsoScaledNotOptimalPred : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>; +def IsLdstsoScaledNotOptimalPredX2 : + SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>; + +// Load, scaled register offset +def IsLdstsoScaledPred : + SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>; +def IsLdstsoScaledPredX2 : + SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>; + +def IsLdstsoMinusRegPredX0 : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>; +def IsLdstsoMinusRegPred : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>; +def IsLdstsoMinusRegPredX2 : + SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>; + +// Load, scaled register offset +def IsLdrAm2ScaledPred : + SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>; + +// LDM, base reg in list +def IsLdmBaseRegInList : + SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>; + +class A57WriteLMOpsListType<list<SchedWriteRes> writes> { + list <SchedWriteRes> Writes = writes; + SchedMachineModel SchedModel = ?; +} + +// *** Common description and scheduling model parameters taken from AArch64 *** +// (AArch64SchedA57.td) +def CortexA57Model : SchedMachineModel { + let IssueWidth = 3; // 3-way decode and dispatch + let MicroOpBufferSize = 128; // 128 micro-op re-order buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 16; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. + let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cortex-A57. +// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where +// micro-ops wait for their operands and then issue out-of-order. + +def A57UnitB : ProcResource<1>; // Type B micro-ops +def A57UnitI : ProcResource<2>; // Type I micro-ops +def A57UnitM : ProcResource<1>; // Type M micro-ops +def A57UnitL : ProcResource<1>; // Type L micro-ops +def A57UnitS : ProcResource<1>; // Type S micro-ops + +def A57UnitX : ProcResource<1>; // Type X micro-ops (F1) +def A57UnitW : ProcResource<1>; // Type W micro-ops (F0) + +let SchedModel = CortexA57Model in { + def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops +} + +let SchedModel = CortexA57Model in { + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Cortex-A57. 
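+// The A57Write_* write/resource types used throughout this file are defined
+// in the included ARMScheduleA57WriteRes.td.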
+
+include "ARMScheduleA57WriteRes.td"
+
+// To keep "CompleteModel = 1", pseudos and special instructions are covered
+// below with WriteNoop.
+def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
+ "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
+ "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
+ "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
+ "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+
+def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
+
+// Specific memory instrs
+def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
+ "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
+
+// Coprocessor moves
+def : InstRW<[WriteNoop, WriteNoop], (instregex
+ "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
+ "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
+ "(t2)?MSR(banked|i|_AR|_M)?$")>;
+
+// Deprecated instructions
+def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
+
+// Pseudos
+def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
+ "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
+ "tLDRpci_pic", "t2SUBS_PC_LR",
+ "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
+ "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "WIN__CHKSTK", "WIN__DBZCHK")>;
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
+
+// --- 3.2 Branch Instructions ---
+// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
+
+def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
+ "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
+def : InstRW<[A57Write_1cyc_1B_1I],
+ (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
+// Pseudos
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
+def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
+ "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
+def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
+
+// --- 3.3 Arithmetic and Logical Instructions ---
+// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
+// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
+
+def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
+
+// Shift by register, conditional or unconditional
+// TODO: according to the doc, conditional uses I0/I1, unconditional uses M.
+// Why does the more complex instruction use the simpler pipeline?
+// This may be an error in the doc.
+def A57WriteALUsi : SchedWriteVariant<[
+ // lsl #2, lsl #1, or lsr #1.
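+ // (Both variants currently map to the same A57Write_2cyc_1M; see the TODO above.)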
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUSsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57ReadALUsr : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>
+]>;
+def : SchedAlias<WriteALUsi, A57WriteALUsi>;
+def : SchedAlias<WriteALUsr, A57WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
+def : SchedAlias<ReadALUsr, A57ReadALUsr>;
+
+def A57WriteCMPsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : SchedAlias<WriteCMP, A57Write_1cyc_1I>;
+def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
+def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
+
+// --- 3.4 Move and Shift Instructions ---
+// Move, basic
+// MOV{S}, MOVW, MVN{S}
+def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
+ "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
+ "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
+
+// Move, shift by immed, setflags/no setflags
+// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
+// setflags = isCPSRDefined
+def A57WriteMOVsi : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
+ "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
+ "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
+
+// Shift by register, conditional or unconditional, setflags/no setflags
+def A57WriteMOVsr : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
+ "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
+ "(t2|t)RORrr")>;
+
+// Move, top
+// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
+def A57WriteMOVT : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
+
+def A57WriteI2pc :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
+def A57WriteI2ld :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
+def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+// +2cyc for branch forms
+def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
+
+// --- 3.5 Divide and Multiply Instructions ---
+// Divide: SDIV, UDIV
+// latency from documentation: 4-20, maximum taken
+def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
+// Multiply: tMUL not bound to common WriteRes types
+def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
+def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
+def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
+def : ReadAdvance<ReadMUL, 0>;
+
+// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
+// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
+// Multiply-accumulate pipelines support late-forwarding of accumulate operands
+// from similar μops, allowing a typical sequence
of multiply-accumulate μops +// to issue one every 1 cycle (sched advance = 2). +def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; } +def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>; + +def : SchedAlias<WriteMAC16, A57WriteMLA>; +def : SchedAlias<WriteMAC32, A57WriteMLA>; +def : SchedAlias<ReadMAC, A57ReadMLA>; + +def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>; +def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>; + +// Multiply long: SMULL, UMULL +def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>; +def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>; + +// --- 3.6 Saturating and Parallel Arithmetic Instructions --- +// Parallel arith +// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8 +// Conditional GE-setting instructions require three extra μops +// and two additional cycles to conditionally update the GE field. +def A57WriteParArith : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1I_1M]> +]>; +def : InstRW< [A57WriteParArith], (instregex + "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)", + "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>; + +// Parallel arith with exchange: SASX, SSAX, UASX, USAX +def A57WriteParArithExch : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>, + SchedVar<NoSchedPred, [A57Write_3cyc_1I_1M]> +]>; +def : InstRW<[A57WriteParArithExch], + (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>; + +// Parallel halving arith +// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8 +def : InstRW<[A57Write_2cyc_1M], (instregex + "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)", + "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>; + +// Parallel halving arith with exchange +// SHASX, SHSAX, UHASX, UHSAX +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX", + "(t2)?UHASX", "(t2)?UHSAX")>; + +// Parallel saturating arith +// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8 +def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)", + "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>; + +// Parallel saturating arith with exchange +// QASX, QSAX, UQASX, UQSAX +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX", + "(t2)?UQASX", "(t2)?UQSAX")>; + +// Saturate: SSAT, SSAT16, USAT, USAT16 +def : InstRW<[A57Write_2cyc_1M], + (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>; + +// Saturating arith: QADD, QSUB +def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>; + +// Saturating doubling arith: QDADD, QDSUB +def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>; + +// --- 3.7 Miscellaneous Data-Processing Instructions --- +// Bit field extract: SBFX, UBFX +def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>; + +// Bit field insert/clear: BFI, BFC +def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>; + +// Select bytes, conditional/unconditional +def A57WriteSEL : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1I]> +]>; +def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>; + +// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH +def : InstRW<[A57Write_1cyc_1I], + (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>; + +// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH +def : InstRW<[A57Write_2cyc_1M], + (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>; + +// Sign/zero extend and add, parallel: 
SXTAB16, UXTAB16 +def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>; + +// Sum of absolute differences: USAD8, USADA8 +def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>; + +// --- 3.8 Load Instructions --- + +// Load, immed offset +// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate +def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12", + "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)", + "PICLDR", "tLDR")>; + +def : InstRW<[A57Write_4cyc_1L], + (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>; + +// For "Load, register offset, minus" we need +1cyc, +1I +def A57WriteLdrAm3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>; +def A57WriteLdrAm3X2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>; +def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>; + +def A57WriteLdrAmLDSTSO : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>, + SchedVar<IsLdstsoMinusRegPred, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>; + +def A57WrBackOne : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def A57WrBackTwo : SchedWriteRes<[]> { + let Latency = 2; + let NumMicroOps = 0; +} +def A57WrBackThree : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} + +// --- LDR pre-indexed --- +// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update) +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM", + "LDRB_PRE_IMM", "t2LDRB_PRE")>; + +// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update) +// (5 cyc load result for not-lsl2 scaled) +def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo], + (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>; + +def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack], + (instregex "LDR(H|SH|SB)_PRE")>; +def : InstRW<[A57Write_4cyc_1L, A57WrBackOne], + (instregex "t2LDR(H|SH|SB)?_PRE")>; + +// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm. 
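+// (Notation: load-result latency first, base-register writeback latency in parentheses.)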
+def A57WriteLdrDAm3Pre : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack], + (instregex "LDRD_PRE")>; +def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne], + (instregex "t2LDRD_PRE")>; + +// --- LDR post-indexed --- +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM", + "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>; + +def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack], + (instregex "LDR(H|SH|SB)_POST")>; +def : InstRW<[A57Write_4cyc_1L, A57WrBackOne], + (instregex "t2LDR(H|SH|SB)?_POST")>; + +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG", + "LDRB_POST_REG", "LDR(B?)T_POST$")>; + +def A57WriteLdrTRegPost : SchedWriteVariant<[ + SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> +]>; +def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[ + SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>, + SchedVar<NoSchedPred, [A57WrBackTwo]> +]>; +// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L" +def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack], + (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>; + +def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>; + +def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm. 
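+// (Writeback is 2 cycles only for the register-offset form; the immediate form writes back in 1 cycle.)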
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>; +def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne], + (instregex "t2LDRD_POST")>; + +// --- Preload instructions --- +// Preload, immed offset +def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12", + "t2PLDW?(i8|pci|s)", "(t2)?PLI")>; + +// Preload, register offset, +// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2 +// otherwise 4cyc "L" +def A57WritePLD : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>, + SchedVar<IsLdstsoMinusRegPredX0, [A57Write_5cyc_1I_1L]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1L]> +]>; +def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>; + +// --- Load multiple instructions --- +foreach NumAddr = 1-8 in { + def A57LMAddrPred#NumAddr : + SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>; +} + +def A57LDMOpsListNoregin : A57WriteLMOpsListType< + [A57Write_3cyc_1L, A57Write_3cyc_1L, + A57Write_4cyc_1L, A57Write_4cyc_1L, + A57Write_5cyc_1L, A57Write_5cyc_1L, + A57Write_6cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_7cyc_1L, + A57Write_8cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_9cyc_1L, + A57Write_10cyc_1L, A57Write_10cyc_1L]>; +def A57WriteLDMnoreginlist : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>, + SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]> +]> { let Variadic=1; } + +def A57LDMOpsListRegin : A57WriteLMOpsListType< + [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>; +def A57WriteLDMreginlist : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>, + SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]> +]> { let Variadic=1; } + +def A57LDMOpsList_Upd : A57WriteLMOpsListType< + [A57WrBackOne, + A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I, + A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, + A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>; +def A57WriteLDM_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>, + SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>, + 
SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>, + SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>, + SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>, + SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>, + SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>, + SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>, + SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]> +]> { let Variadic=1; } + +def A57WriteLDM : SchedWriteVariant<[ + SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>, + SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]> +]> { let Variadic=1; } + +def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>; + +// TODO: no writeback latency defined in documentation (implemented as 1 cyc) +def : InstRW<[A57WriteLDM_Upd], + (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>; + +// --- 3.9 Store Instructions --- + +// Store, immed offset +def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR", + "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>; + +// Store, register offset +// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S", +// otherwise 1cyc S. +def A57WriteStrAmLDSTSO : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoMinusRegPred, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>; + +// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg. +def A57WriteStrAm3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>; +def A57WriteStrAm3X2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S]> +]>; +def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>; + +// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback) +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM", + "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)", + "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>; + +// Store, register pre-indexed: +// 1(1) "S, I0/I1" for plus reg +// 3(2) "I0/I1, S" for minus reg +// 1(2) "S, M" for scaled plus lsl2 +// 3(2) "I0/I1, S" for other scaled +def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoMinusRegPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<IsLdstsoScaledPredX2, [A57Write_1cyc_1S_1M]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[ + SchedVar<IsLdstsoScaledPredX2, [A57WrBackTwo]>, + SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre], + (instregex "STR_PRE_REG", "STRB_PRE_REG")>; + +// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE) +// 1(1) "S, I0/I1" for imm or reg plus +// 3(2) "I0/I1, S" for reg minus +def A57WriteStrAm3PreX2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2], + (instregex "STRH_PRE")>; + +def 
A57WriteStrAm3PreX3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>, + SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]> +]>; +def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[ + SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>, + SchedVar<NoSchedPred, [A57WrBackOne]> +]>; +def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3], + (instregex "STRD_PRE")>; + +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM", + "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>; + +// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not) +def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG", + "STRB(T?)_POST_REG", "STR(B?)T_POST$")>; + +// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr +// 1(1) "S, I0/I1" both for reg or imm +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], + (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>; + +// --- Store multiple instructions --- +// TODO: no writeback latency defined in documentation +def A57WriteSTM : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S]> +]>; +def A57WriteSTM_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]> +]>; + +def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>; +def : InstRW<[A57WrBackOne, A57WriteSTM_Upd], + (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>; + +// --- 3.10 FP Data Processing Instructions --- +def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>; +def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>; + +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>; + +// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional +def A57WriteVcmp : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>, + SchedVar<NoSchedPred, [A57Write_3cyc_1X]> +]>; +def : InstRW<[A57WriteVcmp], + (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>; + +// fp convert +def : InstRW<[A57Write_5cyc_1V], (instregex + "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>; + +def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>; + +// FP round to integral +def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>; + +// FP divide, FP square root +def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>; +def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>; +def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>; +def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>; + +// FP max/min +def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>; + +// FP multiply-accumulate pipelines support late forwarding of the result +// from FP multiply μops to the accumulate operands of an +// FP multiply-accumulate μop. 
The latter can potentially be issued 1 cycle
+// after the FP multiply μop has been issued.
+// FP multiply, FZ
+def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+
+def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
+def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
+def : ReadAdvance<ReadFPMUL, 0>;
+
+// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
+// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
+def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+
+// VFMA takes 9 cyc for the common case and 4 cyc for a VFMA->VFMA chain
+// (5 read adv.). VMUL takes 5 cyc for the common case and 1 cyc for a
+// VMUL->VFMA chain (4 read adv.). Currently there is no way to define
+// different read advances for the VFMA operand depending on whether it comes
+// from a VFMA or from a VMUL, so a single read advance of 5 is used for both.
+// Zero latency (instead of one) for the VMUL->VFMA chain shouldn't break
+// anything. The same applies to the ASIMD VMUL/VFMA instructions.
+// def A57ReadVFMA : SchedRead;
+// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
+// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
+def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
+
+def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
+def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
+def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
+
+// --- 3.11 FP Miscellaneous Instructions ---
+// VMOV: 3cyc "F0/F1" for imm/reg
+def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+
+// 5cyc L for FP transfer, vfp to core reg,
+// 5cyc L for FP transfer, core reg to vfp
+def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
+// VMOVRRS/VMOVRRD are declared in common code with a single WriteFPMOV
+// (instead of two); their InstRW comes right after the sketch below.
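To make the read-advance arithmetic in the comment above concrete, here is a minimal standalone C++ sketch (illustrative only, not part of the patch; the helper name effectiveLatency is invented). The producer latencies, 9 for A57WriteVFMA and 5 for A57WriteVMUL, and the advance of 5 come from the definitions above; the model is the usual simplification that an advanced operand read shortens the visible producer latency by the advance.

#include <algorithm>
#include <cassert>

// Effective latency seen by a consumer whose operand read is advanced:
// the operand is read readAdvance cycles after issue, so the producer
// result only has to be ready (producerLatency - readAdvance) cycles
// after the consumer issues (never less than zero).
int effectiveLatency(int producerLatency, int readAdvance) {
  return std::max(0, producerLatency - readAdvance);
}

int main() {
  // VFMA -> VFMA accumulate chain: 9 - 5 = 4 cycles, the documented value.
  assert(effectiveLatency(9, 5) == 4);
  // VMUL -> VFMA chain: 5 - 5 = 0 cycles; the documented value is 1, and
  // the comment above argues that modelling 0 instead of 1 is harmless.
  assert(effectiveLatency(5, 5) == 0);
  return 0;
}

This is why a single 5-cycle advance reproduces the 4-cycle VFMA-to-VFMA chain exactly while rounding the VMUL-to-VFMA chain down by one cycle.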
+def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>; + +// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg +def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>; + +// --- 3.12 FP Load Instructions --- +def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>; + +def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>; + +// FP load multiple (VLDM) + +def A57VLDMOpsListUncond : A57WriteLMOpsListType< + [A57Write_5cyc_1L, A57Write_5cyc_1L, + A57Write_6cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_7cyc_1L, + A57Write_8cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_9cyc_1L, + A57Write_10cyc_1L, A57Write_10cyc_1L, + A57Write_11cyc_1L, A57Write_11cyc_1L, + A57Write_12cyc_1L, A57Write_12cyc_1L]>; +def A57WriteVLDMuncond : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]> +]> { let Variadic=1; } + +def A57VLDMOpsListCond : A57WriteLMOpsListType< + [A57Write_5cyc_1L, A57Write_6cyc_1L, + A57Write_7cyc_1L, A57Write_8cyc_1L, + A57Write_9cyc_1L, A57Write_10cyc_1L, + A57Write_11cyc_1L, A57Write_12cyc_1L, + A57Write_13cyc_1L, A57Write_14cyc_1L, + A57Write_15cyc_1L, A57Write_16cyc_1L, + A57Write_17cyc_1L, A57Write_18cyc_1L, + A57Write_19cyc_1L, A57Write_20cyc_1L]>; +def A57WriteVLDMcond : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]> +]> { let Variadic=1; } + +def A57WriteVLDM : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>, + SchedVar<NoSchedPred, [A57WriteVLDMuncond]> +]> { let Variadic=1; } + +def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>; + +def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType< + [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I, + A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I, + A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, + A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I, + A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>; +def A57WriteVLDMuncond_UPD : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>, + 
SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]> +]> { let Variadic=1; } + +def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType< + [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I, + A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I, + A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I, + A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I, + A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I, + A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I, + A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I, + A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>; +def A57WriteVLDMcond_UPD : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>, + SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>, + SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>, + SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>, + SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>, + SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>, + SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>, + SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>, + SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]> +]> { let Variadic=1; } + +def A57WriteVLDM_UPD : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>, + SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]> +]> { let Variadic=1; } + +def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD], + (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>; + +// --- 3.13 FP Store Instructions --- +def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>; + +def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>; + +def A57WriteVSTMs : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S]> +]>; +def A57WriteVSTMd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>, + SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>, + SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>, + SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>, + SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>, + SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>, + SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>, + SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1S]> +]>; +def A57WriteVSTMs_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>, + SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]> +]>; +def A57WriteVSTMd_Upd : SchedWriteVariant<[ + SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>, + SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>, + SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>, + SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>, + SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>, + SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>, + SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>, + 
SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>, + SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]> +]>; + +def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>; +def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>; +def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd], + (instregex "VSTM(SIA_UPD|SDB_UPD)")>; +def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd], + (instregex "VSTM(DIA_UPD|DDB_UPD)")>; + +// --- 3.14 ASIMD Integer Instructions --- + +// ASIMD absolute diff, 3cyc F0/F1 for integer VABD +def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>; + +// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form +def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>; +def : InstRW<[A57WriteVABAD, A57ReadVABAD], + (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>; +def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; } +def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>; +def : InstRW<[A57WriteVABAQ, A57ReadVABAQ], + (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>; + +// ASIMD absolute diff accum long: 4(1) F1 for VABAL +def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>; +def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>; + +// ASIMD absolute diff long: 3cyc F0/F1 for VABDL +def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>; + +// ASIMD arith, basic +def : InstRW<[A57Write_3cyc_1V], (instregex "VADD", "VADDL", "VADDW", + "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)", + "VPADDi", "VPADDL", "VSUB", "VSUBL", "VSUBW")>; + +// ASIMD arith, complex +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB", + "VQABS", "VQADD", "VQNEG", "VQSUB", + "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>; + +// ASIMD compare +def : InstRW<[A57Write_3cyc_1V], + (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>; + +// ASIMD logical +def : InstRW<[A57Write_3cyc_1V], + (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>; + +// ASIMD max/min +def : InstRW<[A57Write_3cyc_1V], + (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>; + +// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later +// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply +// and multiply-with-accumulate instructions relative to r0pX. 
+def A57WriteVMULD_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def : InstRW<[A57WriteVMULD_VecInt], (instregex + "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)", + "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>; + +// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later +def A57WriteVMULQ_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>; +def : InstRW<[A57WriteVMULQ_VecInt], (instregex + "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)", + "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>; + +// ASIMD multiply accumulate, D-form +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence +// (4 or 3 ReadAdvance) +def A57WriteVMLAD_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVMLAD_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt], + (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>; + +// ASIMD multiply accumulate, Q-form +// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence +// (4 or 3 ReadAdvance) +def A57WriteVMLAQ_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>; +def A57ReadVMLAQ_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt], + (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>; + +// ASIMD multiply accumulate long +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence +// (4 or 3 ReadAdvance) +def A57WriteVMLAL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVMLAL_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]> +]>; +def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt], + (instregex "VMLAL(s|u)", "VMLSL(s|u)")>; + +// ASIMD multiply accumulate saturating long +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence +// (3 or 2 ReadAdvance) +def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def A57ReadVQDMLAL_VecInt : SchedReadVariant<[ + SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>, + SchedVar<NoSchedPred, [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]> +]>; +def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt], + (instregex "VQDMLAL", "VQDMLSL")>; + +// ASIMD multiply long +// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later +def A57WriteVMULL_VecInt : SchedWriteVariant<[ + SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>, + SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>; +def : InstRW<[A57WriteVMULL_VecInt], + (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>; + +// ASIMD pairwise add and accumulate +// 4cyc F1, 1cyc for accumulate sequence 
(3cyc ReadAdvance) +def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>; +def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>; + +// ASIMD shift accumulate +// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance) +def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>; +def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>; + +// ASIMD shift by immed, basic +def : InstRW<[A57Write_3cyc_1X], + (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>; + +// ASIMD shift by immed, complex +def : InstRW<[A57Write_4cyc_1X], (instregex + "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)", + "VRSHRN")>; + +// ASIMD shift by immed and insert, basic, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex + "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by immed and insert, basic, Q-form +def : InstRW<[A57Write_5cyc_1X], (instregex + "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, basic, D-form +def : InstRW<[A57Write_3cyc_1X], (instregex + "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by register, basic, Q-form +def : InstRW<[A57Write_4cyc_1X], (instregex + "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, complex, D-form +// VQRSHL, VQSHL, VRSHL +def : InstRW<[A57Write_4cyc_1X], (instregex + "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", + "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>; + +// ASIMD shift by register, complex, Q-form +def : InstRW<[A57Write_5cyc_1X], (instregex + "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", + "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>; + +// --- 3.15 ASIMD Floating-Point Instructions --- +// ASIMD FP absolute value +def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>; + +// ASIMD FP arith +def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)", + "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>; + +// ASIMD FP compare +def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)", + "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>; + +// ASIMD FP convert, integer +def : InstRW<[A57Write_5cyc_1V], (instregex + "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)", + "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)", + "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>; + +// ASIMD FP convert, half-precision: 8cyc F0/F1 +def : InstRW<[A57Write_8cyc_1V], (instregex + "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)", + "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)", + "VCVT(f2h|h2f)")>; + +// ASIMD FP max/min +def : InstRW<[A57Write_5cyc_1V], (instregex + "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>; + +// ASIMD FP multiply +def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>; + +// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence +def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57ReadVMLA_VecFP : + SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>; +def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP], + (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>; + +// ASIMD FP negate +def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>; + +// ASIMD FP 
round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex
+  "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
+
+// --- 3.16 ASIMD Miscellaneous Instructions ---
+
+// ASIMD bitwise insert
+def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
+
+// ASIMD count
+def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
+
+// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
+
+// ASIMD duplicate, scalar: 3cyc "F0/F1"
+def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
+
+// ASIMD extract
+def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
+
+// ASIMD move, immed
+def : InstRW<[A57Write_3cyc_1V], (instregex
+  "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
+  "VMOVQ0")>;
+
+// ASIMD move, narrowing
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
+
+// ASIMD reciprocal estimate
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
+
+// ASIMD reciprocal step, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
+
+// ASIMD reverse, swap, table lookup (1-2 reg)
+def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
+
+// ASIMD table lookup (3-4 reg)
+def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
+
+// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
+def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
+
+// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
+
+// ASIMD transpose
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
+
+// ASIMD unzip/zip, D-form
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
+  (instregex "VUZPd", "VZIPd")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
+  (instregex "VUZPq", "VZIPq")>;
+
+// --- 3.17 ASIMD Load Instructions ---
+
+// Overridden via InstRW for this processor.
+def : WriteRes<WriteVLD1, []>; +def : WriteRes<WriteVLD2, []>; +def : WriteRes<WriteVLD3, []>; +def : WriteRes<WriteVLD4, []>; +def : WriteRes<WriteVST1, []>; +def : WriteRes<WriteVST2, []>; +def : WriteRes<WriteVST3, []>; +def : WriteRes<WriteVST4, []>; + +// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency +def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>; +def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne], + (instregex "VLD1(d|q)(8|16|32|64)wb")>; + +// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency +def : InstRW<[A57Write_6cyc_1L], + (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>; + +def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne], + (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>; + +// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], (instregex + "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex + "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V], + (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>; + +// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD2b(8|16|32)wb")>; + +// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$", + "VLD2LN(d|q)(8|16|32)Pseudo$")>; +// 2 results + wb result +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne], + (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>; +// 1 result + wb result +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb", + "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1" +// 3 results +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V], + (instregex "VLD3(d|q)(8|16|32)$")>; +// 1 result +def : InstRW<[A57Write_9cyc_1L_1V], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>; +// 3 results + wb +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3(d|q)(8|16|32)_UPD$")>; +// 1 result + wb +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + +// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD3LN(d|q)32$", + "VLD3LN(d|q)32Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)32_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)32Pseudo_UPD")>; + +// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V], + (instregex "VLD3LN(d|q)(8|16)$", + "VLD3LN(d|q)(8|16)Pseudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3LN(d|q)(8|16)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex 
"VLD3LN(d|q)(8|16)Pseudo_UPD")>; + +// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V], + (instregex "VLD3DUP(d|q)(8|16|32)$", + "VLD3DUP(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>; + +// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, + A57Write_9cyc_1L_1V], + (instregex "VLD4(d|q)(8|16|32)$")>; +def : InstRW<[A57Write_9cyc_1L_1V], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + +// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, + A57Write_8cyc_1L_1V], + (instregex "VLD4LN(d|q)32$", + "VLD4LN(d|q)32Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57WrBackOne], + (instregex "VLD4LN(d|q)32_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4LN(d|q)32Pseudo_UPD")>; + +// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1" +def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, + A57Write_9cyc_1L_1V], + (instregex "VLD4LN(d|q)(8|16)$", + "VLD4LN(d|q)(8|16)Pseudo$")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, + A57WrBackOne], + (instregex "VLD4LN(d|q)(8|16)_UPD")>; +def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>; + +// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1" +def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, + A57Write_8cyc_1L_1V], + (instregex "VLD4DUP(d|q)(8|16|32)$", + "VLD4DUP(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I, + A57WrBackOne], + (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>; +def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], + (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>; + +// --- 3.18 ASIMD Store Instructions --- + +// ASIMD store, 1 element, multiple, 1 reg: 1cyc S +def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>; +def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], + (instregex "VST1d(8|16|32|64)wb")>; +// ASIMD store, 1 element, multiple, 2 reg: 2cyc S +def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>; +def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I], + (instregex "VST1q(8|16|32|64)wb")>; +// ASIMD store, 1 element, multiple, 3 reg: 3cyc S +def : InstRW<[A57Write_3cyc_1S], + (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I], + (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>; +// ASIMD store, 1 element, multiple, 4 reg: 4cyc S +def : InstRW<[A57Write_4cyc_1S], + (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I], + (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>; +// ASIMD store, 1 element, one 
lane: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>; +// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST2(d|b)(8|16|32)$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST2(b|d)(8|16|32)wb")>; +// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S" +def : InstRW<[A57Write_4cyc_1S_1V], + (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I], + (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>; +// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S" +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST2LN(d|q)(8|16|32)_UPD", + "VST2LN(d|q)(8|16|32)Pseudo_UPD")>; +// ASIMD store, 3 element, multiple, 3 reg +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST3(d|q)(8|16|32)_UPD", + "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>; +// ASIMD store, 3 element, one lane +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST3LN(d|q)(8|16|32)_UPD", + "VST3LN(d|q)(8|16|32)Pseudo_UPD")>; +// ASIMD store, 4 element, multiple, 4 reg +def : InstRW<[A57Write_4cyc_1S_1V], + (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>; +def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I], + (instregex "VST4(d|q)(8|16|32)_UPD", + "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>; +// ASIMD store, 4 element, one lane +def : InstRW<[A57Write_3cyc_1S_1V], + (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>; +def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I], + (instregex "VST4LN(d|q)(8|16|32)_UPD", + "VST4LN(d|q)(8|16|32)Pseudo_UPD")>; + +// --- 3.19 Cryptography Extensions --- +// Crypto AES ops +// AESD, AESE, AESIMC, AESMC: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>; +// Crypto SHA1 xor ops: 6cyc F0/F1 +def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; +// Crypto SHA1 fast ops: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; +// Crypto SHA1 slow ops: 6cyc F0 +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; +// Crypto SHA256 fast ops: 3cyc F0 +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>; +// Crypto SHA256 slow ops: 6cyc F0 +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>; + +// --- 3.20 CRC --- +def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>; + +// ----------------------------------------------------------------------------- +// Common definitions +def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; } +def : SchedAlias<WriteALU, A57Write_1cyc_1I>; + +def : SchedAlias<WriteBr, A57Write_1cyc_1B>; +def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>; +def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>; +def : SchedAlias<WritePreLd, A57Write_4cyc_1L>; + +def : SchedAlias<WriteLd, A57Write_4cyc_1L>; +def : SchedAlias<WriteST, A57Write_1cyc_1S>; 
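The load/store-multiple variants earlier in this file all follow the same pattern: a predicate on the size of the register list (A57LMAddrPred1..8, i.e. (defs+1)/2 address cycles) selects a prefix of a fixed latency ladder. A minimal standalone C++ sketch of the A57LDMOpsListNoregin ladder follows; it is illustrative only, the function name is invented, and a std::vector stands in for the TableGen write list.

#include <algorithm>
#include <cassert>
#include <vector>

// Models the A57LDMOpsListNoregin table above: result i (0-based) of an
// LDM whose base register is not in the list becomes available after
// 3 + i/2 cycles, i.e. the 16-entry ladder 3,3,4,4,...,10,10.
std::vector<int> ldmResultLatencies(int numRegs) {
  int n = std::min(numRegs, 16); // the ops list above has 16 entries
  std::vector<int> lat;
  for (int i = 0; i < n; ++i)
    lat.push_back(3 + i / 2);
  return lat;
}

int main() {
  // A57LMAddrPred1 covers 1-2 registers: both results ready after 3 cycles.
  assert((ldmResultLatencies(2) == std::vector<int>{3, 3}));
  // A57LMAddrPred3 covers 5-6 registers ((5+1)/2 == (6+1)/2 == 3).
  assert((ldmResultLatencies(6) == std::vector<int>{3, 3, 4, 4, 5, 5}));
  return 0;
}

The register-in-list and writeback variants use the same scheme with each entry shifted by one cycle and an extra I-pipe micro-op, which is why they can share this ladder structure.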
+def : ReadAdvance<ReadALU, 0>;
+
+} // SchedModel = CortexA57Model
+
diff --git a/lib/Target/ARM/ARMScheduleA57WriteRes.td b/lib/Target/ARM/ARMScheduleA57WriteRes.td
new file mode 100644
index 000000000000..670717dc7c13
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleA57WriteRes.td
@@ -0,0 +1,323 @@
+//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+//   Prefix: A57Write
+//   Latency: #cyc
+//   MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and
+// four to V pipes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+                                                    let ResourceCycles = [17]; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+                                                    let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+                                                    let ResourceCycles = [19]; }
+def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20;
+                                                    let ResourceCycles = [20]; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; }
+def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; }
+def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+                                                    let ResourceCycles = [32]; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+                                                    let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+                                                    let ResourceCycles = [35]; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+
+// A57Write_3cyc_1L - A57Write_20cyc_1L
+foreach Lat = 3-20 in {
+  def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> {
+    let Latency = Lat;
+  }
+}
+
+// A57Write_4cyc_1S - A57Write_16cyc_1S
+foreach Lat = 4-16 in {
+  def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> {
+    let Latency = Lat;
+  }
+}
+
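The two foreach loops above are the mechanical part of the naming convention described in the file header. A tiny standalone C++ program, not part of the patch and purely illustrative, that prints what they expand to:

#include <cstdio>

// Prints the records the two TableGen `foreach` loops above generate,
// to make the A57Write_<latency>cyc_<units> naming convention concrete.
// TableGen does the real expansion; this only mimics the output.
int main() {
  for (int lat = 3; lat <= 20; ++lat)   // A57Write_3cyc_1L .. A57Write_20cyc_1L
    std::printf("def A57Write_%dcyc_1L : SchedWriteRes<[A57UnitL]> "
                "{ let Latency = %d; }\n", lat, lat);
  for (int lat = 4; lat <= 16; ++lat)   // A57Write_4cyc_1S .. A57Write_16cyc_1S
    std::printf("def A57Write_%dcyc_1S : SchedWriteRes<[A57UnitS]> "
                "{ let Latency = %d; }\n", lat, lat);
  return 0;
}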
+def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; }
+def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 6; }
+def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 6; }
+def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 8; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency = 64;
+  let NumMicroOps = 2;
+  let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+                                         A57UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV,
+                                         A57UnitX]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+                                         A57UnitX]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+                                         A57UnitV]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL,
+                                         A57UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+                                         A57UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI,
+                                         A57UnitM]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+                                          A57UnitV]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+                                         A57UnitI]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+                                         A57UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS,
+                                         A57UnitI]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS,
+                                         A57UnitI]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS,
+                                         A57UnitI]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS,
+                                         A57UnitM]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+                                         A57UnitI]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1B_1I :
SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 36; + let NumMicroOps = 2; + let ResourceCycles = [18, 18]; +} +def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I +foreach Lat = 3-20 in { + def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> { + let Latency = Lat; let NumMicroOps = 2; + } +} + +def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} +def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I +foreach Lat = 4-16 in { + def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> { + let Latency = Lat; let NumMicroOps = 2; + } +} + +def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 3; +} +def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} +def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, + A57UnitV, + A57UnitI]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS, + A57UnitV, + A57UnitI]> { + let Latency = 4; + let NumMicroOps = 3; +} +def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> { + let Latency = 4; + let NumMicroOps = 3; +} +def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, + A57UnitV, + A57UnitI]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL, + A57UnitV, + A57UnitI]> { + let Latency = 9; + let NumMicroOps = 3; +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index d2630685d91b..af682dd8321c 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -234,6 +234,10 @@ protected: /// CPSR setting instruction. bool AvoidCPSRPartialUpdate = false; + /// CheapPredicableCPSRDef - If true, disable +1 predication cost + /// for instructions updating CPSR. Enabled for Cortex-A57. + bool CheapPredicableCPSRDef = false; + /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting /// movs with shifter operand (i.e. asr, lsl, lsr). 
bool AvoidMOVsShifterOperand = false; @@ -543,6 +547,7 @@ public: bool nonpipelinedVFP() const { return NonpipelinedVFP; } bool prefers32BitThumb() const { return Pref32BitThumb; } bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } + bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } bool hasMPExtension() const { return HasMPExtension; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 0fef91ec4d3e..b76da727237c 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -3419,9 +3419,7 @@ int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI, int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode()); if (NewOpcode >= 0) return NewOpcode; - - dbgs() << "Cannot convert to .new: " << getName(MI.getOpcode()) << '\n'; - llvm_unreachable(nullptr); + return 0; } int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const { diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp index 4593fc92ca6f..35948e36ad91 100644 --- a/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -135,6 +135,14 @@ private: // returns true on success. static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry); + // Attempts to reduce LBU/LHU instruction into LBU16/LHU16, + // returns true on success. + static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry); + + // Attempts to reduce SB/SH instruction into SB16/SH16, + // returns true on success. + static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry); + // Attempts to reduce arithmetic instructions, returns true on success static bool ReduceArithmeticInstructions(MachineInstr *MI, const ReduceEntry &Entry); @@ -162,10 +170,26 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = { {RT_OneInstr, OpCodes(Mips::ADDu_MM, Mips::ADDU16_MM), ReduceArithmeticInstructions, OpInfo(OT_OperandsAll), ImmField(0, 0, 0, -1)}, + {RT_OneInstr, OpCodes(Mips::LBu, Mips::LBU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)}, + {RT_OneInstr, OpCodes(Mips::LBu_MM, Mips::LBU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)}, + {RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, {RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP, OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)}, {RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP, OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)}, + {RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SB_MM, Mips::SB16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SH, Mips::SH16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, + {RT_OneInstr, OpCodes(Mips::SH_MM, Mips::SH16_MM), ReduceSXtoSX16, + OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)}, {RT_OneInstr, OpCodes(Mips::SUBu, Mips::SUBU16_MM), ReduceArithmeticInstructions, OpInfo(OT_OperandsAll), ImmField(0, 0, 0, -1)}, @@ -193,6 +217,13 @@ static bool 
isMMThreeBitGPRegister(const MachineOperand &MO) { return false; } +// Returns true if the machine operand MO is register $0, $17, or $2-$7. +static bool isMMSourceRegister(const MachineOperand &MO) { + if (MO.isReg() && Mips::GPRMM16ZeroRegClass.contains(MO.getReg())) + return true; + return false; +} + // Returns true if the operand Op is an immediate value // and writes the immediate value into variable Imm static bool GetImm(MachineInstr *MI, unsigned Op, int64_t &Imm) { @@ -279,6 +310,32 @@ bool MicroMipsSizeReduce::ReduceArithmeticInstructions( return ReplaceInstruction(MI, Entry); } +bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI, + const ReduceEntry &Entry) { + + if (!ImmInRange(MI, Entry)) + return false; + + if (!isMMThreeBitGPRegister(MI->getOperand(0)) || + !isMMThreeBitGPRegister(MI->getOperand(1))) + return false; + + return ReplaceInstruction(MI, Entry); +} + +bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI, + const ReduceEntry &Entry) { + + if (!ImmInRange(MI, Entry)) + return false; + + if (!isMMSourceRegister(MI->getOperand(0)) || + !isMMThreeBitGPRegister(MI->getOperand(1))) + return false; + + return ReplaceInstruction(MI, Entry); +} + bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Modified = false; MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt index 54619589c341..35a67134775a 100644 --- a/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -88,6 +88,3 @@ pr45695.c wasm-o pr49279.c wasm-o pr49390.c wasm-o pr52286.c wasm-o - -# fatal error: error in backend: data symbols must have a size set with .size -921110-1.c wasm-o diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0a41f35f9320..5303d7a406ad 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4753,7 +4753,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, SmallVectorImpl<int> &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); int NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); + ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; @@ -5848,17 +5848,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } case ISD::SCALAR_TO_VECTOR: { - // Match against a scalar_to_vector of an extract from a similar vector. + // Match against a scalar_to_vector of an extract from a vector, + // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. 
SDValue N0 = N.getOperand(0); - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - N0.getOperand(0).getValueType() != VT || - !isa<ConstantSDNode>(N0.getOperand(1)) || - NumElts <= N0.getConstantOperandVal(1) || - !N->isOnlyUserOf(N0.getNode())) + SDValue SrcExtract; + + if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + N0.getOperand(0).getValueType() == VT) { + SrcExtract = N0; + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRW && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16); + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRB && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8); + } + + if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)) || + NumElts <= SrcExtract.getConstantOperandVal(1)) return false; - Ops.push_back(N0.getOperand(0)); - Mask.push_back(N0.getConstantOperandVal(1)); - Mask.append(NumElts - 1, SM_SentinelUndef); + + SDValue SrcVec = SrcExtract.getOperand(0); + EVT SrcVT = SrcVec.getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; + + Ops.push_back(SrcVec); + Mask.push_back(SrcExtract.getConstantOperandVal(1)); + Mask.append(NumZeros, SM_SentinelZero); + Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: @@ -6542,12 +6564,12 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { - assert((ScalarSize == 32 || ScalarSize == 64) && - "Unsupported floating point scalar size"); - if (ScalarSize == 32) - Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat()); - else - Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble()); + if (ScalarSize == 32) { + Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); + } else { + assert(ScalarSize == 64 && "Unsupported floating point scalar size"); + Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); + } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); @@ -6633,11 +6655,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); - Constant *C = SplatBitSize == 32 - ? ConstantFP::get(Type::getFloatTy(*Ctx), - SplatValue.bitsToFloat()) - : ConstantFP::get(Type::getDoubleTy(*Ctx), - SplatValue.bitsToDouble()); + // Lower the splat via APFloat directly, to avoid any conversion. + Constant *C = + SplatBitSize == 32 + ? 
ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEsingle(), SplatValue)) + : ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; @@ -8003,7 +8027,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef<int> Mask, SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); + auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { @@ -16997,7 +17021,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); - ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); @@ -17024,18 +17048,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is // available. SDValue Cmp; - unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); if (SSECC == 8) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; - if (SetCCOpcode == ISD::SETUEQ) { + if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : static_cast<unsigned>(ISD::OR); } else { - assert(SetCCOpcode == ISD::SETONE); + assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : @@ -17082,7 +17106,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // 2. The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. - SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); + SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); return DAG.getZExtOrTrunc(NewOp, dl, VT); } @@ -17122,7 +17146,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; - switch (SetCCOpcode) { + switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; @@ -17137,60 +17161,49 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } // Are we comparing unsigned or signed integers? - unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) - ? X86ISD::VPCOMU : X86ISD::VPCOM; + unsigned Opc = + ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } - // We are handling one of the integer comparisons here. Since SSE only has + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. 
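The hunk below replaces a fall-through switch over the condition code with four boolean expressions. A standalone C++ sketch (illustrative only; the enums and the isTrueWhenEqual/isUnsignedIntSetCC helpers locally model the usual ISD::CondCode semantics rather than including LLVM headers) that checks the two formulations agree for every integer condition:

#include <cassert>

enum Cond { SETEQ, SETNE, SETLT, SETGT, SETLE, SETGE,
            SETULT, SETUGT, SETULE, SETUGE };
enum Opc { PCMPEQ, PCMPGT };

// Conditions that hold when the operands are equal.
static bool isTrueWhenEqual(Cond C) {
  return C == SETEQ || C == SETGE || C == SETLE || C == SETUGE || C == SETULE;
}
// Unsigned comparison predicates.
static bool isUnsignedIntSetCC(Cond C) {
  return C == SETULT || C == SETUGT || C == SETULE || C == SETUGE;
}

struct Lowering { Opc Op; bool Swap, Invert, FlipSigns; };

// The switch removed by the hunk, minus its unreachable default case;
// the fall-throughs mirror the LLVM_FALLTHROUGH annotations.
static Lowering oldSwitch(Cond C) {
  Lowering L{PCMPEQ, false, false, false};
  switch (C) {
  case SETNE:  L.Invert = true;                                 // fall through
  case SETEQ:  L.Op = PCMPEQ; break;
  case SETLT:  L.Swap = true;                                   // fall through
  case SETGT:  L.Op = PCMPGT; break;
  case SETGE:  L.Swap = true;                                   // fall through
  case SETLE:  L.Op = PCMPGT; L.Invert = true; break;
  case SETULT: L.Swap = true;                                   // fall through
  case SETUGT: L.Op = PCMPGT; L.FlipSigns = true; break;
  case SETUGE: L.Swap = true;                                   // fall through
  case SETULE: L.Op = PCMPGT; L.FlipSigns = true; L.Invert = true; break;
  }
  return L;
}

// The replacement boolean expressions introduced by the hunk.
static Lowering newExprs(Cond C) {
  Lowering L;
  L.Op = (C == SETEQ || C == SETNE) ? PCMPEQ : PCMPGT;
  L.Swap = C == SETLT || C == SETULT || C == SETGE || C == SETUGE;
  L.Invert = C == SETNE || (C != SETEQ && isTrueWhenEqual(C));
  L.FlipSigns = isUnsignedIntSetCC(C);
  return L;
}

int main() {
  const Cond All[] = {SETEQ, SETNE, SETLT, SETGT, SETLE, SETGE,
                      SETULT, SETUGT, SETULE, SETUGE};
  for (Cond C : All) {
    Lowering A = oldSwitch(C), B = newExprs(C);
    assert(A.Op == B.Op && A.Swap == B.Swap &&
           A.Invert == B.Invert && A.FlipSigns == B.FlipSigns);
  }
  return 0;
}

Under this model the refactor is behavior-preserving; the MinMax and Subus special cases that follow then override these defaults exactly as before.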
- unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; - bool Subus = false; - - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; - case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETLE: Opc = X86ISD::PCMPGT; - Invert = true; break; - case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETUGT: Opc = X86ISD::PCMPGT; - FlipSigns = true; break; - case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; - case ISD::SETULE: Opc = X86ISD::PCMPGT; - FlipSigns = true; Invert = true; break; - } + unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ + : X86ISD::PCMPGT; + bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || + Cond == ISD::SETGE || Cond == ISD::SETUGE; + bool Invert = Cond == ISD::SETNE || + (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); + bool FlipSigns = ISD::isUnsignedIntSetCC(Cond); // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); - bool hasMinMax = - (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) - || (Subtarget.hasSSE2() && (VET == MVT::i8)); - - if (hasMinMax) { - switch (SetCCOpcode) { + bool HasMinMax = + (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || + (Subtarget.hasSSE2() && (VET == MVT::i8)); + bool MinMax = false; + if (HasMinMax) { + switch (Cond) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } - if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + if (MinMax) + Swap = Invert = FlipSigns = false; } - bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); - if (!MinMax && hasSubus) { + bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); + bool Subus = false; + if (!MinMax && HasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> - switch (SetCCOpcode) { + switch (Cond) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 613b4a7f03e9..626a891f65c6 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -228,7 +228,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, SmallVector<ReturnInst *, 4> Returns; - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/false, Returns); + CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); // Remove old returns. 
for (ReturnInst *Return : Returns) diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp index ea48043f9381..44e1f9b404ed 100644 --- a/lib/Transforms/Coroutines/Coroutines.cpp +++ b/lib/Transforms/Coroutines/Coroutines.cpp @@ -218,6 +218,8 @@ void coro::Shape::buildFrom(Function &F) { size_t FinalSuspendIndex = 0; clear(*this); SmallVector<CoroFrameInst *, 8> CoroFrames; + SmallVector<CoroSaveInst *, 2> UnusedCoroSaves; + for (Instruction &I : instructions(F)) { if (auto II = dyn_cast<IntrinsicInst>(&I)) { switch (II->getIntrinsicID()) { @@ -229,6 +231,12 @@ void coro::Shape::buildFrom(Function &F) { case Intrinsic::coro_frame: CoroFrames.push_back(cast<CoroFrameInst>(II)); break; + case Intrinsic::coro_save: + // After optimizations, coro_suspends using this coro_save might have + // been removed, remember orphaned coro_saves to remove them later. + if (II->use_empty()) + UnusedCoroSaves.push_back(cast<CoroSaveInst>(II)); + break; case Intrinsic::coro_suspend: CoroSuspends.push_back(cast<CoroSuspendInst>(II)); if (CoroSuspends.back()->isFinal()) { @@ -311,4 +319,8 @@ void coro::Shape::buildFrom(Function &F) { if (HasFinalSuspend && FinalSuspendIndex != CoroSuspends.size() - 1) std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); + + // Remove orphaned coro.saves. + for (CoroSaveInst *CoroSave : UnusedCoroSaves) + CoroSave->eraseFromParent(); } diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 28cc81c76d4f..5cc29a493798 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1188,6 +1188,10 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { SCCNodes.insert(F); } + // Skip it if the SCC only contains optnone functions. + if (SCCNodes.empty()) + return Changed; + Changed |= addArgumentReturnedAttrs(SCCNodes); Changed |= addReadAttrs(SCCNodes, AARGetter); Changed |= addArgumentAttrs(SCCNodes); diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 231487923fad..6d34ab8b0d96 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -292,8 +292,7 @@ static void computeImportForFunction( static void ComputeImportForModule( const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList, - StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr, - const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) { + StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) { // Worklist contains the list of function imported in this module, for which // we will analyse the callees and may import further down the callgraph. 
SmallVector<EdgeInfo, 128> Worklist; @@ -301,7 +300,7 @@ static void ComputeImportForModule( // Populate the worklist with the import for the functions in the current // module for (auto &GVSummary : DefinedGVSummaries) { - if (DeadSymbols && DeadSymbols->count(GVSummary.first)) { + if (!Index.isGlobalValueLive(GVSummary.second)) { DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); continue; } @@ -344,15 +343,14 @@ void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, StringMap<FunctionImporter::ImportMapTy> &ImportLists, - StringMap<FunctionImporter::ExportSetTy> &ExportLists, - const DenseSet<GlobalValue::GUID> *DeadSymbols) { + StringMap<FunctionImporter::ExportSetTy> &ExportLists) { // For each module that has function defined, compute the import/export lists. for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) { auto &ImportList = ImportLists[DefinedGVSummaries.first()]; DEBUG(dbgs() << "Computing import for Module '" << DefinedGVSummaries.first() << "'\n"); ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList, - &ExportLists, DeadSymbols); + &ExportLists); } // When computing imports we added all GUIDs referenced by anything @@ -414,82 +412,71 @@ void llvm::ComputeCrossModuleImportForModule( #endif } -DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols( - const ModuleSummaryIndex &Index, +void llvm::computeDeadSymbols( + ModuleSummaryIndex &Index, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) { + assert(!Index.withGlobalValueDeadStripping()); if (!ComputeDead) - return DenseSet<GlobalValue::GUID>(); + return; if (GUIDPreservedSymbols.empty()) // Don't do anything when nothing is live, this is friendly with tests. - return DenseSet<GlobalValue::GUID>(); - DenseSet<ValueInfo> LiveSymbols; + return; + unsigned LiveSymbols = 0; SmallVector<ValueInfo, 128> Worklist; Worklist.reserve(GUIDPreservedSymbols.size() * 2); for (auto GUID : GUIDPreservedSymbols) { ValueInfo VI = Index.getValueInfo(GUID); if (!VI) continue; - DEBUG(dbgs() << "Live root: " << VI.getGUID() << "\n"); - LiveSymbols.insert(VI); - Worklist.push_back(VI); + for (auto &S : VI.getSummaryList()) + S->setLive(true); } + // Add values flagged in the index as live roots to the worklist. - for (const auto &Entry : Index) { - bool IsLiveRoot = llvm::any_of( - Entry.second.SummaryList, - [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) { - return Summary->liveRoot(); - }); - if (!IsLiveRoot) - continue; - DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n"); - Worklist.push_back(ValueInfo(&Entry)); - } + for (const auto &Entry : Index) + for (auto &S : Entry.second.SummaryList) + if (S->isLive()) { + DEBUG(dbgs() << "Live root: " << Entry.first << "\n"); + Worklist.push_back(ValueInfo(&Entry)); + ++LiveSymbols; + break; + } + + // Make value live and add it to the worklist if it was not live before. 
+ // FIXME: we should only make the prevailing copy live here + auto visit = [&](ValueInfo VI) { + for (auto &S : VI.getSummaryList()) + if (S->isLive()) + return; + for (auto &S : VI.getSummaryList()) + S->setLive(true); + ++LiveSymbols; + Worklist.push_back(VI); + }; while (!Worklist.empty()) { auto VI = Worklist.pop_back_val(); - - // FIXME: we should only make the prevailing copy live here for (auto &Summary : VI.getSummaryList()) { - for (auto Ref : Summary->refs()) { - if (LiveSymbols.insert(Ref).second) { - DEBUG(dbgs() << "Marking live (ref): " << Ref.getGUID() << "\n"); - Worklist.push_back(Ref); - } - } - if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { - for (auto Call : FS->calls()) { - if (LiveSymbols.insert(Call.first).second) { - DEBUG(dbgs() << "Marking live (call): " << Call.first.getGUID() - << "\n"); - Worklist.push_back(Call.first); - } - } - } + for (auto Ref : Summary->refs()) + visit(Ref); + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) + for (auto Call : FS->calls()) + visit(Call.first); if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) { auto AliaseeGUID = AS->getAliasee().getOriginalName(); ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID); - if (AliaseeVI && LiveSymbols.insert(AliaseeVI).second) { - DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n"); - Worklist.push_back(AliaseeVI); - } + if (AliaseeVI) + visit(AliaseeVI); } } } - DenseSet<GlobalValue::GUID> DeadSymbols; - DeadSymbols.reserve( - std::min(Index.size(), Index.size() - LiveSymbols.size())); - for (auto &Entry : Index) { - if (!LiveSymbols.count(ValueInfo(&Entry))) { - DEBUG(dbgs() << "Marking dead: " << Entry.first << "\n"); - DeadSymbols.insert(Entry.first); - } - } - DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and " - << DeadSymbols.size() << " symbols Dead \n"); - NumDeadSymbols += DeadSymbols.size(); - NumLiveSymbols += LiveSymbols.size(); - return DeadSymbols; + Index.setWithGlobalValueDeadStripping(); + + unsigned DeadSymbols = Index.size() - LiveSymbols; + DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols + << " symbols Dead \n"); + NumDeadSymbols += DeadSymbols; + NumLiveSymbols += LiveSymbols; } /// Compute the set of summaries needed for a ThinLTO backend compilation of diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index ca4ee92f971a..7bec50d9d25f 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1442,9 +1442,8 @@ bool LowerTypeTestsModule::lower() { for (auto &P : *ExportSummary) { for (auto &S : P.second.SummaryList) { auto *FS = dyn_cast<FunctionSummary>(S.get()); - if (!FS) + if (!FS || !ExportSummary->isGlobalValueLive(FS)) continue; - // FIXME: Only add live functions. 
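[For reference: the computeDeadSymbols change above keeps liveness on the summaries themselves instead of returning a separate dead set; at its core it is a plain mark phase over the index. A minimal sketch under assumed types (GUID keys, one edge list per summary standing in for refs/calls/aliasees):

    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    using GUID = unsigned long long;
    struct Summary { std::vector<GUID> Edges; bool Live = false; };

    // Flood liveness from the preserved roots; anything still unmarked when
    // the worklist drains is dead and can be stripped by later passes.
    void markLive(std::unordered_map<GUID, Summary> &Index,
                  const std::unordered_set<GUID> &Preserved) {
      std::vector<GUID> Worklist(Preserved.begin(), Preserved.end());
      while (!Worklist.empty()) {
        GUID G = Worklist.back();
        Worklist.pop_back();
        auto It = Index.find(G);
        if (It == Index.end() || It->second.Live)
          continue; // not in the index, or already visited
        It->second.Live = true;
        for (GUID Succ : It->second.Edges)
          Worklist.push_back(Succ);
      }
    }
]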
for (GlobalValue::GUID G : FS->type_tests())
 for (Metadata *MD : MetadataByGUID[G])
 AddTypeIdUse(MD).IsExported = true;
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index bc0967448cdd..ea805efc66b7 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -68,6 +68,10 @@ static cl::opt<int>
 cl::desc("Relative frequency of outline region to "
 "the entry block"));
 
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
+
 namespace {
 
 struct FunctionOutliningInfo {
@@ -83,7 +87,7 @@ struct FunctionOutliningInfo {
 SmallVector<BasicBlock *, 4> Entries;
 // The return block that is not included in the outlined region.
 BasicBlock *ReturnBlock;
- // The dominating block of the region ot be outlined.
+ // The dominating block of the region to be outlined.
 BasicBlock *NonReturnBlock;
 // The set of blocks in Entries that are predecessors to ReturnBlock
 SmallVector<BasicBlock *, 4> ReturnBlockPreds;
@@ -407,11 +411,23 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
 if (hasProfileData(F, OI))
 return OutlineRegionRelFreq;
 
- // When profile data is not available, we need to be very
- // conservative in estimating the overall savings. We need to make sure
- // the outline region relative frequency is not below the threshold
- // specified by the option.
- OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+ // When profile data is not available, we need to be conservative in
+ // estimating the overall savings. Static branch prediction can usually
+ // guess the branch direction right (taken/non-taken), but the guessed
+ // branch probability is usually not biased enough. When the
+ // outlined region is predicted to be likely, its probability needs
+ // to be made higher (more biased) to not under-estimate the cost of
+ // function outlining. On the other hand, if the outlined region
+ // is predicted to be less likely, the predicted probability is usually
+ // higher than the actual. For instance, the actual probability of the
+ // less likely target is only 5%, but the guessed probability can be
+ // 40%. In the latter case, there is no need for further adjustment.
+ // FIXME: add an option for this.
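[For reference: the clamping described in the comment above, reduced to a standalone sketch. The 45% cutoff matches the literal used just below; the ratio type is an assumption:

    struct Prob { unsigned N, D; }; // probability N/D
    // Trust static prediction when it already calls the region unlikely;
    // otherwise floor the frequency so outlining cost is not under-estimated.
    static Prob adjustOutlineRelFreq(Prob Guess, unsigned FloorPercent) {
      if (Guess.N * 100 < 45 * Guess.D)
        return Guess;                 // predicted unlikely: keep as-is
      if (Guess.N * 100 < FloorPercent * Guess.D)
        return {FloorPercent, 100};   // predicted likely but under-biased
      return Guess;
    }
]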
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
 
 return OutlineRegionRelFreq;
 }
@@ -496,6 +512,26 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
 if (isa<DbgInfoIntrinsic>(I))
 continue;
 
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ continue;
+ default:
+ break;
+ }
+
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+ }
+
 if (CallInst *CI = dyn_cast<CallInst>(I)) {
 InlineCost += getCallsiteCost(CallSite(CI), DL);
 continue;
@@ -519,7 +555,13 @@ std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
 Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
 BasicBlock *OutliningCallBB) {
 // First compute the cost of the outlined region 'OI' in the original
- // function 'F':
+ // function 'F'.
+ // FIXME: The code extractor (outliner) can now do code sinking/hoisting
+ // to reduce outlining cost. The hoisted/sunk code currently does not
+ // incur any runtime cost so it is still OK to compare the outlined
+ // function cost with the outlined region in the original function.
+ // If this ever changes, we will need to introduce a new extractor API
+ // to pass the information.
 int OutlinedRegionCost = 0;
 for (BasicBlock &BB : *F) {
 if (&BB != OI->ReturnBlock &&
@@ -542,8 +584,14 @@ std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
 assert(OutlinedFunctionCost >= OutlinedRegionCost &&
 "Outlined function cost should be no less than the outlined region");
 
- int OutliningRuntimeOverhead =
- OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. 
The cost should be adjusted accordingly: + OutlinedFunctionCost -= 2 * InlineConstants::InstrCost; + + int OutliningRuntimeOverhead = OutliningFuncCallCost + + (OutlinedFunctionCost - OutlinedRegionCost) + + ExtraOutliningPenalty; return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead, OutlinedRegionCost); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 9fd3a9021a27..16fba32e9805 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -157,7 +157,7 @@ static cl::opt<bool> static cl::opt<bool> EnableGVNSink( "enable-gvn-sink", cl::init(false), cl::Hidden, - cl::desc("Enable the GVN sinking pass (default = on)")); + cl::desc("Enable the GVN sinking pass (default = off)")); PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2c2b7317a1c0..c0798e164c39 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4508,13 +4508,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Builder->CreateAnd(A, B), Op1); - // ~x < ~y --> y < x - // ~x < cst --> ~cst < x + // ~X < ~Y --> Y < X + // ~X < C --> X > ~C if (match(Op0, m_Not(m_Value(A)))) { if (match(Op1, m_Not(m_Value(B)))) return new ICmpInst(I.getPredicate(), B, A); - if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) - return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A); + + const APInt *C; + if (match(Op1, m_APInt(C))) + return new ICmpInst(I.getSwappedPredicate(), A, + ConstantInt::get(Op1->getType(), ~(*C))); } Instruction *AddI = nullptr; diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index ff753c20a94a..df4ee9969c02 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2087,6 +2087,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { switch (I.getNumArgOperands()) { case 3: assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode"); + LLVM_FALLTHROUGH; case 2: CopyOp = I.getArgOperand(0); ConvertOp = I.getArgOperand(1); diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 325b64cd8b43..8aa40d1759de 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -57,6 +57,11 @@ static const char *const SanCovTracePCGuardName = "__sanitizer_cov_trace_pc_guard"; static const char *const SanCovTracePCGuardInitName = "__sanitizer_cov_trace_pc_guard_init"; +static const char *const SanCov8bitCountersInitName = + "__sanitizer_cov_8bit_counters_init"; + +static const char *const SanCovGuardsSectionName = "sancov_guards"; +static const char *const SanCovCountersSectionName = "sancov_counters"; static cl::opt<int> ClCoverageLevel( "sanitizer-coverage-level", @@ -64,14 +69,18 @@ static cl::opt<int> ClCoverageLevel( "3: all blocks and critical edges"), cl::Hidden, cl::init(0)); -static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc", - cl::desc("Experimental pc tracing"), - cl::Hidden, cl::init(false)); +static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc", + cl::desc("Experimental pc tracing"), cl::Hidden, + cl::init(false)); static cl::opt<bool> 
ClTracePCGuard("sanitizer-coverage-trace-pc-guard", cl::desc("pc tracing with a guard"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters", + cl::desc("increments 8-bit counter for every edge"), + cl::Hidden, cl::init(false)); + static cl::opt<bool> ClCMPTracing("sanitizer-coverage-trace-compares", cl::desc("Tracing of CMP and similar instructions"), @@ -123,9 +132,10 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { Options.TraceCmp |= ClCMPTracing; Options.TraceDiv |= ClDIVTracing; Options.TraceGep |= ClGEPTracing; - Options.TracePC |= ClExperimentalTracePC; + Options.TracePC |= ClTracePC; Options.TracePCGuard |= ClTracePCGuard; - if (!Options.TracePCGuard && !Options.TracePC) + Options.Inline8bitCounters |= ClInline8bitCounters; + if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters) Options.TracePCGuard = true; // TracePCGuard is default. Options.NoPrune |= !ClPruneBlocks; return Options; @@ -159,11 +169,22 @@ private: void InjectTraceForSwitch(Function &F, ArrayRef<Instruction *> SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); - void CreateFunctionGuardArray(size_t NumGuards, Function &F); + GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements, + Function &F, Type *Ty, + const char *Section); + void CreateFunctionLocalArrays(size_t NumGuards, Function &F); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx); - StringRef getSanCovTracePCGuardSection() const; - StringRef getSanCovTracePCGuardSectionStart() const; - StringRef getSanCovTracePCGuardSectionEnd() const; + void CreateInitCallForSection(Module &M, const char *InitFunctionName, + Type *Ty, const std::string &Section); + + void SetNoSanitizeMetadata(Instruction *I) { + I->setMetadata(I->getModule()->getMDKindID("nosanitize"), + MDNode::get(*C, None)); + } + + std::string getSectionName(const std::string &Section) const; + std::string getSectionStart(const std::string &Section) const; + std::string getSectionEnd(const std::string &Section) const; Function *SanCovTracePCIndir; Function *SanCovTracePC, *SanCovTracePCGuard; Function *SanCovTraceCmpFunction[4]; @@ -171,20 +192,48 @@ private: Function *SanCovTraceGepFunction; Function *SanCovTraceSwitchFunction; InlineAsm *EmptyAsm; - Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy; + Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy, + *Int8Ty, *Int8PtrTy; Module *CurModule; Triple TargetTriple; LLVMContext *C; const DataLayout *DL; GlobalVariable *FunctionGuardArray; // for trace-pc-guard. - bool HasSancovGuardsSection; + GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters. 
SanitizerCoverageOptions Options; }; } // namespace +void SanitizerCoverageModule::CreateInitCallForSection( + Module &M, const char *InitFunctionName, Type *Ty, + const std::string &Section) { + IRBuilder<> IRB(M.getContext()); + Function *CtorFunc; + GlobalVariable *SecStart = + new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr, + getSectionStart(Section)); + SecStart->setVisibility(GlobalValue::HiddenVisibility); + GlobalVariable *SecEnd = + new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, + nullptr, getSectionEnd(Section)); + SecEnd->setVisibility(GlobalValue::HiddenVisibility); + + std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( + M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty}, + {IRB.CreatePointerCast(SecStart, Ty), IRB.CreatePointerCast(SecEnd, Ty)}); + + if (TargetTriple.supportsCOMDAT()) { + // Use comdat to dedup CtorFunc. + CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName)); + appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc); + } else { + appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); + } +} + bool SanitizerCoverageModule::runOnModule(Module &M) { if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) return false; @@ -192,15 +241,18 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { DL = &M.getDataLayout(); CurModule = &M; TargetTriple = Triple(M.getTargetTriple()); - HasSancovGuardsSection = false; + FunctionGuardArray = nullptr; + Function8bitCounterArray = nullptr; IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); IntptrPtrTy = PointerType::getUnqual(IntptrTy); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Int64Ty = IRB.getInt64Ty(); Int32Ty = IRB.getInt32Ty(); + Int8Ty = IRB.getInt8Ty(); SanCovTracePCIndir = checkSanitizerInterfaceFunction( M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy)); @@ -243,34 +295,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { for (auto &F : M) runOnFunction(F); - // Create variable for module (compilation unit) name - if (Options.TracePCGuard) { - if (HasSancovGuardsSection) { - Function *CtorFunc; - GlobalVariable *SecStart = new GlobalVariable( - M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr, - getSanCovTracePCGuardSectionStart()); - SecStart->setVisibility(GlobalValue::HiddenVisibility); - GlobalVariable *SecEnd = new GlobalVariable( - M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr, - getSanCovTracePCGuardSectionEnd()); - SecEnd->setVisibility(GlobalValue::HiddenVisibility); - - std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( - M, SanCovModuleCtorName, SanCovTracePCGuardInitName, - {Int32PtrTy, Int32PtrTy}, - {IRB.CreatePointerCast(SecStart, Int32PtrTy), - IRB.CreatePointerCast(SecEnd, Int32PtrTy)}); - - if (TargetTriple.supportsCOMDAT()) { - // Use comdat to dedup CtorFunc. 
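[For reference: CreateInitCallForSection above encodes a common section-bounds contract, sketched here in C++ with the ELF symbol names that getSectionStart/getSectionEnd further down produce for the guards section; the runtime function is the public sanitizer interface:

    #include <cstdint>
    extern "C" {
    // Linker-synthesized bounds of the __sancov_guards section.
    extern uint32_t __start___sancov_guards[];
    extern uint32_t __stop___sancov_guards[];
    void __sanitizer_cov_trace_pc_guard_init(uint32_t *Start, uint32_t *Stop);
    }
    // Equivalent of the emitted module ctor: register the whole guard range
    // with the runtime once per module (comdat dedups the ctor itself).
    __attribute__((constructor)) static void sancov_module_ctor() {
      __sanitizer_cov_trace_pc_guard_init(__start___sancov_guards,
                                          __stop___sancov_guards);
    }
]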
- CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName)); - appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc); - } else { - appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); - } - } - } + if (FunctionGuardArray) + CreateInitCallForSection(M, SanCovTracePCGuardInitName, Int32PtrTy, + SanCovGuardsSectionName); + if (Function8bitCounterArray) + CreateInitCallForSection(M, SanCov8bitCountersInitName, Int8PtrTy, + SanCovCountersSectionName); + return true; } @@ -393,17 +424,26 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { InjectTraceForGep(F, GepTraceTargets); return true; } -void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards, - Function &F) { - if (!Options.TracePCGuard) return; - HasSancovGuardsSection = true; - ArrayType *ArrayOfInt32Ty = ArrayType::get(Int32Ty, NumGuards); - FunctionGuardArray = new GlobalVariable( - *CurModule, ArrayOfInt32Ty, false, GlobalVariable::PrivateLinkage, - Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_"); + +GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( + size_t NumElements, Function &F, Type *Ty, const char *Section) { + ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); + auto Array = new GlobalVariable( + *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage, + Constant::getNullValue(ArrayTy), "__sancov_gen_"); if (auto Comdat = F.getComdat()) - FunctionGuardArray->setComdat(Comdat); - FunctionGuardArray->setSection(getSanCovTracePCGuardSection()); + Array->setComdat(Comdat); + Array->setSection(getSectionName(Section)); + return Array; +} +void SanitizerCoverageModule::CreateFunctionLocalArrays(size_t NumGuards, + Function &F) { + if (Options.TracePCGuard) + FunctionGuardArray = CreateFunctionLocalArrayInSection( + NumGuards, F, Int32Ty, SanCovGuardsSectionName); + if (Options.Inline8bitCounters) + Function8bitCounterArray = CreateFunctionLocalArrayInSection( + NumGuards, F, Int8Ty, SanCovCountersSectionName); } bool SanitizerCoverageModule::InjectCoverage(Function &F, @@ -413,11 +453,11 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, case SanitizerCoverageOptions::SCK_None: return false; case SanitizerCoverageOptions::SCK_Function: - CreateFunctionGuardArray(1, F); + CreateFunctionLocalArrays(1, F); InjectCoverageAtBlock(F, F.getEntryBlock(), 0); return true; default: { - CreateFunctionGuardArray(AllBlocks.size(), F); + CreateFunctionLocalArrays(AllBlocks.size(), F); for (size_t i = 0, N = AllBlocks.size(); i < N; i++) InjectCoverageAtBlock(F, *AllBlocks[i], i); return true; @@ -436,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( Function &F, ArrayRef<Instruction *> IndirCalls) { if (IndirCalls.empty()) return; - assert(Options.TracePC || Options.TracePCGuard); + assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters); for (auto I : IndirCalls) { IRBuilder<> IRB(I); CallSite CS(I); @@ -564,8 +604,8 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.TracePC) { IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC. IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. 
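[For reference: what the inline-8bit-counters instrumentation emitted a little further down amounts to at the source level; a sketch for the ELF case, with the array size and function name as stand-ins:

    #include <cstdint>
    // One byte per covered block, kept in the __sancov_counters section so
    // the runtime can walk all counters through __start/__stop symbols.
    __attribute__((section("__sancov_counters"), used))
    static uint8_t Counters[8];

    // Plain load/add/store per block; the pass tags the load and store with
    // 'nosanitize' metadata so other sanitizers ignore these accesses.
    static inline void coverBlock(unsigned Idx) {
      Counters[Idx] = static_cast<uint8_t>(Counters[Idx] + 1); // may wrap
    }
]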
- } else {
- assert(Options.TracePCGuard);
+ }
+ if (Options.TracePCGuard) {
 auto GuardPtr = IRB.CreateIntToPtr(
 IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
 ConstantInt::get(IntptrTy, Idx * 4)),
@@ -573,26 +613,39 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
 IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
 IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
 }
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
 }
 
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSection() const {
+std::string
+SanitizerCoverageModule::getSectionName(const std::string &Section) const {
 if (TargetTriple.getObjectFormat() == Triple::COFF)
 return ".SCOV$M";
 if (TargetTriple.isOSBinFormatMachO())
- return "__DATA,__sancov_guards";
- return "__sancov_guards";
+ return "__DATA,__" + Section;
+ return "__" + Section;
 }
 
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionStart() const {
+std::string
+SanitizerCoverageModule::getSectionStart(const std::string &Section) const {
 if (TargetTriple.isOSBinFormatMachO())
- return "\1section$start$__DATA$__sancov_guards";
- return "__start___sancov_guards";
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
 }
 
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionEnd() const {
+std::string
+SanitizerCoverageModule::getSectionEnd(const std::string &Section) const {
 if (TargetTriple.isOSBinFormatMachO())
- return "\1section$end$__DATA$__sancov_guards";
- return "__stop___sancov_guards";
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
 }
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 3953198fe605..9a7882211bac 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1823,6 +1823,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
 // An IV counter must preserve its type.
 if (IncI->getNumOperands() == 2)
 break;
+ LLVM_FALLTHROUGH;
 default:
 return nullptr;
 }
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 930696b036c0..7d8da9b453f9 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -83,6 +84,149 @@ static bool handleSwitchExpect(SwitchInst &SI) {
 return true;
 }
 
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If the operand of the phi has a constant value and it 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
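[For reference: a small example (hypothetical source) of the pattern handlePhiDef below recognizes. The expect value flows through a phi, and an incoming constant that contradicts the expectation marks its edge as unlikely:

    extern int compute();
    int example(bool cond) {
      int r;
      if (cond)
        r = compute(); // unknown value: nothing to infer on this edge
      else
        r = 0;         // constant 0 contradicts the expected value 1
      // r is a phi of {compute(), 0}; the else edge gets "unlikely" weights.
      if (__builtin_expect(r, 1))
        return 1;
      return 0;
    }
]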
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = cast<ConstantInt>(Expect->getArgOperand(1));
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward over the list of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find phi operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case when IsUnlikely is false -- we cannot infer
+ // anything useful when the operand value matches the expected phi
+ // output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from the outgoing edge of BI that leads to the Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
 // Handle both BranchInst and SelectInst.
 template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
@@ -99,25 +243,31 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
 
 ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
 CmpInst::Predicate Predicate;
- uint64_t ValueComparedTo = 0;
+ ConstantInt *CmpConstOperand = nullptr;
 if (!CmpI) {
 CI = dyn_cast<CallInst>(BSI.getCondition());
 Predicate = CmpInst::ICMP_NE;
- ValueComparedTo = 0;
 } else {
 Predicate = CmpI->getPredicate();
 if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
 return false;
-
- ConstantInt *CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
 if (!CmpConstOperand)
 return false;
- ValueComparedTo = CmpConstOperand->getZExtValue();
 CI = dyn_cast<CallInst>(CmpI->getOperand(0));
 }
 
 if (!CI)
 return false;
 
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
 Function *Fn = CI->getCalledFunction();
 if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
 return false;
@@ -181,6 +331,10 @@ static bool lowerExpectIntrinsic(Function &F) {
 Function *Fn = CI->getCalledFunction();
 
 if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ // Before erasing the llvm.expect, walk backward to find the
+ // phi that defines llvm.expect's first arg, and
+ // infer branch probability:
+ handlePhiDef(CI);
 Value *Exp = CI->getArgOperand(0);
 CI->replaceAllUsesWith(Exp);
 CI->eraseFromParent();
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 77b2bd84f9b6..350b50ffcdd4 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Rewrite an existing set of gc.statepoints such that they make potential
-// relocations performed by the garbage collector explicit in the IR.
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
 //
 //===----------------------------------------------------------------------===//
 
@@ -2094,9 +2094,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
 // live in the IR. We'll remove all of these when done.
 SmallVector<CallInst *, 64> Holders;
 
- // Insert a dummy call with all of the arguments to the vm_state we'll need
- // for the actual safepoint insertion. 
This ensures reference arguments in - // the deopt argument list are considered live through the safepoint (and + // Insert a dummy call with all of the deopt operands we'll need for the + // actual safepoint insertion as arguments. This ensures reference operands + // in the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.) for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 6e113bccff94..fb1b5813fd79 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -3698,7 +3698,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { int Idx = 0, Size = Offsets.Splits.size(); for (;;) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); - auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); // Either lookup a split load or create one. LoadInst *PLoad; @@ -3709,7 +3710,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), - PartPtrTy, LoadBasePtr->getName() + "."), + LoadPartPtrTy, LoadBasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); } @@ -3719,7 +3720,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), - PartPtrTy, StoreBasePtr->getName() + "."), + StorePartPtrTy, StoreBasePtr->getName() + "."), getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); // Now build a new slice for the alloca. diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 1ec3d0d49637..1c1a75c111e9 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -37,10 +37,10 @@ using namespace llvm; /// See comments in Cloning.h. -BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, - ValueToValueMapTy &VMap, +BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, - ClonedCodeInfo *CodeInfo) { + ClonedCodeInfo *CodeInfo, + DebugInfoFinder *DIFinder) { DenseMap<const MDNode *, MDNode *> Cache; BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); @@ -50,10 +50,11 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, // Loop over all instructions, and copy them over. 
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { + + if (DIFinder && F->getParent() && II->getDebugLoc()) + DIFinder->processLocation(*F->getParent(), II->getDebugLoc().get()); + Instruction *NewInst = II->clone(); - if (F && F->getSubprogram()) - DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(), - F->getSubprogram(), Cache); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); @@ -122,31 +123,38 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), OldAttrs.getRetAttributes(), NewArgAttrs)); + bool MustCloneSP = + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + DISubprogram *SP = OldFunc->getSubprogram(); + if (SP) { + assert(!MustCloneSP || ModuleLevelChanges); + // Add mappings for some DebugInfo nodes that we don't want duplicated + // even if they're distinct. + auto &MD = VMap.MD(); + MD[SP->getUnit()].reset(SP->getUnit()); + MD[SP->getType()].reset(SP->getType()); + MD[SP->getFile()].reset(SP->getFile()); + // If we're not cloning into the same module, no need to clone the + // subprogram + if (!MustCloneSP) + MD[SP].reset(SP); + } + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; OldFunc->getAllMetadata(MDs); for (auto MD : MDs) { - MDNode *NewMD; - bool MustCloneSP = - (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() && - OldFunc->getParent() == NewFunc->getParent()); - if (MustCloneSP) { - auto *SP = cast<DISubprogram>(MD.second); - NewMD = DISubprogram::getDistinct( - NewFunc->getContext(), SP->getScope(), SP->getName(), - SP->getLinkageName(), SP->getFile(), SP->getLine(), SP->getType(), - SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), - SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), - SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), - SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(), - SP->getVariables(), SP->getThrownTypes()); - } else - NewMD = - MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); - NewFunc->addMetadata(MD.first, *NewMD); + NewFunc->addMetadata( + MD.first, + *MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); } + // When we remap instructions, we want to avoid duplicating inlined + // DISubprograms, so record all subprograms we find as we duplicate + // instructions and then freeze them in the MD map. + DebugInfoFinder DIFinder; + // Loop over all of the basic blocks in the function, cloning them as // appropriate. Note that we save BE this way in order to handle cloning of // recursive functions into themselves. @@ -156,7 +164,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, const BasicBlock &BB = *BI; // Create a new basic block and copy instructions into it! - BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo); + BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, + SP ? &DIFinder : nullptr); // Add basic block mapping. VMap[&BB] = CBB; @@ -178,6 +187,12 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, Returns.push_back(RI); } + for (DISubprogram *ISP : DIFinder.subprograms()) { + if (ISP != SP) { + VMap.MD()[ISP].reset(ISP); + } + } + // Loop over all of the instructions in the function, fixing up operand // references as we go. 
This uses VMap to do all the hard work. for (Function::iterator BB = @@ -226,7 +241,7 @@ Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap, } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "", + CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "", CodeInfo); return NewF; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8b9a64c220cc..799eef21dc4e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4779,6 +4779,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { scalarizeInstruction(&I, true); break; } + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -7396,6 +7397,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // likely. return Cost / getReciprocalPredBlockProb(); } + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index e6f78e6b94a3..d1349535f298 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -259,6 +259,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, if (hasVectorInstrinsicScalarOpd(ID, 1)) { return (CI->getArgOperand(1) == Scalar); } + LLVM_FALLTHROUGH; } default: return false; @@ -4749,56 +4750,18 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, return nullptr; } -namespace { -/// Tracks instructons and its children. -class WeakTrackingVHWithLevel final : public CallbackVH { - /// Operand index of the instruction currently beeing analized. - unsigned Level = 0; - /// Is this the instruction that should be vectorized, or are we now - /// processing children (i.e. operands of this instruction) for potential - /// vectorization? - bool IsInitial = true; - -public: - explicit WeakTrackingVHWithLevel() = default; - WeakTrackingVHWithLevel(Value *V) : CallbackVH(V){}; - /// Restart children analysis each time it is repaced by the new instruction. - void allUsesReplacedWith(Value *New) override { - setValPtr(New); - Level = 0; - IsInitial = true; - } - /// Check if the instruction was not deleted during vectorization. - bool isValid() const { return !getValPtr(); } - /// Is the istruction itself must be vectorized? - bool isInitial() const { return IsInitial; } - /// Try to vectorize children. - void clearInitial() { IsInitial = false; } - /// Are all children processed already? - bool isFinal() const { - assert(getValPtr() && - (isa<Instruction>(getValPtr()) && - cast<Instruction>(getValPtr())->getNumOperands() >= Level)); - return getValPtr() && - cast<Instruction>(getValPtr())->getNumOperands() == Level; - } - /// Get next child operation. - Value *nextOperand() { - assert(getValPtr() && isa<Instruction>(getValPtr()) && - cast<Instruction>(getValPtr())->getNumOperands() > Level); - return cast<Instruction>(getValPtr())->getOperand(Level++); - } - virtual ~WeakTrackingVHWithLevel() = default; -}; -} // namespace - -/// \brief Attempt to reduce a horizontal reduction. -/// If it is legal to match a horizontal reduction feeding -/// the phi node P with reduction operators Root in a basic block BB, then check -/// if it can be done. -/// \returns true if a horizontal reduction was matched and reduced. 
-/// \returns false if a horizontal reduction was not matched.
-static bool canBeVectorized(
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If a horizontal reduction is not found
+/// and the root instruction is a binary operation, vectorization of the operands
+/// is attempted.
+/// \returns true if a horizontal reduction was matched and reduced or operands
+/// of one of the binary instructions were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible)
+/// or no vectorization of any binary operation feeding \a Root instruction was
+/// performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
 TargetTransformInfo *TTI,
 const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
@@ -4810,56 +4773,62 @@ static bool canBeVectorized(
 if (Root->getParent() != BB)
 return false;
- SmallVector<WeakTrackingVHWithLevel, 8> Stack(1, Root);
+ // Start the analysis from the Root instruction. If a horizontal reduction is
+ // found, try to vectorize it. If it is not a horizontal reduction or
+ // vectorization is not possible or not effective, and the currently analyzed
+ // instruction is a binary operation, try to vectorize the operands, using
+ // pre-order DFS traversal order. If the operands were not vectorized, repeat
+ // the same procedure considering each operand as a possible root of the
+ // horizontal reduction.
+ // Interrupt the process if the Root instruction itself was vectorized or all
+ // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
+ SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
 SmallSet<Value *, 8> VisitedInstrs;
 bool Res = false;
 while (!Stack.empty()) {
- Value *V = Stack.back();
- if (!V) {
- Stack.pop_back();
+ Value *V;
+ unsigned Level;
+ std::tie(V, Level) = Stack.pop_back_val();
+ if (!V)
 continue;
- }
 auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst)) {
- Stack.pop_back();
+ if (!Inst || isa<PHINode>(Inst))
 continue;
- }
- if (Stack.back().isInitial()) {
- Stack.back().clearInitial();
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, BI)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- P = nullptr;
- continue;
- }
- }
- if (P) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
- if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
- if (!Inst) {
- P = nullptr;
- continue;
- }
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
 }
 }
- P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- Res = true;
- continue;
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node. 
+ P = nullptr; + continue; + } } } - if (Stack.back().isFinal()) { - Stack.pop_back(); + // Set P to nullptr to avoid re-analysis of phi node in + // matchAssociativeReduction function unless this is the root node. + P = nullptr; + if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) { + Res = true; continue; } - if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand())) - if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second && - Stack.size() < RecursionMaxDepth) - Stack.push_back(NextV); + // Try to vectorize operands. + if (++Level < RecursionMaxDepth) + for (auto *Op : Inst->operand_values()) + Stack.emplace_back(Op, Level); } return Res; } @@ -4876,10 +4845,10 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, if (!isa<BinaryOperator>(I)) P = nullptr; // Try to match and vectorize a horizontal reduction. - return canBeVectorized(P, I, BB, R, TTI, - [this](BinaryOperator *BI, BoUpSLP &R) -> bool { - return tryToVectorize(BI, R); - }); + return tryToVectorizeHorReductionOrInstOperands( + P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool { + return tryToVectorize(BI, R); + }); } bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { |
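[For reference: the (value, level) stack introduced in the last hunk is a generic bounded pre-order walk; distilled into a standalone template, with all names illustrative:

    #include <utility>
    #include <vector>

    // Pre-order DFS with an explicit (node, depth) stack and a recursion cap.
    // 'visit' returns true when a node was fully handled (e.g. vectorized),
    // in which case its operands are not explored further.
    template <typename Node, typename VisitFn, typename ChildrenFn>
    void boundedPreorder(Node Root, unsigned MaxDepth, VisitFn visit,
                         ChildrenFn children) {
      std::vector<std::pair<Node, unsigned>> Stack;
      Stack.emplace_back(Root, 0u);
      while (!Stack.empty()) {
        std::pair<Node, unsigned> Top = Stack.back();
        Stack.pop_back();
        if (visit(Top.first))
          continue;
        if (++Top.second < MaxDepth)
          for (Node C : children(Top.first))
            Stack.emplace_back(C, Top.second);
      }
    }
]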