Diffstat (limited to 'lib')
170 files changed, 3595 insertions, 2077 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index dd2db1e5b27b..55df66714178 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -133,9 +133,9 @@ ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) {  }  ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) { -  // We may have two calls +  // We may have two calls.    if (auto CS = ImmutableCallSite(I)) { -    // Check if the two calls modify the same memory +    // Check if the two calls modify the same memory.      return getModRefInfo(CS, Call);    } else if (I->isFenceLike()) {      // If this is a fence, just return ModRef. @@ -179,6 +179,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,    if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) {      bool DoesAlias = false; +    bool IsMustAlias = true;      ModRefInfo AllArgsMask = ModRefInfo::NoModRef;      if (doesAccessArgPointees(MRB)) {        for (auto AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { @@ -193,6 +194,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,            DoesAlias = true;            AllArgsMask = unionModRef(AllArgsMask, ArgMask);          } +        // Conservatively clear IsMustAlias unless only MustAlias is found. +        IsMustAlias &= (ArgAlias == MustAlias);        }      }      // Return NoModRef if no alias found with any argument. @@ -200,6 +203,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,        return ModRefInfo::NoModRef;      // Logical & between other AA analyses and argument analysis.      Result = intersectModRef(Result, AllArgsMask); +    // If only MustAlias found above, set Must bit. +    Result = IsMustAlias ? setMust(Result) : clearMust(Result);    }    // If Loc is a constant memory location, the call definitely could not @@ -251,6 +256,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,    if (onlyAccessesArgPointees(CS2B)) {      ModRefInfo R = ModRefInfo::NoModRef;      if (doesAccessArgPointees(CS2B)) { +      bool IsMustAlias = true;        for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {          const Value *Arg = *I;          if (!Arg->getType()->isPointerTy()) @@ -274,10 +280,19 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,          ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc);          ArgMask = intersectModRef(ArgMask, ModRefCS1); +        // Conservatively clear IsMustAlias unless only MustAlias is found. +        IsMustAlias &= isMustSet(ModRefCS1); +          R = intersectModRef(unionModRef(R, ArgMask), Result); -        if (R == Result) +        if (R == Result) { +          // On early exit, not all args were checked, cannot set Must. +          if (I + 1 != E) +            IsMustAlias = false;            break; +        }        } +      // If Alias found and only MustAlias found above, set Must bit. +      R = IsMustAlias ? 
setMust(R) : clearMust(R);      }      return R;    } @@ -287,6 +302,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,    if (onlyAccessesArgPointees(CS1B)) {      ModRefInfo R = ModRefInfo::NoModRef;      if (doesAccessArgPointees(CS1B)) { +      bool IsMustAlias = true;        for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {          const Value *Arg = *I;          if (!Arg->getType()->isPointerTy()) @@ -303,9 +319,18 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,              (isRefSet(ArgModRefCS1) && isModSet(ModRefCS2)))            R = intersectModRef(unionModRef(R, ArgModRefCS1), Result); -        if (R == Result) +        // Conservatively clear IsMustAlias unless only MustAlias is found. +        IsMustAlias &= isMustSet(ModRefCS2); + +        if (R == Result) { +          // On early exit, not all args were checked, cannot set Must. +          if (I + 1 != E) +            IsMustAlias = false;            break; +        }        } +      // If Alias found and only MustAlias found above, set Must bit. +      R = IsMustAlias ? setMust(R) : clearMust(R);      }      return R;    } @@ -353,9 +378,13 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,    // If the load address doesn't alias the given address, it doesn't read    // or write the specified memory. -  if (Loc.Ptr && !alias(MemoryLocation::get(L), Loc)) -    return ModRefInfo::NoModRef; - +  if (Loc.Ptr) { +    AliasResult AR = alias(MemoryLocation::get(L), Loc); +    if (AR == NoAlias) +      return ModRefInfo::NoModRef; +    if (AR == MustAlias) +      return ModRefInfo::MustRef; +  }    // Otherwise, a load just reads.    return ModRefInfo::Ref;  } @@ -367,15 +396,20 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S,      return ModRefInfo::ModRef;    if (Loc.Ptr) { +    AliasResult AR = alias(MemoryLocation::get(S), Loc);      // If the store address cannot alias the pointer in question, then the      // specified memory cannot be modified by the store. -    if (!alias(MemoryLocation::get(S), Loc)) +    if (AR == NoAlias)        return ModRefInfo::NoModRef;      // If the pointer is a pointer to constant memory, then it could not have      // been modified by this store.      if (pointsToConstantMemory(Loc))        return ModRefInfo::NoModRef; + +    // If the store address aliases the pointer as must alias, set Must. +    if (AR == MustAlias) +      return ModRefInfo::MustMod;    }    // Otherwise, a store just writes. @@ -393,15 +427,20 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Lo  ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,                                      const MemoryLocation &Loc) {    if (Loc.Ptr) { +    AliasResult AR = alias(MemoryLocation::get(V), Loc);      // If the va_arg address cannot alias the pointer in question, then the      // specified memory cannot be accessed by the va_arg. -    if (!alias(MemoryLocation::get(V), Loc)) +    if (AR == NoAlias)        return ModRefInfo::NoModRef;      // If the pointer is a pointer to constant memory, then it could not have      // been modified by this va_arg.      if (pointsToConstantMemory(Loc))        return ModRefInfo::NoModRef; + +    // If the va_arg aliases the pointer as must alias, set Must. +    if (AR == MustAlias) +      return ModRefInfo::MustModRef;    }    // Otherwise, a va_arg reads and writes. 
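A rough sketch of what the Must bit added above lets an AAResults client express (illustrative only: getModRefInfo, isModSet and isMustSet are the interfaces touched by this patch, while the helper below and its name are invented for the example):

```cpp
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// True when the store is known to Mod the location *and* its address
// must-aliases it, i.e. both the Mod and the new Must bits are set.
static bool storeMustWriteLoc(AAResults &AA, const StoreInst *SI,
                              const MemoryLocation &Loc) {
  ModRefInfo MR = AA.getModRefInfo(SI, Loc);
  return isModSet(MR) && isMustSet(MR);
}
```

Before this change the store path above could only answer NoModRef or Mod, so a client had to issue a separate alias query to recover the must-alias fact.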
@@ -440,9 +479,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,    if (isStrongerThanMonotonic(CX->getSuccessOrdering()))      return ModRefInfo::ModRef; -  // If the cmpxchg address does not alias the location, it does not access it. -  if (Loc.Ptr && !alias(MemoryLocation::get(CX), Loc)) -    return ModRefInfo::NoModRef; +  if (Loc.Ptr) { +    AliasResult AR = alias(MemoryLocation::get(CX), Loc); +    // If the cmpxchg address does not alias the location, it does not access +    // it. +    if (AR == NoAlias) +      return ModRefInfo::NoModRef; + +    // If the cmpxchg address aliases the pointer as must alias, set Must. +    if (AR == MustAlias) +      return ModRefInfo::MustModRef; +  }    return ModRefInfo::ModRef;  } @@ -453,9 +500,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,    if (isStrongerThanMonotonic(RMW->getOrdering()))      return ModRefInfo::ModRef; -  // If the atomicrmw address does not alias the location, it does not access it. -  if (Loc.Ptr && !alias(MemoryLocation::get(RMW), Loc)) -    return ModRefInfo::NoModRef; +  if (Loc.Ptr) { +    AliasResult AR = alias(MemoryLocation::get(RMW), Loc); +    // If the atomicrmw address does not alias the location, it does not access +    // it. +    if (AR == NoAlias) +      return ModRefInfo::NoModRef; + +    // If the atomicrmw address aliases the pointer as must alias, set Must. +    if (AR == MustAlias) +      return ModRefInfo::MustModRef; +  }    return ModRefInfo::ModRef;  } @@ -493,6 +548,8 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,    unsigned ArgNo = 0;    ModRefInfo R = ModRefInfo::NoModRef; +  bool MustAlias = true; +  // Set flag only if no May found and all operands processed.    for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();         CI != CE; ++CI, ++ArgNo) {      // Only look at the no-capture or byval pointer arguments.  If this @@ -503,11 +560,14 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,           ArgNo < CS.getNumArgOperands() && !CS.isByValArgument(ArgNo)))        continue; +    AliasResult AR = alias(MemoryLocation(*CI), MemoryLocation(Object));      // If this is a no-capture pointer argument, see if we can tell that it      // is impossible to alias the pointer we're checking.  If not, we have to      // assume that the call could touch the pointer, even though it doesn't      // escape. -    if (isNoAlias(MemoryLocation(*CI), MemoryLocation(Object))) +    if (AR != MustAlias) +      MustAlias = false; +    if (AR == NoAlias)        continue;      if (CS.doesNotAccessMemory(ArgNo))        continue; @@ -515,9 +575,10 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,        R = ModRefInfo::Ref;        continue;      } +    // Not returning MustModRef since we have not seen all the arguments.      return ModRefInfo::ModRef;    } -  return R; +  return MustAlias ? 
setMust(R) : clearMust(R);  }  /// canBasicBlockModify - Return true if it is possible for execution of the diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp index 423acf739f58..f737cecc43d1 100644 --- a/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -31,9 +31,13 @@ static cl::opt<bool> PrintPartialAlias("print-partial-aliases", cl::ReallyHidden  static cl::opt<bool> PrintMustAlias("print-must-aliases", cl::ReallyHidden);  static cl::opt<bool> PrintNoModRef("print-no-modref", cl::ReallyHidden); -static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);  static cl::opt<bool> PrintRef("print-ref", cl::ReallyHidden); +static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);  static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden); +static cl::opt<bool> PrintMust("print-must", cl::ReallyHidden); +static cl::opt<bool> PrintMustRef("print-mustref", cl::ReallyHidden); +static cl::opt<bool> PrintMustMod("print-mustmod", cl::ReallyHidden); +static cl::opt<bool> PrintMustModRef("print-mustmodref", cl::ReallyHidden);  static cl::opt<bool> EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden); @@ -262,6 +266,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {                             F.getParent());          ++ModRefCount;          break; +      case ModRefInfo::Must: +        PrintModRefResults("Must", PrintMust, I, Pointer, F.getParent()); +        ++MustCount; +        break; +      case ModRefInfo::MustMod: +        PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, I, Pointer, +                           F.getParent()); +        ++MustModCount; +        break; +      case ModRefInfo::MustRef: +        PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, I, Pointer, +                           F.getParent()); +        ++MustRefCount; +        break; +      case ModRefInfo::MustModRef: +        PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, I, +                           Pointer, F.getParent()); +        ++MustModRefCount; +        break;        }      }    } @@ -288,6 +311,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {          PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent());          ++ModRefCount;          break; +      case ModRefInfo::Must: +        PrintModRefResults("Must", PrintMust, *C, *D, F.getParent()); +        ++MustCount; +        break; +      case ModRefInfo::MustMod: +        PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, *C, *D, +                           F.getParent()); +        ++MustModCount; +        break; +      case ModRefInfo::MustRef: +        PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, *C, *D, +                           F.getParent()); +        ++MustRefCount; +        break; +      case ModRefInfo::MustModRef: +        PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, *C, *D, +                           F.getParent()); +        ++MustModRefCount; +        break;        }      }    } @@ -325,7 +367,8 @@ AAEvaluator::~AAEvaluator() {    }    // Display the summary for mod/ref analysis -  int64_t ModRefSum = NoModRefCount + ModCount + RefCount + ModRefCount; +  int64_t ModRefSum = NoModRefCount + RefCount + ModCount + ModRefCount + +                      MustCount + MustRefCount + MustModCount + MustModRefCount;    if (ModRefSum == 0) {      errs() << "  Alias Analysis Mod/Ref Evaluator Summary: no "                "mod/ref!\n"; @@ -339,10 +382,22 @@ 
AAEvaluator::~AAEvaluator() {      PrintPercent(RefCount, ModRefSum);      errs() << "  " << ModRefCount << " mod & ref responses ";      PrintPercent(ModRefCount, ModRefSum); +    errs() << "  " << MustCount << " must responses "; +    PrintPercent(MustCount, ModRefSum); +    errs() << "  " << MustModCount << " must mod responses "; +    PrintPercent(MustModCount, ModRefSum); +    errs() << "  " << MustRefCount << " must ref responses "; +    PrintPercent(MustRefCount, ModRefSum); +    errs() << "  " << MustModRefCount << " must mod & ref responses "; +    PrintPercent(MustModRefCount, ModRefSum);      errs() << "  Alias Analysis Evaluator Mod/Ref Summary: "             << NoModRefCount * 100 / ModRefSum << "%/"             << ModCount * 100 / ModRefSum << "%/" << RefCount * 100 / ModRefSum -           << "%/" << ModRefCount * 100 / ModRefSum << "%\n"; +           << "%/" << ModRefCount * 100 / ModRefSum << "%/" +           << MustCount * 100 / ModRefSum << "%/" +           << MustRefCount * 100 / ModRefSum << "%/" +           << MustModCount * 100 / ModRefSum << "%/" +           << MustModRefCount * 100 / ModRefSum << "%\n";    }  } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 81b9f842249e..537813b6b752 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -781,6 +781,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,      // Optimistically assume that call doesn't touch Object and check this      // assumption in the following loop.      ModRefInfo Result = ModRefInfo::NoModRef; +    bool IsMustAlias = true;      unsigned OperandNo = 0;      for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end(); @@ -802,7 +803,8 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,        // is impossible to alias the pointer we're checking.        AliasResult AR =            getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object)); - +      if (AR != MustAlias) +        IsMustAlias = false;        // Operand doesnt alias 'Object', continue looking for other aliases        if (AR == NoAlias)          continue; @@ -818,13 +820,20 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,          continue;        }        // This operand aliases 'Object' and call reads and writes into it. +      // Setting ModRef will not yield an early return below, MustAlias is not +      // used further.        Result = ModRefInfo::ModRef;        break;      } +    // No operand aliases, reset Must bit. Add below if at least one aliases +    // and all aliases found are MustAlias. +    if (isNoModRef(Result)) +      IsMustAlias = false; +      // Early return if we improved mod ref information      if (!isModAndRefSet(Result)) -      return Result; +      return IsMustAlias ? setMust(Result) : clearMust(Result);    }    // If the CallSite is to malloc or calloc, we can assume that it doesn't diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index a85af6c9c93f..fb261755e5d1 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -82,7 +82,7 @@ PreservedAnalyses CFGOnlyViewerPass::run(Function &F,    return PreservedAnalyses::all();  } -static void writeCFGToDotFile(Function &F) { +static void writeCFGToDotFile(Function &F, bool CFGOnly = false) {    std::string Filename = ("cfg." 
+ F.getName() + ".dot").str();    errs() << "Writing '" << Filename << "'..."; @@ -90,7 +90,7 @@ static void writeCFGToDotFile(Function &F) {    raw_fd_ostream File(Filename, EC, sys::fs::F_Text);    if (!EC) -    WriteGraph(File, (const Function*)&F); +    WriteGraph(File, (const Function*)&F, CFGOnly);    else      errs() << "  error opening file for writing!";    errs() << "\n"; @@ -134,7 +134,7 @@ namespace {      }      bool runOnFunction(Function &F) override { -      writeCFGToDotFile(F); +      writeCFGToDotFile(F, /*CFGOnly=*/true);        return false;      }      void print(raw_ostream &OS, const Module* = nullptr) const override {} @@ -152,7 +152,7 @@ INITIALIZE_PASS(CFGOnlyPrinterLegacyPass, "dot-cfg-only",  PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,                                            FunctionAnalysisManager &AM) { -  writeCFGToDotFile(F); +  writeCFGToDotFile(F, /*CFGOnly=*/true);    return PreservedAnalyses::all();  } diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp index 23109c67e5c3..daee93267f56 100644 --- a/lib/Analysis/GlobalsModRef.cpp +++ b/lib/Analysis/GlobalsModRef.cpp @@ -85,12 +85,17 @@ class GlobalsAAResult::FunctionInfo {    /// The bit that flags that this function may read any global. This is    /// chosen to mix together with ModRefInfo bits.    /// FIXME: This assumes ModRefInfo lattice will remain 4 bits! +  /// It overlaps with ModRefInfo::Must bit! +  /// FunctionInfo.getModRefInfo() masks out everything except ModRef so +  /// this remains correct, but the Must info is lost.    enum { MayReadAnyGlobal = 4 };    /// Checks to document the invariants of the bit packing here. -  static_assert((MayReadAnyGlobal & static_cast<int>(ModRefInfo::ModRef)) == 0, +  static_assert((MayReadAnyGlobal & static_cast<int>(ModRefInfo::MustModRef)) == +                    0,                  "ModRef and the MayReadAnyGlobal flag bits overlap."); -  static_assert(((MayReadAnyGlobal | static_cast<int>(ModRefInfo::ModRef)) >> +  static_assert(((MayReadAnyGlobal | +                  static_cast<int>(ModRefInfo::MustModRef)) >>                   AlignedMapPointerTraits::NumLowBitsAvailable) == 0,                  "Insufficient low bits to store our flag and ModRef info."); @@ -125,14 +130,22 @@ public:      return *this;    } +  /// This method clears MayReadAnyGlobal bit added by GlobalsAAResult to return +  /// the corresponding ModRefInfo. It must align in functionality with +  /// clearMust(). +  ModRefInfo globalClearMayReadAnyGlobal(int I) const { +    return ModRefInfo((I & static_cast<int>(ModRefInfo::ModRef)) | +                      static_cast<int>(ModRefInfo::NoModRef)); +  } +    /// Returns the \c ModRefInfo info for this function.    ModRefInfo getModRefInfo() const { -    return ModRefInfo(Info.getInt() & static_cast<int>(ModRefInfo::ModRef)); +    return globalClearMayReadAnyGlobal(Info.getInt());    }    /// Adds new \c ModRefInfo for this function to its state.    
void addModRefInfo(ModRefInfo NewMRI) { -    Info.setInt(Info.getInt() | static_cast<int>(NewMRI)); +    Info.setInt(Info.getInt() | static_cast<int>(setMust(NewMRI)));    }    /// Returns whether this function may read any global variable, and we don't diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index fba96c8976a6..b0cb29203a5a 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -249,8 +249,6 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {    bool visitCastInst(CastInst &I);    bool visitUnaryInstruction(UnaryInstruction &I);    bool visitCmpInst(CmpInst &I); -  bool visitAnd(BinaryOperator &I); -  bool visitOr(BinaryOperator &I);    bool visitSub(BinaryOperator &I);    bool visitBinaryOperator(BinaryOperator &I);    bool visitLoad(LoadInst &I); @@ -363,6 +361,7 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,  void CallAnalyzer::disableLoadElimination() {    if (EnableLoadElimination) {      Cost += LoadEliminationCost; +    LoadEliminationCost = 0;      EnableLoadElimination = false;    }  } @@ -700,6 +699,22 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {    // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.    disableSROA(I.getOperand(0)); +  // If this is a floating-point cast, and the target says this operation +  // is expensive, this may eventually become a library call. Treat the cost +  // as such. +  switch (I.getOpcode()) { +  case Instruction::FPTrunc: +  case Instruction::FPExt: +  case Instruction::UIToFP: +  case Instruction::SIToFP: +  case Instruction::FPToUI: +  case Instruction::FPToSI: +    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) +      Cost += InlineConstants::CallPenalty; +  default: +    break; +  } +    return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);  } @@ -1004,34 +1019,6 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {    return false;  } -bool CallAnalyzer::visitOr(BinaryOperator &I) { -  // This is necessary because the generic simplify instruction only works if -  // both operands are constants. -  for (unsigned i = 0; i < 2; ++i) { -    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>( -            SimplifiedValues.lookup(I.getOperand(i)))) -      if (C->isAllOnesValue()) { -        SimplifiedValues[&I] = C; -        return true; -      } -  } -  return Base::visitOr(I); -} - -bool CallAnalyzer::visitAnd(BinaryOperator &I) { -  // This is necessary because the generic simplify instruction only works if -  // both operands are constants. -  for (unsigned i = 0; i < 2; ++i) { -    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>( -            SimplifiedValues.lookup(I.getOperand(i)))) -      if (C->isZero()) { -        SimplifiedValues[&I] = C; -        return true; -      } -  } -  return Base::visitAnd(I); -} -  bool CallAnalyzer::visitSub(BinaryOperator &I) {    // Try to handle a special case: we can fold computing the difference of two    // constant-related pointers. 
@@ -1061,23 +1048,38 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {  bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); -  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) { -    Value *SimpleV = nullptr; -    if (auto FI = dyn_cast<FPMathOperator>(&I)) -      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1], -                                FI->getFastMathFlags(), DL); -    else -      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL); -    return dyn_cast_or_null<Constant>(SimpleV); -  }; +  Constant *CLHS = dyn_cast<Constant>(LHS); +  if (!CLHS) +    CLHS = SimplifiedValues.lookup(LHS); +  Constant *CRHS = dyn_cast<Constant>(RHS); +  if (!CRHS) +    CRHS = SimplifiedValues.lookup(RHS); + +  Value *SimpleV = nullptr; +  if (auto FI = dyn_cast<FPMathOperator>(&I)) +    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS, +                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); +  else +    SimpleV = +        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); -  if (simplifyInstruction(I, Evaluate)) +  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) +    SimplifiedValues[&I] = C; + +  if (SimpleV)      return true;    // Disable any SROA on arguments to arbitrary, unsimplified binary operators.    disableSROA(LHS);    disableSROA(RHS); +  // If the instruction is floating point, and the target says this operation +  // is expensive, this may eventually become a library call. Treat the cost +  // as such. +  if (I.getType()->isFloatingPointTy() && +      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) +    Cost += InlineConstants::CallPenalty; +    return false;  } @@ -1097,7 +1099,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {    // by any stores or calls, this load is likely to be redundant and can be    // eliminated.    if (EnableLoadElimination && -      !LoadAddrSet.insert(I.getPointerOperand()).second) { +      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {      LoadEliminationCost += InlineConstants::InstrCost;      return true;    } @@ -1547,17 +1549,6 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,      if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())        ++NumVectorInstructions; -    // If the instruction is floating point, and the target says this operation -    // is expensive or the function has the "use-soft-float" attribute, this may -    // eventually become a library call. Treat the cost as such. -    if (I->getType()->isFloatingPointTy()) { -      // If the function has the "use-soft-float" attribute, mark it as -      // expensive. -      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || -          (F.getFnAttribute("use-soft-float").getValueAsString() == "true")) -        Cost += InlineConstants::CallPenalty; -    } -      // If the instruction simplified to a constant, there is no cost to this      // instruction. Visit the instructions using our InstVisitor to account for      // all of the per-instruction logic. 
The visit tree returns true if we diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index ed8e5e8cc489..e141d6c58b65 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -1107,77 +1107,6 @@ static unsigned getAddressSpaceOperand(Value *I) {    return -1;  } -// TODO:This API can be improved by using the permutation of given width as the -// accesses are entered into the map. -bool llvm::sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL, -                           ScalarEvolution &SE, -                           SmallVectorImpl<Value *> &Sorted, -                           SmallVectorImpl<unsigned> *Mask) { -  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs; -  OffValPairs.reserve(VL.size()); -  Sorted.reserve(VL.size()); - -  // Walk over the pointers, and map each of them to an offset relative to -  // first pointer in the array. -  Value *Ptr0 = getPointerOperand(VL[0]); -  const SCEV *Scev0 = SE.getSCEV(Ptr0); -  Value *Obj0 = GetUnderlyingObject(Ptr0, DL); -  PointerType *PtrTy = dyn_cast<PointerType>(Ptr0->getType()); -  uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); - -  for (auto *Val : VL) { -    // The only kind of access we care about here is load. -    if (!isa<LoadInst>(Val)) -      return false; - -    Value *Ptr = getPointerOperand(Val); -    assert(Ptr && "Expected value to have a pointer operand."); -    // If a pointer refers to a different underlying object, bail - the -    // pointers are by definition incomparable. -    Value *CurrObj = GetUnderlyingObject(Ptr, DL); -    if (CurrObj != Obj0) -      return false; - -    const SCEVConstant *Diff = -        dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0)); -    // The pointers may not have a constant offset from each other, or SCEV -    // may just not be smart enough to figure out they do. Regardless, -    // there's nothing we can do. -    if (!Diff || static_cast<unsigned>(Diff->getAPInt().abs().getSExtValue()) > -                     (VL.size() - 1) * Size) -      return false; - -    OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val); -  } -  SmallVector<unsigned, 4> UseOrder(VL.size()); -  for (unsigned i = 0; i < VL.size(); i++) { -    UseOrder[i] = i; -  } - -  // Sort the memory accesses and keep the order of their uses in UseOrder. -  std::sort(UseOrder.begin(), UseOrder.end(), -            [&OffValPairs](unsigned Left, unsigned Right) { -            return OffValPairs[Left].first < OffValPairs[Right].first; -            }); - -  for (unsigned i = 0; i < VL.size(); i++) -    Sorted.emplace_back(OffValPairs[UseOrder[i]].second); - -  // Sort UseOrder to compute the Mask. -  if (Mask) { -    Mask->reserve(VL.size()); -    for (unsigned i = 0; i < VL.size(); i++) -      Mask->emplace_back(i); -    std::sort(Mask->begin(), Mask->end(), -              [&UseOrder](unsigned Left, unsigned Right) { -              return UseOrder[Left] < UseOrder[Right]; -              }); -  } - -  return true; -} - -  /// Returns true if the memory operations \p A and \p B are consecutive.  
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,                                 ScalarEvolution &SE, bool CheckType) { diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index a6c590126c2f..bb7bf967994c 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -647,6 +647,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(        // Ok, this store might clobber the query pointer.  Check to see if it is        // a must alias: in this case, we want to return this as a def. +      // FIXME: Use ModRefInfo::Must bit from getModRefInfo call above.        MemoryLocation StoreLoc = MemoryLocation::get(SI);        // If we found a pointer, check if it could be the same as our pointer. @@ -690,7 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(      // If necessary, perform additional analysis.      if (isModAndRefSet(MR))        MR = AA.callCapturesBefore(Inst, MemLoc, &DT, &OBB); -    switch (MR) { +    switch (clearMust(MR)) {      case ModRefInfo::NoModRef:        // If the call has no effect on the queried pointer, just ignore it.        continue; @@ -919,6 +920,14 @@ void MemoryDependenceResults::getNonLocalPointerDependency(      Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) {    const MemoryLocation Loc = MemoryLocation::get(QueryInst);    bool isLoad = isa<LoadInst>(QueryInst); +  return getNonLocalPointerDependencyFrom(QueryInst, Loc, isLoad, Result); +} + +void MemoryDependenceResults::getNonLocalPointerDependencyFrom( +    Instruction *QueryInst, +    const MemoryLocation &Loc, +    bool isLoad, +    SmallVectorImpl<NonLocalDepResult> &Result) {    BasicBlock *FromBB = QueryInst->getParent();    assert(FromBB); @@ -1118,21 +1127,15 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(    // If we already have a cache entry for this CacheKey, we may need to do some    // work to reconcile the cache entry and the current query.    if (!Pair.second) { -    if (CacheInfo->Size < Loc.Size) { -      // The query's Size is greater than the cached one. Throw out the -      // cached data and proceed with the query at the greater size. +    if (CacheInfo->Size != Loc.Size) { +      // The query's Size differs from the cached one. Throw out the +      // cached data and proceed with the query at the new size.        CacheInfo->Pair = BBSkipFirstBlockPair();        CacheInfo->Size = Loc.Size;        for (auto &Entry : CacheInfo->NonLocalDeps)          if (Instruction *Inst = Entry.getResult().getInst())            RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);        CacheInfo->NonLocalDeps.clear(); -    } else if (CacheInfo->Size > Loc.Size) { -      // This query's Size is less than the cached one. Conservatively restart -      // the query using the greater size. 
-      return getNonLocalPointerDepFromBB( -          QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad, -          StartBB, Result, Visited, SkipFirstBlock);      }      // If the query's AATags are inconsistent with the cached one, diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 8fe190e8bcf8..6e9368c49d65 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -192,8 +192,6 @@ template <> struct DenseMapInfo<MemoryLocOrCall> {    }  }; -enum class Reorderability { Always, IfNoAlias, Never }; -  } // end namespace llvm  /// This does one-way checks to see if Use could theoretically be hoisted above @@ -202,22 +200,16 @@ enum class Reorderability { Always, IfNoAlias, Never };  /// This assumes that, for the purposes of MemorySSA, Use comes directly after  /// MayClobber, with no potentially clobbering operations in between them.  /// (Where potentially clobbering ops are memory barriers, aliased stores, etc.) -static Reorderability getLoadReorderability(const LoadInst *Use, -                                            const LoadInst *MayClobber) { +static bool areLoadsReorderable(const LoadInst *Use, +                                const LoadInst *MayClobber) {    bool VolatileUse = Use->isVolatile();    bool VolatileClobber = MayClobber->isVolatile();    // Volatile operations may never be reordered with other volatile operations.    if (VolatileUse && VolatileClobber) -    return Reorderability::Never; - -  // The lang ref allows reordering of volatile and non-volatile operations. -  // Whether an aliasing nonvolatile load and volatile load can be reordered, -  // though, is ambiguous. Because it may not be best to exploit this ambiguity, -  // we only allow volatile/non-volatile reordering if the volatile and -  // non-volatile operations don't alias. -  Reorderability Result = VolatileUse || VolatileClobber -                              ? Reorderability::IfNoAlias -                              : Reorderability::Always; +    return false; +  // Otherwise, volatile doesn't matter here. From the language reference: +  // 'optimizers may change the order of volatile operations relative to +  // non-volatile operations.'"    // If a load is seq_cst, it cannot be moved above other loads. If its ordering    // is weaker, it can be moved above other loads. 
We just need to be sure that @@ -229,9 +221,7 @@ static Reorderability getLoadReorderability(const LoadInst *Use,    bool SeqCstUse = Use->getOrdering() == AtomicOrdering::SequentiallyConsistent;    bool MayClobberIsAcquire = isAtLeastOrStrongerThan(MayClobber->getOrdering(),                                                       AtomicOrdering::Acquire); -  if (SeqCstUse || MayClobberIsAcquire) -    return Reorderability::Never; -  return Result; +  return !(SeqCstUse || MayClobberIsAcquire);  }  static bool instructionClobbersQuery(MemoryDef *MD, @@ -265,18 +255,9 @@ static bool instructionClobbersQuery(MemoryDef *MD,      return isModOrRefSet(I);    } -  if (auto *DefLoad = dyn_cast<LoadInst>(DefInst)) { -    if (auto *UseLoad = dyn_cast<LoadInst>(UseInst)) { -      switch (getLoadReorderability(UseLoad, DefLoad)) { -      case Reorderability::Always: -        return false; -      case Reorderability::Never: -        return true; -      case Reorderability::IfNoAlias: -        return !AA.isNoAlias(UseLoc, MemoryLocation::get(DefLoad)); -      } -    } -  } +  if (auto *DefLoad = dyn_cast<LoadInst>(DefInst)) +    if (auto *UseLoad = dyn_cast<LoadInst>(UseInst)) +      return !areLoadsReorderable(UseLoad, DefLoad);    return isModSet(AA.getModRefInfo(DefInst, UseLoc));  } diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index d54fb700200d..10badd89a4a8 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -454,7 +454,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(      std::unique_ptr<BlockFrequencyInfo> BFIPtr;      if (GetBFICallback)        BFI = GetBFICallback(F); -    else if (F.getEntryCount().hasValue()) { +    else if (F.hasProfileData()) {        LoopInfo LI{DominatorTree(const_cast<Function &>(F))};        BranchProbabilityInfo BPI{F, LI};        BFIPtr = llvm::make_unique<BlockFrequencyInfo>(F, BPI, LI); diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 671744f93fb8..347d093b0f61 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -115,42 +115,62 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {    return FunctionCount && isHotCount(FunctionCount.getValue());  } -/// Returns true if the function's entry or total call edge count is hot. +/// Returns true if the function contains hot code. This can include a hot +/// function entry count, hot basic block, or (in the case of Sample PGO) +/// hot total call edge count.  /// If it returns false, it either means it is not hot or it is unknown -/// whether it is hot or not (for example, no profile data is available). -bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) { +/// (for example, no profile data is available). 
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F, +                                                  BlockFrequencyInfo &BFI) {    if (!F || !computeSummary())      return false;    if (auto FunctionCount = F->getEntryCount())      if (isHotCount(FunctionCount.getValue()))        return true; -  uint64_t TotalCallCount = 0; +  if (hasSampleProfile()) { +    uint64_t TotalCallCount = 0; +    for (const auto &BB : *F) +      for (const auto &I : BB) +        if (isa<CallInst>(I) || isa<InvokeInst>(I)) +          if (auto CallCount = getProfileCount(&I, nullptr)) +            TotalCallCount += CallCount.getValue(); +    if (isHotCount(TotalCallCount)) +      return true; +  }    for (const auto &BB : *F) -    for (const auto &I : BB) -      if (isa<CallInst>(I) || isa<InvokeInst>(I)) -        if (auto CallCount = getProfileCount(&I, nullptr)) -          TotalCallCount += CallCount.getValue(); -  return isHotCount(TotalCallCount); +    if (isHotBB(&BB, &BFI)) +      return true; +  return false;  } -/// Returns true if the function's entry and total call edge count is cold. +/// Returns true if the function only contains cold code. This means that +/// the function entry and blocks are all cold, and (in the case of Sample PGO) +/// the total call edge count is cold.  /// If it returns false, it either means it is not cold or it is unknown -/// whether it is cold or not (for example, no profile data is available). -bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) { +/// (for example, no profile data is available). +bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F, +                                                   BlockFrequencyInfo &BFI) {    if (!F || !computeSummary())      return false;    if (auto FunctionCount = F->getEntryCount())      if (!isColdCount(FunctionCount.getValue()))        return false; -   -  uint64_t TotalCallCount = 0; + +  if (hasSampleProfile()) { +    uint64_t TotalCallCount = 0; +    for (const auto &BB : *F) +      for (const auto &I : BB) +        if (isa<CallInst>(I) || isa<InvokeInst>(I)) +          if (auto CallCount = getProfileCount(&I, nullptr)) +            TotalCallCount += CallCount.getValue(); +    if (!isColdCount(TotalCallCount)) +      return false; +  }    for (const auto &BB : *F) -    for (const auto &I : BB)  -      if (isa<CallInst>(I) || isa<InvokeInst>(I)) -        if (auto CallCount = getProfileCount(&I, nullptr)) -          TotalCallCount += CallCount.getValue(); -  return isColdCount(TotalCallCount); +    if (!isColdBB(&BB, &BFI)) +      return false; +  return true;  }  /// Returns true if the function's entry is a cold. If it returns false, it @@ -231,7 +251,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,    // If there is no profile for the caller, and we know the profile is    // accurate, we consider the callsite as cold.    
return (hasSampleProfile() && -          (CS.getCaller()->getEntryCount() || ProfileSampleAccurate || +          (CS.getCaller()->hasProfileData() || ProfileSampleAccurate ||             CS.getCaller()->hasFnAttribute("profile-sample-accurate")));  } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 0b8604187121..2a8088dc4452 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -4368,6 +4368,7 @@ static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {        default:          break;        } +    break;    }    default: diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index b744cae51ed7..c9e9c6d1a419 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -314,6 +314,10 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx,    return Cost;  } +bool TargetTransformInfo::isOutOfOrder() const { +  return TTIImpl->isOutOfOrder(); +} +  unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {    return TTIImpl->getNumberOfRegisters(Vector);  } diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index c9ed026a1e33..173db399b9d6 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -544,21 +544,32 @@ static bool matchAccessTags(const MDNode *A, const MDNode *B,    TBAAStructTagNode TagA(A), TagB(B);    const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(),                                                  TagB.getAccessType()); -  if (GenericTag) -    *GenericTag = createAccessTag(CommonType);    // TODO: We need to check if AccessType of TagA encloses AccessType of    // TagB to support aggregate AccessType. If yes, return true.    // Climb the type DAG from base type of A to see if we reach base type of B.    uint64_t OffsetA; -  if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) -    return OffsetA == TagB.getOffset(); +  if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) { +    bool SameMemberAccess = OffsetA == TagB.getOffset(); +    if (GenericTag) +      *GenericTag = SameMemberAccess ? TagB.getNode() : +                                       createAccessTag(CommonType); +    return SameMemberAccess; +  }    // Climb the type DAG from base type of B to see if we reach base type of A.    uint64_t OffsetB; -  if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) -    return OffsetB == TagA.getOffset(); +  if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) { +    bool SameMemberAccess = OffsetB == TagA.getOffset(); +    if (GenericTag) +      *GenericTag = SameMemberAccess ? TagA.getNode() : +                                       createAccessTag(CommonType); +    return SameMemberAccess; +  } + +  if (GenericTag) +    *GenericTag = createAccessTag(CommonType);    // If the final access types have different roots, they're part of different    // potentially unrelated type systems, so we must be conservative. 
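Returning to the ProfileSummaryInfo hunks above: isFunctionHotInCallGraph and isFunctionColdInCallGraph now take a BlockFrequencyInfo so they can also classify individual basic blocks, which is why CodeGenPrepare (further down in this diff) starts constructing BFI up front. A caller-side sketch of the same pattern (the wrapper function is hypothetical; the constructors and queries are the ones used in this diff):

```cpp
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Build the analyses the new PSI queries need, then pick a section prefix,
// roughly mirroring what CodeGenPrepare does after this change.
static void setSectionPrefixFor(Function &F, ProfileSummaryInfo &PSI) {
  DominatorTree DT(F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(F, LI);
  BlockFrequencyInfo BFI(F, BPI, LI);
  if (PSI.isFunctionHotInCallGraph(&F, BFI))
    F.setSectionPrefix(".hot");
  else if (PSI.isFunctionColdInCallGraph(&F, BFI))
    F.setSectionPrefix(".unlikely");
}
```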
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index fd13dbc1f1e2..a7201ed97350 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3371,7 +3371,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(    for (auto &RI : FS->refs())      NameVals.push_back(VE.getValueID(RI.getValue())); -  bool HasProfileData = F.getEntryCount().hasValue(); +  bool HasProfileData = F.hasProfileData();    for (auto &ECI : FS->calls()) {      NameVals.push_back(getValueId(ECI.first));      if (HasProfileData) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 31037095aa2b..d7995447592c 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2033,6 +2033,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {      }    }    // else fallthrough +  LLVM_FALLTHROUGH;    // The MC library also has a right-shift operator, but it isn't consistently    // signed or unsigned between different targets. diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 07ba5d36cc96..3aeb4910ab10 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -51,7 +51,7 @@ add_llvm_library(LLVMCodeGen    LiveRangeShrink.cpp    LiveRegMatrix.cpp    LiveRegUnits.cpp -  LiveStackAnalysis.cpp +  LiveStacks.cpp    LiveVariables.cpp    LLVMTargetMachine.cpp    LocalStackSlotAllocation.cpp diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index c4794380f791..d6f55bba716f 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -352,8 +352,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    // Clear per function information.    
InsertedInsts.clear();    PromotedInsts.clear(); -  BFI.reset(); -  BPI.reset();    ModifiedDT = false;    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { @@ -365,14 +363,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +  BPI.reset(new BranchProbabilityInfo(F, *LI)); +  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));    OptSize = F.optForSize();    ProfileSummaryInfo *PSI =        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();    if (ProfileGuidedSectionPrefix) { -    if (PSI->isFunctionHotInCallGraph(&F)) +    if (PSI->isFunctionHotInCallGraph(&F, *BFI))        F.setSectionPrefix(".hot"); -    else if (PSI->isFunctionColdInCallGraph(&F)) +    else if (PSI->isFunctionColdInCallGraph(&F, *BFI))        F.setSectionPrefix(".unlikely");    } @@ -652,13 +652,6 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,    if (SameIncomingValueBBs.count(Pred))      return true; -  if (!BFI) { -    Function &F = *BB->getParent(); -    LoopInfo LI{DominatorTree(F)}; -    BPI.reset(new BranchProbabilityInfo(F, LI)); -    BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); -  } -    BlockFrequency PredFreq = BFI->getBlockFreq(Pred);    BlockFrequency BBFreq = BFI->getBlockFreq(BB); @@ -3704,7 +3697,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,        } else {          uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());          if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { -          ConstantOffset += CI->getSExtValue()*TypeSize; +          ConstantOffset += CI->getSExtValue() * TypeSize;          } else if (TypeSize) {  // Scales of zero don't do anything.            // We only allow one variable index at the moment.            
if (VariableOperand != -1) diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 87a658be4c29..a3b43c92a7fc 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -835,6 +835,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {      case 64:        ZeroTy = Type::getDoubleTy(Ctx);        break; +    case 128: +      ZeroTy = Type::getFP128Ty(Ctx); +      break;      default:        llvm_unreachable("unexpected floating-point type");      } diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 1aaf7a0ceef8..86ce4b7a9464 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -28,7 +28,7 @@  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervals.h"  #include "llvm/CodeGen/LiveRangeEdit.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 92edfb059ad6..77a7aaa95732 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -81,10 +81,9 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T,    this->OptLevel = OL;  } -TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(BasicTTIImpl(this, F)); -  }); +TargetTransformInfo +LLVMTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(BasicTTIImpl(this, F));  }  /// addPassesToX helper drives creation and initialization of TargetPassConfig. 
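One note on the LLVMTargetMachine hunk just above, before the LiveStacks rename: per-function TTI construction is now exposed directly through getTargetTransformInfo(F) instead of being wrapped in a TargetIRAnalysis callback. A minimal consumer-side sketch (the free function and the getNumberOfRegisters query are illustrative; the hook itself is the one defined above):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

// With the new hook, driver-style code can ask the target machine for a
// per-function TTI directly and query it.
static unsigned scalarRegisterCount(LLVMTargetMachine &TM, const Function &F) {
  TargetTransformInfo TTI = TM.getTargetTransformInfo(F);
  return TTI.getNumberOfRegisters(/*Vector=*/false);
}
```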
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStacks.cpp index b0e58b0e3e5f..80ecfdb7a507 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStacks.cpp @@ -1,4 +1,4 @@ -//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===// +//===-- LiveStacks.cpp - Live Stack Slot Analysis -------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -13,7 +13,7 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/LiveIntervals.h"  #include "llvm/CodeGen/Passes.h"  #include "llvm/CodeGen/TargetRegisterInfo.h" diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 3568f96d2b9a..f91cca6e4e50 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -157,18 +157,14 @@ public:    void print(const MachineBasicBlock &MBB);    void print(const MachineInstr &MI); -  void printIRBlockReference(const BasicBlock &BB);    void printIRValueReference(const Value &V);    void printStackObjectReference(int FrameIndex); -  void printOffset(int64_t Offset);    void print(const MachineInstr &MI, unsigned OpIdx,               const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,               LLT TypeToPrint, bool PrintDef = true);    void print(const LLVMContext &Context, const TargetInstrInfo &TII,               const MachineMemOperand &Op);    void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID); - -  void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI);  };  } // end namespace llvm @@ -707,32 +703,6 @@ void MIPrinter::print(const MachineInstr &MI) {    }  } -static void printIRSlotNumber(raw_ostream &OS, int Slot) { -  if (Slot == -1) -    OS << "<badref>"; -  else -    OS << Slot; -} - -void MIPrinter::printIRBlockReference(const BasicBlock &BB) { -  OS << "%ir-block."; -  if (BB.hasName()) { -    printLLVMNameWithoutPrefix(OS, BB.getName()); -    return; -  } -  const Function *F = BB.getParent(); -  int Slot; -  if (F == MST.getCurrentFunction()) { -    Slot = MST.getLocalSlot(&BB); -  } else { -    ModuleSlotTracker CustomMST(F->getParent(), -                                /*ShouldInitializeAllMetadata=*/false); -    CustomMST.incorporateFunction(*F); -    Slot = CustomMST.getLocalSlot(&BB); -  } -  printIRSlotNumber(OS, Slot); -} -  void MIPrinter::printIRValueReference(const Value &V) {    if (isa<GlobalValue>(V)) {      V.printAsOperand(OS, /*PrintType=*/false, MST); @@ -750,7 +720,7 @@ void MIPrinter::printIRValueReference(const Value &V) {      printLLVMNameWithoutPrefix(OS, V.getName());      return;    } -  printIRSlotNumber(OS, MST.getLocalSlot(&V)); +  MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));  }  void MIPrinter::printStackObjectReference(int FrameIndex) { @@ -762,16 +732,6 @@ void MIPrinter::printStackObjectReference(int FrameIndex) {                                              Operand.Name);  } -void MIPrinter::printOffset(int64_t Offset) { -  if (Offset == 0) -    return; -  if (Offset < 0) { -    OS << " - " << -Offset; -    return; -  } -  OS << " + " << Offset; -} -  void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,                        const TargetRegisterInfo *TRI,                        bool ShouldPrintRegisterTies, LLT TypeToPrint, @@ -787,6 +747,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,      LLVM_FALLTHROUGH;    case 
MachineOperand::MO_Register:    case MachineOperand::MO_CImmediate: +  case MachineOperand::MO_FPImmediate:    case MachineOperand::MO_MachineBasicBlock:    case MachineOperand::MO_ConstantPoolIndex:    case MachineOperand::MO_TargetIndex: @@ -795,7 +756,11 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,    case MachineOperand::MO_GlobalAddress:    case MachineOperand::MO_RegisterLiveOut:    case MachineOperand::MO_Metadata: -  case MachineOperand::MO_MCSymbol: { +  case MachineOperand::MO_MCSymbol: +  case MachineOperand::MO_CFIIndex: +  case MachineOperand::MO_IntrinsicID: +  case MachineOperand::MO_Predicate: +  case MachineOperand::MO_BlockAddress: {      unsigned TiedOperandIdx = 0;      if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef())        TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); @@ -804,21 +769,9 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,               TiedOperandIdx, TRI, TII);      break;    } -  case MachineOperand::MO_FPImmediate: -    Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST); -    break;    case MachineOperand::MO_FrameIndex:      printStackObjectReference(Op.getIndex());      break; -  case MachineOperand::MO_BlockAddress: -    OS << "blockaddress("; -    Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, -                                                        MST); -    OS << ", "; -    printIRBlockReference(*Op.getBlockAddress()->getBasicBlock()); -    OS << ')'; -    printOffset(Op.getOffset()); -    break;    case MachineOperand::MO_RegisterMask: {      auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask());      if (RegMaskInfo != RegisterMaskIds.end()) @@ -827,28 +780,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,        printCustomRegMask(Op.getRegMask(), OS, TRI);      break;    } -  case MachineOperand::MO_CFIIndex: { -    const MachineFunction &MF = *Op.getParent()->getMF(); -    print(MF.getFrameInstructions()[Op.getCFIIndex()], TRI); -    break; -  } -  case MachineOperand::MO_IntrinsicID: { -    Intrinsic::ID ID = Op.getIntrinsicID(); -    if (ID < Intrinsic::num_intrinsics) -      OS << "intrinsic(@" << Intrinsic::getName(ID, None) << ')'; -    else { -      const MachineFunction &MF = *Op.getParent()->getMF(); -      const TargetIntrinsicInfo *TII = MF.getTarget().getIntrinsicInfo(); -      OS << "intrinsic(@" << TII->getName(ID) << ')'; -    } -    break; -  } -  case MachineOperand::MO_Predicate: { -    auto Pred = static_cast<CmpInst::Predicate>(Op.getPredicate()); -    OS << (CmpInst::isIntPredicate(Pred) ? 
"int" : "float") << "pred(" -       << CmpInst::getPredicateName(Pred) << ')'; -    break; -  }    }  } @@ -938,7 +869,7 @@ void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII,        break;      }    } -  printOffset(Op.getOffset()); +  MachineOperand::printOperandOffset(OS, Op.getOffset());    if (Op.getBaseAlignment() != Op.getSize())      OS << ", align " << Op.getBaseAlignment();    auto AAInfo = Op.getAAInfo(); @@ -978,118 +909,6 @@ void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) {    }  } -static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, -                             const TargetRegisterInfo *TRI) { -  int Reg = TRI->getLLVMRegNum(DwarfReg, true); -  if (Reg == -1) { -    OS << "<badreg>"; -    return; -  } -  OS << printReg(Reg, TRI); -} - -void MIPrinter::print(const MCCFIInstruction &CFI, -                      const TargetRegisterInfo *TRI) { -  switch (CFI.getOperation()) { -  case MCCFIInstruction::OpSameValue: -    OS << "same_value "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    break; -  case MCCFIInstruction::OpRememberState: -    OS << "remember_state "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    break; -  case MCCFIInstruction::OpRestoreState: -    OS << "restore_state "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    break; -  case MCCFIInstruction::OpOffset: -    OS << "offset "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    OS << ", " << CFI.getOffset(); -    break; -  case MCCFIInstruction::OpDefCfaRegister: -    OS << "def_cfa_register "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    break; -  case MCCFIInstruction::OpDefCfaOffset: -    OS << "def_cfa_offset "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    OS << CFI.getOffset(); -    break; -  case MCCFIInstruction::OpDefCfa: -    OS << "def_cfa "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    OS << ", " << CFI.getOffset(); -    break; -  case MCCFIInstruction::OpRelOffset: -    OS << "rel_offset "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    OS << ", " << CFI.getOffset(); -    break; -  case MCCFIInstruction::OpAdjustCfaOffset: -    OS << "adjust_cfa_offset "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    OS << CFI.getOffset(); -    break; -  case MCCFIInstruction::OpRestore: -    OS << "restore "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    break; -  case MCCFIInstruction::OpEscape: { -    OS << "escape "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    if (!CFI.getValues().empty()) { -      size_t e = CFI.getValues().size() - 1; -      for (size_t i = 0; i < e; ++i) -        OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", "; -      OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", 
"; -    } -    break; -  } -  case MCCFIInstruction::OpUndefined: -    OS << "undefined "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    break; -  case MCCFIInstruction::OpRegister: -    OS << "register "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    printCFIRegister(CFI.getRegister(), OS, TRI); -    OS << ", "; -    printCFIRegister(CFI.getRegister2(), OS, TRI); -    break; -  case MCCFIInstruction::OpWindowSave: -    OS << "window_save "; -    if (MCSymbol *Label = CFI.getLabel()) -      MachineOperand::printSymbol(OS, *Label); -    break; -  default: -    // TODO: Print the other CFI Operations. -    OS << "<unserializable cfi operation>"; -    break; -  } -} -  void llvm::printMIR(raw_ostream &OS, const Module &M) {    yaml::Output Out(OS);    Out << const_cast<Module &>(M); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 4ce689607730..84c808ee7938 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -1235,7 +1235,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {  // When profile is available, we need to handle the triangle-shape CFG.  static BranchProbability getLayoutSuccessorProbThreshold(        const MachineBasicBlock *BB) { -  if (!BB->getParent()->getFunction().getEntryCount()) +  if (!BB->getParent()->getFunction().hasProfileData())      return BranchProbability(StaticLikelyProb, 100);    if (BB->succ_size() == 2) {      const MachineBasicBlock *Succ1 = *BB->succ_begin(); @@ -2178,7 +2178,7 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {    // will be merged into the first outer loop chain for which this block is not    // cold anymore. This needs precise profile data and we only do this when    // profile data is available. -  if (F->getFunction().getEntryCount() || ForceLoopColdBlock) { +  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {      BlockFrequency LoopFreq(0);      for (auto LoopPred : L.getHeader()->predecessors())        if (!L.contains(LoopPred)) @@ -2220,7 +2220,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {    // for better layout.    bool RotateLoopWithProfile =        ForcePreciseRotationCost || -      (PreciseRotationCost && F->getFunction().getEntryCount()); +      (PreciseRotationCost && F->getFunction().hasProfileData());    // First check to see if there is an obviously preferable top block for the    // loop. 
This will default to the header, but may end up as one of the diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp index d17c481862a1..ec81c6391171 100644 --- a/lib/CodeGen/MachineOperand.cpp +++ b/lib/CodeGen/MachineOperand.cpp @@ -380,16 +380,6 @@ static void tryToGetTargetInfo(const MachineOperand &MO,    }  } -static void printOffset(raw_ostream &OS, int64_t Offset) { -  if (Offset == 0) -    return; -  if (Offset < 0) { -    OS << " - " << -Offset; -    return; -  } -  OS << " + " << Offset; -} -  static const char *getTargetIndexName(const MachineFunction &MF, int Index) {    const auto *TII = MF.getSubtarget().getInstrInfo();    assert(TII && "expected instruction info"); @@ -412,6 +402,44 @@ static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) {    return nullptr;  } +static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, +                             const TargetRegisterInfo *TRI) { +  if (!TRI) { +    OS << "%dwarfreg." << DwarfReg; +    return; +  } + +  int Reg = TRI->getLLVMRegNum(DwarfReg, true); +  if (Reg == -1) { +    OS << "<badreg>"; +    return; +  } +  OS << printReg(Reg, TRI); +} + +static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, +                                  ModuleSlotTracker &MST) { +  OS << "%ir-block."; +  if (BB.hasName()) { +    printLLVMNameWithoutPrefix(OS, BB.getName()); +    return; +  } +  Optional<int> Slot; +  if (const Function *F = BB.getParent()) { +    if (F == MST.getCurrentFunction()) { +      Slot = MST.getLocalSlot(&BB); +    } else if (const Module *M = F->getParent()) { +      ModuleSlotTracker CustomMST(M, /*ShouldInitializeAllMetadata=*/false); +      CustomMST.incorporateFunction(*F); +      Slot = CustomMST.getLocalSlot(&BB); +    } +  } +  if (Slot) +    MachineOperand::printIRSlotNumber(OS, *Slot); +  else +    OS << "<unknown>"; +} +  void MachineOperand::printSubregIdx(raw_ostream &OS, uint64_t Index,                                      const TargetRegisterInfo *TRI) {    OS << "%subreg."; @@ -490,6 +518,125 @@ void MachineOperand::printStackObjectReference(raw_ostream &OS,      OS << '.' 
<< Name;  } +void MachineOperand::printOperandOffset(raw_ostream &OS, int64_t Offset) { +  if (Offset == 0) +    return; +  if (Offset < 0) { +    OS << " - " << -Offset; +    return; +  } +  OS << " + " << Offset; +} + +void MachineOperand::printIRSlotNumber(raw_ostream &OS, int Slot) { +  if (Slot == -1) +    OS << "<badref>"; +  else +    OS << Slot; +} + +static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI, +                     const TargetRegisterInfo *TRI) { +  switch (CFI.getOperation()) { +  case MCCFIInstruction::OpSameValue: +    OS << "same_value "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    break; +  case MCCFIInstruction::OpRememberState: +    OS << "remember_state "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    break; +  case MCCFIInstruction::OpRestoreState: +    OS << "restore_state "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    break; +  case MCCFIInstruction::OpOffset: +    OS << "offset "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    OS << ", " << CFI.getOffset(); +    break; +  case MCCFIInstruction::OpDefCfaRegister: +    OS << "def_cfa_register "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    break; +  case MCCFIInstruction::OpDefCfaOffset: +    OS << "def_cfa_offset "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    OS << CFI.getOffset(); +    break; +  case MCCFIInstruction::OpDefCfa: +    OS << "def_cfa "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    OS << ", " << CFI.getOffset(); +    break; +  case MCCFIInstruction::OpRelOffset: +    OS << "rel_offset "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    OS << ", " << CFI.getOffset(); +    break; +  case MCCFIInstruction::OpAdjustCfaOffset: +    OS << "adjust_cfa_offset "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    OS << CFI.getOffset(); +    break; +  case MCCFIInstruction::OpRestore: +    OS << "restore "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    break; +  case MCCFIInstruction::OpEscape: { +    OS << "escape "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    if (!CFI.getValues().empty()) { +      size_t e = CFI.getValues().size() - 1; +      for (size_t i = 0; i < e; ++i) +        OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", "; +      OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", "; +    } +    break; +  } +  case MCCFIInstruction::OpUndefined: +    OS << "undefined "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    break; +  case MCCFIInstruction::OpRegister: +    OS << "register "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    printCFIRegister(CFI.getRegister(), OS, TRI); +    
OS << ", "; +    printCFIRegister(CFI.getRegister2(), OS, TRI); +    break; +  case MCCFIInstruction::OpWindowSave: +    OS << "window_save "; +    if (MCSymbol *Label = CFI.getLabel()) +      MachineOperand::printSymbol(OS, *Label); +    break; +  default: +    // TODO: Print the other CFI Operations. +    OS << "<unserializable cfi directive>"; +    break; +  } +} +  void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI,                             const TargetIntrinsicInfo *IntrinsicInfo) const {    tryToGetTargetInfo(*this, TRI, IntrinsicInfo); @@ -561,29 +708,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,      getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);      break;    case MachineOperand::MO_FPImmediate: -    if (getFPImm()->getType()->isFloatTy()) { -      OS << getFPImm()->getValueAPF().convertToFloat(); -    } else if (getFPImm()->getType()->isHalfTy()) { -      APFloat APF = getFPImm()->getValueAPF(); -      bool Unused; -      APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused); -      OS << "half " << APF.convertToFloat(); -    } else if (getFPImm()->getType()->isFP128Ty()) { -      APFloat APF = getFPImm()->getValueAPF(); -      SmallString<16> Str; -      getFPImm()->getValueAPF().toString(Str); -      OS << "quad " << Str; -    } else if (getFPImm()->getType()->isX86_FP80Ty()) { -      APFloat APF = getFPImm()->getValueAPF(); -      OS << "x86_fp80 0xK"; -      APInt API = APF.bitcastToAPInt(); -      OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, -                                 /*Upper=*/true); -      OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, -                                 /*Upper=*/true); -    } else { -      OS << getFPImm()->getValueAPF().convertToDouble(); -    } +    getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST);      break;    case MachineOperand::MO_MachineBasicBlock:      OS << printMBBReference(*getMBB()); @@ -606,7 +731,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,    }    case MachineOperand::MO_ConstantPoolIndex:      OS << "%const." 
<< getIndex(); -    printOffset(OS, getOffset()); +    printOperandOffset(OS, getOffset());      break;    case MachineOperand::MO_TargetIndex: {      OS << "target-index("; @@ -615,7 +740,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,        if (const auto *TargetIndexName = getTargetIndexName(*MF, getIndex()))          Name = TargetIndexName;      OS << Name << ')'; -    printOffset(OS, getOffset()); +    printOperandOffset(OS, getOffset());      break;    }    case MachineOperand::MO_JumpTableIndex: @@ -623,7 +748,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,      break;    case MachineOperand::MO_GlobalAddress:      getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); -    printOffset(OS, getOffset()); +    printOperandOffset(OS, getOffset());      break;    case MachineOperand::MO_ExternalSymbol: {      StringRef Name = getSymbolName(); @@ -633,16 +758,19 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,      } else {        printLLVMNameWithoutPrefix(OS, Name);      } -    printOffset(OS, getOffset()); +    printOperandOffset(OS, getOffset());      break;    } -  case MachineOperand::MO_BlockAddress: -    OS << '<'; -    getBlockAddress()->printAsOperand(OS, /*PrintType=*/false, MST); -    if (getOffset()) -      OS << "+" << getOffset(); -    OS << '>'; +  case MachineOperand::MO_BlockAddress: { +    OS << "blockaddress("; +    getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, +                                                     MST); +    OS << ", "; +    printIRBlockReference(OS, *getBlockAddress()->getBasicBlock(), MST); +    OS << ')'; +    MachineOperand::printOperandOffset(OS, getOffset());      break; +  }    case MachineOperand::MO_RegisterMask: {      OS << "<regmask";      if (TRI) { @@ -693,23 +821,27 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,    case MachineOperand::MO_MCSymbol:      printSymbol(OS, *getMCSymbol());      break; -  case MachineOperand::MO_CFIIndex: -    OS << "<call frame instruction>"; +  case MachineOperand::MO_CFIIndex: { +    if (const MachineFunction *MF = getMFIfAvailable(*this)) +      printCFI(OS, MF->getFrameInstructions()[getCFIIndex()], TRI); +    else +      OS << "<cfi directive>";      break; +  }    case MachineOperand::MO_IntrinsicID: {      Intrinsic::ID ID = getIntrinsicID();      if (ID < Intrinsic::num_intrinsics) -      OS << "<intrinsic:@" << Intrinsic::getName(ID, None) << '>'; +      OS << "intrinsic(@" << Intrinsic::getName(ID, None) << ')';      else if (IntrinsicInfo) -      OS << "<intrinsic:@" << IntrinsicInfo->getName(ID) << '>'; +      OS << "intrinsic(@" << IntrinsicInfo->getName(ID) << ')';      else -      OS << "<intrinsic:" << ID << '>'; +      OS << "intrinsic(" << ID << ')';      break;    }    case MachineOperand::MO_Predicate: {      auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); -    OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred") -       << CmpInst::getPredicateName(Pred) << '>'; +    OS << (CmpInst::isIntPredicate(Pred) ? 
"int" : "float") << "pred(" +       << CmpInst::getPredicateName(Pred) << ')';      break;    }    } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index c9fe7681e280..e0cc2ca9a2a2 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -37,7 +37,7 @@  #include "llvm/CodeGen/GlobalISel/RegisterBank.h"  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/LiveVariables.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt index 2fcbd1280da4..3318e109155b 100644 --- a/lib/CodeGen/README.txt +++ b/lib/CodeGen/README.txt @@ -164,7 +164,7 @@ synthesize the various copy insertion/inspection methods in TargetInstrInfo.  Stack coloring improvements: -1. Do proper LiveStackAnalysis on all stack objects including those which are +1. Do proper LiveStacks analysis on all stack objects including those which are     not spill slots.  2. Reorder objects to fill in gaps between objects.     e.g. 4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4 diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 6e273277804b..1125d2c62bef 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -21,7 +21,7 @@  #include "llvm/CodeGen/LiveIntervals.h"  #include "llvm/CodeGen/LiveRangeEdit.h"  #include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineInstr.h" diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 186ef577e31d..e492c481a540 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -39,7 +39,7 @@  #include "llvm/CodeGen/LiveIntervals.h"  #include "llvm/CodeGen/LiveRangeEdit.h"  #include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 351e91c932eb..69a879701fae 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -45,7 +45,7 @@  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervals.h"  #include "llvm/CodeGen/LiveRangeEdit.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineDominators.h"  #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f97732c1c49d..17f907eb07e8 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3988,10 +3988,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {    if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))      return RAND;    // fold (and (or x, C), D) -> D if (C & D) == D -  if (N1C && N0.getOpcode() == ISD::OR) -    if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1))) -      if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue())) -        return N1; +  auto MatchSubset = 
[](ConstantSDNode *LHS, ConstantSDNode *RHS) { +    return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); +  }; +  if (N0.getOpcode() == ISD::OR && +      matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) +    return N1;    // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.    if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {      SDValue N0Op0 = N0.getOperand(0); @@ -4675,16 +4677,16 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)    // iff (c1 & c2) != 0. -  if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) { -    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { -      if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) { -        if (SDValue COR = -                DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1)) -          return DAG.getNode( -              ISD::AND, SDLoc(N), VT, -              DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); -        return SDValue(); -      } +  auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { +    return LHS->getAPIntValue().intersects(RHS->getAPIntValue()); +  }; +  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && +      matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) { +    if (SDValue COR = DAG.FoldConstantArithmetic( +            ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) { +      SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1); +      AddToWorklist(IOR.getNode()); +      return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);      }    } @@ -5380,21 +5382,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {      AddToWorklist(NotX.getNode());      return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);    } -  // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2)) -  if (N1C && N0.getOpcode() == ISD::XOR) { -    if (const ConstantSDNode *N00C = getAsNonOpaqueConstant(N0.getOperand(0))) { -      SDLoc DL(N); -      return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), -                         DAG.getConstant(N1C->getAPIntValue() ^ -                                         N00C->getAPIntValue(), DL, VT)); -    } -    if (const ConstantSDNode *N01C = getAsNonOpaqueConstant(N0.getOperand(1))) { -      SDLoc DL(N); -      return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), -                         DAG.getConstant(N1C->getAPIntValue() ^ -                                         N01C->getAPIntValue(), DL, VT)); -    } -  }    // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)    unsigned OpSizeInBits = VT.getScalarSizeInBits(); @@ -10201,7 +10188,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {        case ISD::SETLT:        case ISD::SETLE:          std::swap(TrueOpnd, FalseOpnd); -        // Fall through +        LLVM_FALLTHROUGH;        case ISD::SETOGT:        case ISD::SETUGT:        case ISD::SETOGE: @@ -10555,7 +10542,7 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {      // value in one SSE register, but instruction selection cannot handle      // FCOPYSIGN on SSE registers yet.      
EVT N1VT = N1->getValueType(0); -    EVT N1Op0VT = N1->getOperand(0)->getValueType(0); +    EVT N1Op0VT = N1->getOperand(0).getValueType();      return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);    }    return false; @@ -13784,30 +13771,30 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {      }    } -  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { -    if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && -        !ST1->isVolatile() && ST1->getBasePtr() == Ptr && -        ST->getMemoryVT() == ST1->getMemoryVT()) { -      // If this is a store followed by a store with the same value to the same -      // location, then the store is dead/noop. -      if (ST1->getValue() == Value) { -        // The store is dead, remove it. -        return Chain; -      } - -      // If this is a store who's preceeding store to the same location -      // and no one other node is chained to that store we can effectively -      // drop the store. Do not remove stores to undef as they may be used as -      // data sinks. -      if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && -          !ST1->getBasePtr().isUndef()) { -        // ST1 is fully overwritten and can be elided. Combine with it's chain -        // value. +  // Deal with elidable overlapping chained stores. +  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) +    if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && +        ST1->isUnindexed() && !ST1->isVolatile() && ST1->hasOneUse() && +        !ST1->getBasePtr().isUndef() && !ST->isVolatile()) { +      BaseIndexOffset STBasePtr = BaseIndexOffset::match(ST->getBasePtr(), DAG); +      BaseIndexOffset ST1BasePtr = +          BaseIndexOffset::match(ST1->getBasePtr(), DAG); +      unsigned STBytes = ST->getMemoryVT().getStoreSize(); +      unsigned ST1Bytes = ST1->getMemoryVT().getStoreSize(); +      int64_t PtrDiff; +      // If this store's preceding store writes to a subset of the same +      // memory and no other node is chained to that preceding store, we can +      // effectively drop the preceding store. Do not remove stores to undef as +      // they may be used as data sinks. + +      if (((ST->getBasePtr() == ST1->getBasePtr()) && +           (ST->getValue() == ST1->getValue())) || +          (STBasePtr.equalBaseIndex(ST1BasePtr, DAG, PtrDiff) && +           (0 <= PtrDiff) && (PtrDiff + ST1Bytes <= STBytes))) {          CombineTo(ST1, ST1->getChain()); -        return SDValue(); +        return SDValue(N, 0);        }      } -  }    // If this is an FP_ROUND or TRUNC followed by a store, fold this into a    // truncating store.  We can do this even if this is already a truncstore. @@ -15110,7 +15097,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {      // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).      if (In->getOpcode() == ISD::BITCAST && -        !In->getOperand(0)->getValueType(0).isVector()) { +        !In->getOperand(0).getValueType().isVector()) {        SDValue Scalar = In->getOperand(0);        // If the bitcast type isn't legal, it might be a trunc of a legal type; @@ -15157,7 +15144,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {        bool FoundMinVT = false;        for (const SDValue &Op : N->ops())          if (ISD::BUILD_VECTOR == Op.getOpcode()) { -          EVT OpSVT = Op.getOperand(0)->getValueType(0); +          EVT OpSVT = Op.getOperand(0).getValueType();            MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? 
OpSVT : MinVT;            FoundMinVT = true;          } @@ -17418,43 +17405,6 @@ SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {    return buildSqrtEstimateImpl(Op, Flags, false);  } -/// Return true if base is a frame index, which is known not to alias with -/// anything but itself.  Provides base object and offset as results. -static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, -                           const GlobalValue *&GV, const void *&CV) { -  // Assume it is a primitive operation. -  Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr; - -  // If it's an adding a simple constant then integrate the offset. -  if (Base.getOpcode() == ISD::ADD) { -    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) { -      Base = Base.getOperand(0); -      Offset += C->getSExtValue(); -    } -  } - -  // Return the underlying GlobalValue, and update the Offset.  Return false -  // for GlobalAddressSDNode since the same GlobalAddress may be represented -  // by multiple nodes with different offsets. -  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) { -    GV = G->getGlobal(); -    Offset += G->getOffset(); -    return false; -  } - -  // Return the underlying Constant value, and update the Offset.  Return false -  // for ConstantSDNodes since the same constant pool entry may be represented -  // by multiple nodes with different offsets. -  if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) { -    CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal() -                                         : (const void *)C->getConstVal(); -    Offset += C->getOffset(); -    return false; -  } -  // If it's any of the following then it can't alias with anything but itself. -  return isa<FrameIndexSDNode>(Base); -} -  /// Return true if there is any possibility that the two addresses overlap.  bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {    // If they are the same then they must be aliases. @@ -17496,39 +17446,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {          return false;      } -  // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis -  // modified to use BaseIndexOffset. - -  // Gather base node and offset information. -  SDValue Base0, Base1; -  int64_t Offset0, Offset1; -  const GlobalValue *GV0, *GV1; -  const void *CV0, *CV1; -  bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(), -                                      Base0, Offset0, GV0, CV0); -  bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(), -                                      Base1, Offset1, GV1, CV1); - -  // If they have the same base address, then check to see if they overlap. -  if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1))) -    return !((Offset0 + NumBytes0) <= Offset1 || -             (Offset1 + NumBytes1) <= Offset0); - -  // It is possible for different frame indices to alias each other, mostly -  // when tail call optimization reuses return address slots for arguments. -  // To catch this case, look up the actual index of frame indices to compute -  // the real alias relationship. 
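For illustration, a minimal standalone C++ sketch of the subset check used by the new visitSTORE code above to decide that the earlier chained store (ST1) is completely overwritten by the later store (ST). The helper name and the sample sizes and offsets are hypothetical, not part of the patch.

#include <cassert>
#include <cstdint>

// PtrDiff is assumed to be the byte offset of the earlier store's address
// relative to the later store's address (the direction the check above uses);
// the earlier store is redundant when it lies entirely inside the byte range
// the later store rewrites.
static bool earlierStoreIsCovered(int64_t PtrDiff, unsigned EarlierBytes,
                                  unsigned LaterBytes) {
  return 0 <= PtrDiff && PtrDiff + EarlierBytes <= LaterBytes;
}

int main() {
  // A 4-byte store at base+4 is covered by a later 16-byte store at base+0.
  assert(earlierStoreIsCovered(/*PtrDiff=*/4, /*EarlierBytes=*/4, /*LaterBytes=*/16));
  // An 8-byte store at base+12 is not: it reaches past the 16-byte store.
  assert(!earlierStoreIsCovered(/*PtrDiff=*/12, /*EarlierBytes=*/8, /*LaterBytes=*/16));
  return 0;
}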
-  if (IsFrameIndex0 && IsFrameIndex1) { -    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); -    Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex()); -    Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex()); -    return !((Offset0 + NumBytes0) <= Offset1 || -             (Offset1 + NumBytes1) <= Offset0); -  } - -  // Otherwise, if we know what the bases are, and they aren't identical, then -  // we know they cannot alias. -  if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1)) +  bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase()); +  bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase()); +  bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase()); +  bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase()); +  bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase()); +  bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase()); + +  // If of mismatched base types or checkable indices we can check +  // they do not alias. +  if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) || +       (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) && +      (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))      return false;    // If we know required SrcValue1 and SrcValue2 have relatively large alignment diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index eaf177d0661b..e28a3aa47ca3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1887,7 +1887,7 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) {    SDLoc DL(N);    SDValue Promoted = GetPromotedFloat(Val); -  EVT VT = ST->getOperand(1)->getValueType(0); +  EVT VT = ST->getOperand(1).getValueType();    EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());    SDValue NewVal; diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index b60d7bca498a..4438ee7878b8 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -224,7 +224,7 @@ bool DAGTypeLegalizer::run() {      assert(N->getNodeId() == ReadyToProcess &&             "Node should be ready if on worklist!"); -    DEBUG(dbgs() << "Legalizing node: "; N->dump()); +    DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG));      if (IgnoreNodeResults(N)) {        DEBUG(dbgs() << "Ignoring node results\n");        goto ScanOperands; @@ -296,7 +296,7 @@ ScanOperands:          continue;        const auto Op = N->getOperand(i); -      DEBUG(dbgs() << "Analyzing operand: "; Op.dump()); +      DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG));        EVT OpVT = Op.getValueType();        switch (getTypeAction(OpVT)) {        case TargetLowering::TypeLegal: @@ -445,7 +445,7 @@ NodeDone:          if (!isTypeLegal(Node.getValueType(i)) &&              !TLI.isTypeLegal(Node.getValueType(i))) {            dbgs() << "Result type " << i << " illegal: "; -          Node.dump(); +          Node.dump(&DAG);            Failed = true;          } @@ -455,7 +455,7 @@ NodeDone:            !isTypeLegal(Node.getOperand(i).getValueType()) &&            !TLI.isTypeLegal(Node.getOperand(i).getValueType())) {          dbgs() << "Operand type " << i << " illegal: "; -        Node.getOperand(i).dump(); +        Node.getOperand(i).dump(&DAG);          Failed = true;        } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 8f2320f52a0f..ce1c01b621f0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -331,7 +331,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {      // At least try the common case where the boolean is generated by a      // comparison.      if (Cond->getOpcode() == ISD::SETCC) { -      EVT OpVT = Cond->getOperand(0)->getValueType(0); +      EVT OpVT = Cond->getOperand(0).getValueType();        ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());        VecBool = TLI.getBooleanContents(OpVT);      } else @@ -1548,14 +1548,14 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {        break;      case ISD::FP_TO_SINT:      case ISD::FP_TO_UINT: -      if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0))) +      if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))          Res = SplitVecOp_TruncateHelper(N);        else          Res = SplitVecOp_UnaryOp(N);        break;      case ISD::SINT_TO_FP:      case ISD::UINT_TO_FP: -      if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0))) +      if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))          Res = SplitVecOp_TruncateHelper(N);        else          Res = SplitVecOp_UnaryOp(N); diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 379f0dcef513..7f369c746d24 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -252,6 +252,7 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {        if (!ResourcesModel->canReserveResources(&TII->get(            SU->getNode()->getMachineOpcode())))             return false; +      break;      case TargetOpcode::EXTRACT_SUBREG:      case TargetOpcode::INSERT_SUBREG:      case TargetOpcode::SUBREG_TO_REG: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 12a21e74079e..a04c770c51c4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3750,6 +3750,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,        case ISD::FP_TO_SINT:        case ISD::FP_TO_UINT:        case ISD::TRUNCATE: +      case ISD::ANY_EXTEND: +      case ISD::ZERO_EXTEND: +      case ISD::SIGN_EXTEND:        case ISD::UINT_TO_FP:        case ISD::SINT_TO_FP:        case ISD::ABS: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 544da362be69..d5980919d03c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -37,6 +37,23 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,            return true;          } +    // Match Constants +    if (auto *A = dyn_cast<ConstantPoolSDNode>(Base)) +      if (auto *B = dyn_cast<ConstantPoolSDNode>(Other.Base)) { +        bool IsMatch = +            A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry(); +        if (IsMatch) { +          if (A->isMachineConstantPoolEntry()) +            IsMatch = A->getMachineCPVal() == B->getMachineCPVal(); +          else +            IsMatch = A->getConstVal() == B->getConstVal(); +        } +        if (IsMatch) { +          Off += B->getOffset() - A->getOffset(); +          return true; +   
     } +      } +      const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();      // Match non-equal FrameIndexes - If both frame indices are fixed diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 18f6997ef83c..d13ccc263718 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3117,7 +3117,16 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        continue;      }      case OPC_RecordMemRef: -      MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand()); +      if (auto *MN = dyn_cast<MemSDNode>(N)) +        MatchedMemRefs.push_back(MN->getMemOperand()); +      else { +        DEBUG( +          dbgs() << "Expected MemSDNode "; +          N->dump(CurDAG); +          dbgs() << '\n' +        ); +      } +        continue;      case OPC_CaptureGlueInput: @@ -3563,7 +3572,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,          Ops.push_back(InputGlue);        // Create the node. -      SDNode *Res = nullptr; +      MachineSDNode *Res = nullptr;        bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo ||                       (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2);        if (!IsMorphNodeTo) { @@ -3589,7 +3598,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,                   "Chain node replaced during MorphNode");            Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end());          }); -        Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo); +        Res = cast<MachineSDNode>(MorphNode(NodeToMatch, TargetOpc, VTList, +                                            Ops, EmitNodeInfo));        }        // If the node had chain/glue results, update our notion of the current @@ -3645,13 +3655,19 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,            }          } -        cast<MachineSDNode>(Res) -          ->setMemRefs(MemRefs, MemRefs + NumMemRefs); +        Res->setMemRefs(MemRefs, MemRefs + NumMemRefs);        } -      DEBUG(dbgs() << "  " -                   << (IsMorphNodeTo ? "Morphed" : "Created") -                   << " node: "; Res->dump(CurDAG); dbgs() << "\n"); +      DEBUG( +        if (!MatchedMemRefs.empty() && Res->memoperands_empty()) +          dbgs() << "  Dropping mem operands\n"; +        dbgs() << "  " +               << (IsMorphNodeTo ? "Morphed" : "Created") +               << " node: "; +        Res->dump(CurDAG); + +        dbgs() << '\n'; +      );        // If this was a MorphNodeTo then we're completely done!        
if (IsMorphNodeTo) { diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 58276052c10b..d76e52d78870 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3812,7 +3812,7 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,    Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,                        DAG.getConstant(EltSize, dl, IdxVT)); -  return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr); +  return DAG.getNode(ISD::ADD, dl, IdxVT, VecPtr, Index);  }  //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 62f662d1ade4..8fc7a4a32842 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -16,7 +16,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 543c12eebb45..224ae1a3236a 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -89,6 +89,21 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(      cl::desc("Minimum density for building a jump table in "               "an optsize function")); +static bool darwinHasSinCos(const Triple &TT) { +  assert(TT.isOSDarwin() && "should be called with darwin triple"); +  // Don't bother with 32 bit x86. +  if (TT.getArch() == Triple::x86) +    return false; +  // Macos < 10.9 has no sincos_stret. +  if (TT.isMacOSX()) +    return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit(); +  // iOS < 7.0 has no sincos_stret. +  if (TT.isiOS()) +    return !TT.isOSVersionLT(7, 0); +  // Any other darwin such as WatchOS/TvOS is new enough. +  return true; +} +  // Although this default value is arbitrary, it is not random. It is assumed  // that a condition that evaluates the same way by a higher percentage than this  // is best represented as control flow. Therefore, the default value N should be @@ -100,44 +115,56 @@ static cl::opt<int> MinPercentageForPredictableBranch(               "or false to assume that the condition is predictable"),      cl::Hidden); -/// InitLibcallNames - Set default libcall names. -static void InitLibcallNames(const char **Names, const Triple &TT) { +void TargetLoweringBase::InitLibcalls(const Triple &TT) {  #define HANDLE_LIBCALL(code, name) \ -  Names[RTLIB::code] = name; +  setLibcallName(RTLIB::code, name);  #include "llvm/CodeGen/RuntimeLibcalls.def"  #undef HANDLE_LIBCALL +  // Initialize calling conventions to their default. +  for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) +    setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);    // A few names are different on particular architectures or environments.    if (TT.isOSDarwin()) {      // For f16/f32 conversions, Darwin uses the standard naming scheme, instead      // of the gnueabi-style __gnu_*_ieee.      // FIXME: What about other targets? 
-    Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2"; -    Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2"; +    setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); +    setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + +    // Darwin 10 and higher has an optimized __bzero. +    if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) { +      setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero"); +    } + +    if (darwinHasSinCos(TT)) { +      setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); +      setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret"); +      if (TT.isWatchABI()) { +        setLibcallCallingConv(RTLIB::SINCOS_STRET_F32, +                              CallingConv::ARM_AAPCS_VFP); +        setLibcallCallingConv(RTLIB::SINCOS_STRET_F64, +                              CallingConv::ARM_AAPCS_VFP); +      } +    }    } else { -    Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee"; -    Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee"; +    setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); +    setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");    }    if (TT.isGNUEnvironment() || TT.isOSFuchsia()) { -    Names[RTLIB::SINCOS_F32] = "sincosf"; -    Names[RTLIB::SINCOS_F64] = "sincos"; -    Names[RTLIB::SINCOS_F80] = "sincosl"; -    Names[RTLIB::SINCOS_F128] = "sincosl"; -    Names[RTLIB::SINCOS_PPCF128] = "sincosl"; +    setLibcallName(RTLIB::SINCOS_F32, "sincosf"); +    setLibcallName(RTLIB::SINCOS_F64, "sincos"); +    setLibcallName(RTLIB::SINCOS_F80, "sincosl"); +    setLibcallName(RTLIB::SINCOS_F128, "sincosl"); +    setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");    }    if (TT.isOSOpenBSD()) { -    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = nullptr; +    setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);    }  } -/// Set default libcall CallingConvs. -static void InitLibcallCallingConvs(CallingConv::ID *CCs) { -  for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) -    CCs[LC] = CallingConv::C; -} -  /// getFPEXT - Return the FPEXT_*_* value for the given types, or  /// UNKNOWN_LIBCALL if there is none.  
RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { @@ -524,9 +551,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {    std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr); -  InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple()); +  InitLibcalls(TM.getTargetTriple());    InitCmpLibcallCCs(CmpLibcallCCs); -  InitLibcallCallingConvs(LibcallCallingConvs);  }  void TargetLoweringBase::initActions() { diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 64bb37a280a6..13f7e83f3dd0 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -22,7 +22,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 17f29737bf93..6a6b7fc6fc20 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -83,6 +83,7 @@ bool DWARFAcceleratorTable::validateForms() {             !FormValue.isFormClass(DWARFFormValue::FC_Flag)) ||            FormValue.getForm() == dwarf::DW_FORM_sdata)          return false; +      break;      default:        break;      } diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index a5defa90eb35..eb23ca8229a3 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -88,70 +88,101 @@ static void dumpUUID(raw_ostream &OS, const ObjectFile &Obj) {    }  } -static void -dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, -                                const DWARFObject &Obj, -                                const DWARFSection &StringOffsetsSection, -                                StringRef StringSection, bool LittleEndian) { +using ContributionCollection = +    std::vector<Optional<StrOffsetsContributionDescriptor>>; + +// Collect all the contributions to the string offsets table from all units, +// sort them by their starting offsets and remove duplicates. +static ContributionCollection +collectContributionData(DWARFContext::cu_iterator_range CUs, +                        DWARFContext::tu_section_iterator_range TUSs) { +  ContributionCollection Contributions; +  for (const auto &CU : CUs) +    Contributions.push_back(CU->getStringOffsetsTableContribution()); +  for (const auto &TUS : TUSs) +    for (const auto &TU : TUS) +      Contributions.push_back(TU->getStringOffsetsTableContribution()); + +  // Sort the contributions so that any invalid ones are placed at +  // the start of the contributions vector. This way they are reported +  // first. +  std::sort(Contributions.begin(), Contributions.end(), +            [](const Optional<StrOffsetsContributionDescriptor> &L, +               const Optional<StrOffsetsContributionDescriptor> &R) { +              if (L && R) return L->Base < R->Base; +              return R.hasValue(); +            }); + +  // Uniquify contributions, as it is possible that units (specifically +  // type units in dwo or dwp files) share contributions. We don't want +  // to report them more than once. 
+  Contributions.erase( +      std::unique(Contributions.begin(), Contributions.end(), +                  [](const Optional<StrOffsetsContributionDescriptor> &L, +                     const Optional<StrOffsetsContributionDescriptor> &R) { +                    if (L && R) +                      return L->Base == R->Base && L->Size == R->Size; +                    return false; +                  }), +      Contributions.end()); +  return Contributions; +} + +static void dumpDWARFv5StringOffsetsSection( +    raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj, +    const DWARFSection &StringOffsetsSection, StringRef StringSection, +    DWARFContext::cu_iterator_range CUs, +    DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian) { +  auto Contributions = collectContributionData(CUs, TUSs);    DWARFDataExtractor StrOffsetExt(Obj, StringOffsetsSection, LittleEndian, 0); -  uint32_t Offset = 0; +  DataExtractor StrData(StringSection, LittleEndian, 0);    uint64_t SectionSize = StringOffsetsSection.Data.size(); - -  while (Offset < SectionSize) { -    unsigned Version = 0; -    DwarfFormat Format = DWARF32; -    unsigned EntrySize = 4; -    // Perform validation and extract the segment size from the header. -    if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 4)) { +  uint32_t Offset = 0; +  for (auto &Contribution : Contributions) { +    // Report an ill-formed contribution. +    if (!Contribution) {        OS << "error: invalid contribution to string offsets table in section ."           << SectionName << ".\n";        return;      } -    uint32_t ContributionStart = Offset; -    uint64_t ContributionSize = StrOffsetExt.getU32(&Offset); -    // A contribution size of 0xffffffff indicates DWARF64, with the actual size -    // in the following 8 bytes. Otherwise, the DWARF standard mandates that -    // the contribution size must be at most 0xfffffff0. -    if (ContributionSize == 0xffffffff) { -      if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 8)) { -        OS << "error: invalid contribution to string offsets table in section ." -           << SectionName << ".\n"; -        return; -      } -      Format = DWARF64; -      EntrySize = 8; -      ContributionSize = StrOffsetExt.getU64(&Offset); -    } else if (ContributionSize > 0xfffffff0) { -      OS << "error: invalid contribution to string offsets table in section ." + +    dwarf::DwarfFormat Format = Contribution->getFormat(); +    uint16_t Version = Contribution->getVersion(); +    uint64_t ContributionHeader = Contribution->Base; +    // In DWARF v5 there is a contribution header that immediately precedes +    // the string offsets base (the location we have previously retrieved from +    // the CU DIE's DW_AT_str_offsets attribute). The header is located either +    // 8 or 16 bytes before the base, depending on the contribution's format. +    if (Version >= 5) +      ContributionHeader -= Format == DWARF32 ? 8 : 16; + +    // Detect overlapping contributions. +    if (Offset > ContributionHeader) { +      OS << "error: overlapping contributions to string offsets table in " +            "section ."           << SectionName << ".\n";        return;      } - -    // We must ensure that we don't read a partial record at the end, so we -    // validate for a multiple of EntrySize. Also, we're expecting a version -    // number and padding, which adds an additional 4 bytes. 
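For illustration, a minimal standalone sketch of the header-size arithmetic used above to step back from a contribution's base to its header. The constants mirror the layout the replaced code parsed (a 4-byte unit length, escaped to 0xffffffff plus an 8-byte length for DWARF64, then a 2-byte version and 2 bytes of padding); the sample offsets are hypothetical, not part of the patch.

#include <cassert>
#include <cstdint>

enum DwarfFormat { DWARF32, DWARF64 };

// Size of the DWARF v5 string offsets contribution header that precedes the
// contribution base: length field (4 bytes, or 4 + 8 in DWARF64), 2-byte
// version, 2 bytes of padding.
static uint64_t contributionHeaderSize(DwarfFormat Format) {
  return Format == DWARF32 ? 8 : 16;
}

int main() {
  // A DWARF32 contribution whose base is at offset 0x28 has its header at 0x20.
  assert(0x28 - contributionHeaderSize(DWARF32) == 0x20);
  // The same base in DWARF64 puts the header at 0x18.
  assert(0x28 - contributionHeaderSize(DWARF64) == 0x18);
  return 0;
}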
-    uint64_t ValidationSize = -        4 + ((ContributionSize + EntrySize - 1) & (-(uint64_t)EntrySize)); -    if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, ValidationSize)) { -      OS << "error: contribution to string offsets table in section ." -         << SectionName << " has invalid length.\n"; -      return; +    // Report a gap in the table. +    if (Offset < ContributionHeader) { +      OS << format("0x%8.8x: Gap, length = ", Offset); +      OS << (ContributionHeader - Offset) << "\n";      } - -    Version = StrOffsetExt.getU16(&Offset); -    Offset += 2; -    OS << format("0x%8.8x: ", ContributionStart); -    OS << "Contribution size = " << ContributionSize +    OS << format("0x%8.8x: ", (uint32_t)ContributionHeader); +    OS << "Contribution size = " << Contribution->Size +       << ", Format = " << (Format == DWARF32 ? "DWARF32" : "DWARF64")         << ", Version = " << Version << "\n"; -    uint32_t ContributionBase = Offset; -    DataExtractor StrData(StringSection, LittleEndian, 0); -    while (Offset - ContributionBase < ContributionSize) { +    Offset = Contribution->Base; +    unsigned EntrySize = Contribution->getDwarfOffsetByteSize(); +    while (Offset - Contribution->Base < Contribution->Size) {        OS << format("0x%8.8x: ", Offset); -      // FIXME: We can only extract strings in DWARF32 format at the moment. +      // FIXME: We can only extract strings if the offset fits in 32 bits.        uint64_t StringOffset =            StrOffsetExt.getRelocatedValue(EntrySize, &Offset); -      if (Format == DWARF32) { +      // Extract the string if we can and display it. Otherwise just report +      // the offset. +      if (StringOffset <= std::numeric_limits<uint32_t>::max()) {          uint32_t StringOffset32 = (uint32_t)StringOffset;          OS << format("%8.8x ", StringOffset32);          const char *S = StrData.getCStr(&StringOffset32); @@ -162,6 +193,11 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,        OS << "\n";      }    } +  // Report a gap at the end of the table. +  if (Offset < SectionSize) { +    OS << format("0x%8.8x: Gap, length = ", Offset); +    OS << (SectionSize - Offset) << "\n"; +  }  }  // Dump a DWARF string offsets section. This may be a DWARF v5 formatted @@ -170,17 +206,18 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,  // a header containing size and version number. Alternatively, it may be a  // monolithic series of string offsets, as generated by the pre-DWARF v5  // implementation of split DWARF. -static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, -                                     const DWARFObject &Obj, -                                     const DWARFSection &StringOffsetsSection, -                                     StringRef StringSection, bool LittleEndian, -                                     unsigned MaxVersion) { +static void dumpStringOffsetsSection( +    raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj, +    const DWARFSection &StringOffsetsSection, StringRef StringSection, +    DWARFContext::cu_iterator_range CUs, +    DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian, +    unsigned MaxVersion) {    // If we have at least one (compile or type) unit with DWARF v5 or greater,    // we assume that the section is formatted like a DWARF v5 string offsets    // section.    
if (MaxVersion >= 5)      dumpDWARFv5StringOffsetsSection(OS, SectionName, Obj, StringOffsetsSection, -                                    StringSection, LittleEndian); +                                    StringSection, CUs, TUSs, LittleEndian);    else {      DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0);      uint32_t offset = 0; @@ -468,12 +505,14 @@ void DWARFContext::dump(                   DObj->getStringOffsetSection().Data))      dumpStringOffsetsSection(          OS, "debug_str_offsets", *DObj, DObj->getStringOffsetSection(), -        DObj->getStringSection(), isLittleEndian(), getMaxVersion()); +        DObj->getStringSection(), compile_units(), type_unit_sections(), +        isLittleEndian(), getMaxVersion());    if (shouldDump(ExplicitDWO, ".debug_str_offsets.dwo", DIDT_ID_DebugStrOffsets,                   DObj->getStringOffsetDWOSection().Data))      dumpStringOffsetsSection(          OS, "debug_str_offsets.dwo", *DObj, DObj->getStringOffsetDWOSection(), -        DObj->getStringDWOSection(), isLittleEndian(), getMaxVersion()); +        DObj->getStringDWOSection(), dwo_compile_units(), +        dwo_type_unit_sections(), isLittleEndian(), getMaxVersion());    if (shouldDump(Explicit, ".gnu_index", DIDT_ID_GdbIndex,                   DObj->getGdbIndexSection())) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index c3d8ff2cbc29..df55d7debf92 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -8,6 +8,7 @@  //===----------------------------------------------------------------------===//  #include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallString.h"  #include "llvm/ADT/StringRef.h"  #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" @@ -79,8 +80,10 @@ bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,  bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,                                             uint64_t &Result) const { -  unsigned ItemSize = getDwarfOffsetByteSize(); -  uint32_t Offset = StringOffsetSectionBase + Index * ItemSize; +  if (!StringOffsetsTableContribution) +    return false; +  unsigned ItemSize = getDwarfStringOffsetsByteSize(); +  uint32_t Offset = getStringOffsetsBase() + Index * ItemSize;    if (StringOffsetSection.Data.size() < Offset + ItemSize)      return false;    DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, @@ -251,15 +254,28 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {        RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);      } -    // In general, we derive the offset of the unit's contibution to the -    // debug_str_offsets{.dwo} section from the unit DIE's -    // DW_AT_str_offsets_base attribute. In dwp files we add to it the offset -    // we get from the index table. -    StringOffsetSectionBase = -        toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0); +    // In general, in DWARF v5 and beyond we derive the start of the unit's +    // contribution to the string offsets table from the unit DIE's +    // DW_AT_str_offsets_base attribute. Split DWARF units do not use this +    // attribute, so we assume that there is a contribution to the string +    // offsets table starting at offset 0 of the debug_str_offsets.dwo section. +    // In both cases we need to determine the format of the contribution, +    // which may differ from the unit's format. 
+    uint64_t StringOffsetsContributionBase = +        isDWO ? 0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);      if (IndexEntry)        if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) -        StringOffsetSectionBase += C->Offset; +        StringOffsetsContributionBase += C->Offset; + +    DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, +                          isLittleEndian, 0); +    if (isDWO) +      StringOffsetsTableContribution = +          determineStringOffsetsTableContributionDWO( +              DA, StringOffsetsContributionBase); +    else if (getVersion() >= 5) +      StringOffsetsTableContribution = determineStringOffsetsTableContribution( +          DA, StringOffsetsContributionBase);      // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for      // skeleton CU DIE, so that DWARF users not aware of it are not broken. @@ -344,45 +360,378 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {      clearDIEs(true);  } -void DWARFUnit::updateAddressDieMap(DWARFDie Die) { -  if (Die.isSubroutineDIE()) { +// Populates a map from PC addresses to subprogram DIEs. +// +// This routine tries to look at the smallest amount of the debug info it can +// to locate the DIEs. This is because many subprograms will never end up being +// read or needed at all. We want to be as lazy as possible. +void DWARFUnit::buildSubprogramDIEAddrMap() { +  assert(SubprogramDIEAddrMap.empty() && "Must only build this map once!"); +  SmallVector<DWARFDie, 16> Worklist; +  Worklist.push_back(getUnitDIE()); +  do { +    DWARFDie Die = Worklist.pop_back_val(); + +    // Queue up child DIEs to recurse through. +    // FIXME: This causes us to read a lot more debug info than we really need. +    // We should look at pruning out DIEs which cannot transitively hold +    // separate subprograms. +    for (DWARFDie Child : Die.children()) +      Worklist.push_back(Child); + +    // If handling a non-subprogram DIE, nothing else to do. +    if (!Die.isSubprogramDIE()) +      continue; + +    // For subprogram DIEs, store them, and insert relevant markers into the +    // address map. We don't care about overlap at all here as DWARF doesn't +    // meaningfully support that, so we simply will insert a range with no DIE +    // starting from the high PC. In the event there are overlaps, sorting +    // these may truncate things in surprising ways but still will allow +    // lookups to proceed. +    int DIEIndex = SubprogramDIEAddrInfos.size(); +    SubprogramDIEAddrInfos.push_back({Die, (uint64_t)-1, {}});      for (const auto &R : Die.getAddressRanges()) {        // Ignore 0-sized ranges.        if (R.LowPC == R.HighPC)          continue; -      auto B = AddrDieMap.upper_bound(R.LowPC); -      if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) { -        // The range is a sub-range of existing ranges, we need to split the -        // existing range. 
-        if (R.HighPC < B->second.first) -          AddrDieMap[R.HighPC] = B->second; -        if (R.LowPC > B->first) -          AddrDieMap[B->first].first = R.LowPC; + +      SubprogramDIEAddrMap.push_back({R.LowPC, DIEIndex}); +      SubprogramDIEAddrMap.push_back({R.HighPC, -1}); + +      if (R.LowPC < SubprogramDIEAddrInfos.back().SubprogramBasePC) +        SubprogramDIEAddrInfos.back().SubprogramBasePC = R.LowPC; +    } +  } while (!Worklist.empty()); + +  if (SubprogramDIEAddrMap.empty()) { +    // If we found no ranges, create a no-op map so that lookups remain simple +    // but never find anything. +    SubprogramDIEAddrMap.push_back({0, -1}); +    return; +  } + +  // Next, sort the ranges and remove both exact duplicates and runs with the +  // same DIE index. We order the ranges so that non-empty ranges are +  // preferred. Because there may be ties, we also need to use stable sort. +  std::stable_sort(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), +                   [](const std::pair<uint64_t, int64_t> &LHS, +                      const std::pair<uint64_t, int64_t> &RHS) { +                     if (LHS.first < RHS.first) +                       return true; +                     if (LHS.first > RHS.first) +                       return false; + +                     // For ranges that start at the same address, keep the one +                     // with a DIE. +                     if (LHS.second != -1 && RHS.second == -1) +                       return true; + +                     return false; +                   }); +  SubprogramDIEAddrMap.erase( +      std::unique(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), +                  [](const std::pair<uint64_t, int64_t> &LHS, +                     const std::pair<uint64_t, int64_t> &RHS) { +                    // If the start addresses are exactly the same, we can +                    // remove all but the first one as it is the only one that +                    // will be found and used. +                    // +                    // If the DIE indices are the same, we can "merge" the +                    // ranges by eliminating the second. +                    return LHS.first == RHS.first || LHS.second == RHS.second; +                  }), +      SubprogramDIEAddrMap.end()); + +  assert(SubprogramDIEAddrMap.back().second == -1 && +         "The last interval must not have a DIE as each DIE's address range is " +         "bounded."); +} + +// Build the second level of mapping from PC to DIE, specifically one that maps +// a PC *within* a particular DWARF subprogram into a precise, maximally nested +// inlined subroutine DIE (if any exists). We build a separate map for each +// subprogram because many subprograms will never get queried for an address +// and this allows us to be significantly lazier in reading the DWARF itself. +void DWARFUnit::buildInlinedSubroutineDIEAddrMap( +    SubprogramDIEAddrInfo &SPInfo) { +  auto &AddrMap = SPInfo.InlinedSubroutineDIEAddrMap; +  uint64_t BasePC = SPInfo.SubprogramBasePC; + +  auto SubroutineAddrMapSorter = [](const std::pair<int, int> &LHS, +                                    const std::pair<int, int> &RHS) { +    if (LHS.first < RHS.first) +      return true; +    if (LHS.first > RHS.first) +      return false; + +    // For ranges that start at the same address, keep the +    // non-empty one. 
+    if (LHS.second != -1 && RHS.second == -1)
+      return true;
+
+    return false;
+  };
+  auto SubroutineAddrMapUniquer = [](const std::pair<int, int> &LHS,
+                                     const std::pair<int, int> &RHS) {
+    // If the start addresses are exactly the same, we can
+    // remove all but the first one as it is the only one that
+    // will be found and used.
+    //
+    // If the DIE indices are the same, we can "merge" the
+    // ranges by eliminating the second.
+    return LHS.first == RHS.first || LHS.second == RHS.second;
+  };
+
+  struct DieAndParentIntervalRange {
+    DWARFDie Die;
+    int ParentIntervalsBeginIdx, ParentIntervalsEndIdx;
+  };
+
+  SmallVector<DieAndParentIntervalRange, 16> Worklist;
+  auto EnqueueChildDIEs = [&](const DWARFDie &Die, int ParentIntervalsBeginIdx,
+                              int ParentIntervalsEndIdx) {
+    for (DWARFDie Child : Die.children())
+      Worklist.push_back(
+          {Child, ParentIntervalsBeginIdx, ParentIntervalsEndIdx});
+  };
+  EnqueueChildDIEs(SPInfo.SubprogramDIE, 0, 0);
+  while (!Worklist.empty()) {
+    DWARFDie Die = Worklist.back().Die;
+    int ParentIntervalsBeginIdx = Worklist.back().ParentIntervalsBeginIdx;
+    int ParentIntervalsEndIdx = Worklist.back().ParentIntervalsEndIdx;
+    Worklist.pop_back();
+
+    // If we encounter a nested subprogram, simply ignore it. We map to
+    // (disjoint) subprograms before arriving here and we don't want to examine
+    // any inlined subroutines of an unrelated subprogram.
+    if (Die.getTag() == DW_TAG_subprogram)
+      continue;
+
+    // For non-subroutines, just recurse to keep searching for inlined
+    // subroutines.
+    if (Die.getTag() != DW_TAG_inlined_subroutine) {
+      EnqueueChildDIEs(Die, ParentIntervalsBeginIdx, ParentIntervalsEndIdx);
+      continue;
+    }
+
+    // Capture the inlined subroutine DIE that we will reference from the map.
+    int DIEIndex = InlinedSubroutineDIEs.size();
+    InlinedSubroutineDIEs.push_back(Die);
+
+    int DieIntervalsBeginIdx = AddrMap.size();
+    // First collect the PC ranges for this DIE into our subroutine interval
+    // map.
+    for (auto R : Die.getAddressRanges()) {
+      // Clamp the PCs to be above the base.
+      R.LowPC = std::max(R.LowPC, BasePC);
+      R.HighPC = std::max(R.HighPC, BasePC);
+      // Compute relative PCs from the subprogram base and drop down to an
+      // unsigned 32-bit int to represent them within the data structure. This
+      // lets us cover a 4gb single subprogram. Because subprograms may be
+      // partitioned into distant parts of a binary (think hot/cold
+      // partitioning) we want to preserve as much as we can here without
+      // burning extra memory. Past that, we will simply truncate and lose the
+      // ability to map those PCs to a DIE more precise than the subprogram.
+      const uint32_t MaxRelativePC = std::numeric_limits<uint32_t>::max();
+      uint32_t RelativeLowPC = (R.LowPC - BasePC) > (uint64_t)MaxRelativePC
+                                   ? MaxRelativePC
+                                   : (uint32_t)(R.LowPC - BasePC);
+      uint32_t RelativeHighPC = (R.HighPC - BasePC) > (uint64_t)MaxRelativePC
+                                    ? MaxRelativePC
+                                    : (uint32_t)(R.HighPC - BasePC);
+      // Ignore empty or bogus ranges.
+      if (RelativeLowPC >= RelativeHighPC)
+        continue;
+      AddrMap.push_back({RelativeLowPC, DIEIndex});
+      AddrMap.push_back({RelativeHighPC, -1});
+    }
+
+    // If there are no address ranges, there is nothing to do to map into them
+    // and there cannot be any child subroutine DIEs with address ranges of
+    // interest as those would all be required to nest within this DIE's
+    // non-existent ranges, so we can immediately continue to the next DIE in
+    // the worklist.
+    if (DieIntervalsBeginIdx == (int)AddrMap.size())
+      continue;
+
+    // The PCs from this DIE should never overlap, so we can easily sort them
+    // here.
+    std::sort(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
+              SubroutineAddrMapSorter);
+    // Remove any dead ranges. These should only come from "empty" ranges that
+    // were clobbered by some other range.
+    AddrMap.erase(std::unique(AddrMap.begin() + DieIntervalsBeginIdx,
+                              AddrMap.end(), SubroutineAddrMapUniquer),
+                  AddrMap.end());
+
+    // Compute the end index of this DIE's addr map intervals.
+    int DieIntervalsEndIdx = AddrMap.size();
+
+    assert(DieIntervalsBeginIdx != DieIntervalsEndIdx &&
+           "Must not have an empty map for this layer!");
+    assert(AddrMap.back().second == -1 && "Must end with an empty range!");
+    assert(std::is_sorted(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
+                          less_first()) &&
+           "Failed to sort this DIE's intervals!");
+
+    // If we have any parent intervals, walk the newly added ranges and find
+    // the parent ranges they were inserted into. Both of these are sorted and
+    // neither has any overlaps. We need to append new ranges to split up any
+    // parent ranges these new ranges would overlap when we merge them.
+    if (ParentIntervalsBeginIdx != ParentIntervalsEndIdx) {
+      int ParentIntervalIdx = ParentIntervalsBeginIdx;
+      for (int i = DieIntervalsBeginIdx, e = DieIntervalsEndIdx - 1; i < e;
+           ++i) {
+        const uint32_t IntervalStart = AddrMap[i].first;
+        const uint32_t IntervalEnd = AddrMap[i + 1].first;
+        const int IntervalDieIdx = AddrMap[i].second;
+        if (IntervalDieIdx == -1) {
+          // For empty intervals, nothing is required. This is a bit surprising
+          // however. If the prior interval overlaps a parent interval and this
+          // would be necessary to mark the end, we will synthesize a new end
+          // that switches back to the parent DIE below. And this interval will
+          // get dropped in favor of one with a DIE attached. However, we'll
+          // still include this and so worst-case, it will still end the prior
+          // interval.
+          continue;
+        }
+
+        // We are walking the new ranges in order, so search forward from the
+        // last point for a parent range that might overlap.
+        auto ParentIntervalsRange = +            make_range(AddrMap.begin() + ParentIntervalIdx, +                       AddrMap.begin() + ParentIntervalsEndIdx); +        assert(std::is_sorted(ParentIntervalsRange.begin(), +                              ParentIntervalsRange.end(), less_first()) && +               "Unsorted parent intervals can't be searched!"); +        auto PI = std::upper_bound( +            ParentIntervalsRange.begin(), ParentIntervalsRange.end(), +            IntervalStart, +            [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) { +              return LHS < RHS.first; +            }); +        if (PI == ParentIntervalsRange.begin() || +            PI == ParentIntervalsRange.end()) +          continue; + +        ParentIntervalIdx = PI - AddrMap.begin(); +        int32_t &ParentIntervalDieIdx = std::prev(PI)->second; +        uint32_t &ParentIntervalStart = std::prev(PI)->first; +        const uint32_t ParentIntervalEnd = PI->first; + +        // If the new range starts exactly at the position of the parent range, +        // we need to adjust the parent range. Note that these collisions can +        // only happen with the original parent range because we will merge any +        // adjacent ranges in the child. +        if (IntervalStart == ParentIntervalStart) { +          // If there will be a tail, just shift the start of the parent +          // forward. Note that this cannot change the parent ordering. +          if (IntervalEnd < ParentIntervalEnd) { +            ParentIntervalStart = IntervalEnd; +            continue; +          } +          // Otherwise, mark this as becoming empty so we'll remove it and +          // prefer the child range. +          ParentIntervalDieIdx = -1; +          continue; +        } + +        // Finally, if the parent interval will need to remain as a prefix to +        // this one, insert a new interval to cover any tail. +        if (IntervalEnd < ParentIntervalEnd) +          AddrMap.push_back({IntervalEnd, ParentIntervalDieIdx});        } -      AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);      } + +    // Note that we don't need to re-sort even this DIE's address map intervals +    // after this. All of the newly added intervals actually fill in *gaps* in +    // this DIE's address map, and we know that children won't need to lookup +    // into those gaps. + +    // Recurse through its children, giving them the interval map range of this +    // DIE to use as their parent intervals. +    EnqueueChildDIEs(Die, DieIntervalsBeginIdx, DieIntervalsEndIdx); +  } + +  if (AddrMap.empty()) { +    AddrMap.push_back({0, -1}); +    return;    } -  // Parent DIEs are added to the AddrDieMap prior to the Children DIEs to -  // simplify the logic to update AddrDieMap. The child's range will always -  // be equal or smaller than the parent's range. With this assumption, when -  // adding one range into the map, it will at most split a range into 3 -  // sub-ranges. -  for (DWARFDie Child = Die.getFirstChild(); Child; Child = Child.getSibling()) -    updateAddressDieMap(Child); + +  // Now that we've added all of the intervals needed, we need to resort and +  // unique them. Most notably, this will remove all the empty ranges that had +  // a parent range covering, etc. We only expect a single non-empty interval +  // at any given start point, so we just use std::sort. This could potentially +  // produce non-deterministic maps for invalid DWARF. 
+  std::sort(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapSorter); +  AddrMap.erase( +      std::unique(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapUniquer), +      AddrMap.end());  }  DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) {    extractDIEsIfNeeded(false); -  if (AddrDieMap.empty()) -    updateAddressDieMap(getUnitDIE()); -  auto R = AddrDieMap.upper_bound(Address); -  if (R == AddrDieMap.begin()) + +  // We use a two-level mapping structure to locate subroutines for a given PC +  // address. +  // +  // First, we map the address to a subprogram. This can be done more cheaply +  // because subprograms cannot nest within each other. It also allows us to +  // avoid detailed examination of many subprograms, instead only focusing on +  // the ones which we end up actively querying. +  if (SubprogramDIEAddrMap.empty()) +    buildSubprogramDIEAddrMap(); + +  assert(!SubprogramDIEAddrMap.empty() && +         "We must always end up with a non-empty map!"); + +  auto I = std::upper_bound( +      SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), Address, +      [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) { +        return LHS < RHS.first; +      }); +  // If we find the beginning, then the address is before the first subprogram. +  if (I == SubprogramDIEAddrMap.begin())      return DWARFDie(); -  // upper_bound's previous item contains Address. -  --R; -  if (Address >= R->second.first) +  // Back up to the interval containing the address and see if it +  // has a DIE associated with it. +  --I; +  if (I->second == -1)      return DWARFDie(); -  return R->second.second; + +  auto &SPInfo = SubprogramDIEAddrInfos[I->second]; + +  // Now that we have the subprogram for this address, we do the second level +  // mapping by building a map within a subprogram's PC range to any specific +  // inlined subroutine. +  if (SPInfo.InlinedSubroutineDIEAddrMap.empty()) +    buildInlinedSubroutineDIEAddrMap(SPInfo); + +  // We lookup within the inlined subroutine using a subprogram-relative +  // address. +  assert(Address >= SPInfo.SubprogramBasePC && +         "Address isn't above the start of the subprogram!"); +  uint32_t RelativeAddr = ((Address - SPInfo.SubprogramBasePC) > +                           (uint64_t)std::numeric_limits<uint32_t>::max()) +                              ? std::numeric_limits<uint32_t>::max() +                              : (uint32_t)(Address - SPInfo.SubprogramBasePC); + +  auto J = +      std::upper_bound(SPInfo.InlinedSubroutineDIEAddrMap.begin(), +                       SPInfo.InlinedSubroutineDIEAddrMap.end(), RelativeAddr, +                       [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) { +                         return LHS < RHS.first; +                       }); +  // If we find the beginning, the address is before any inlined subroutine so +  // return the subprogram DIE. +  if (J == SPInfo.InlinedSubroutineDIEAddrMap.begin()) +    return SPInfo.SubprogramDIE; +  // Back up `J` and return the inlined subroutine if we have one or the +  // subprogram if we don't. +  --J; +  return J->second == -1 ? 
SPInfo.SubprogramDIE +                         : InlinedSubroutineDIEs[J->second];  }  void @@ -466,3 +815,89 @@ const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const {      Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);    return Abbrevs;  } + +Optional<StrOffsetsContributionDescriptor> +StrOffsetsContributionDescriptor::validateContributionSize( +    DWARFDataExtractor &DA) { +  uint8_t EntrySize = getDwarfOffsetByteSize(); +  // In order to ensure that we don't read a partial record at the end of +  // the section we validate for a multiple of the entry size. +  uint64_t ValidationSize = alignTo(Size, EntrySize); +  // Guard against overflow. +  if (ValidationSize >= Size) +    if (DA.isValidOffsetForDataOfSize((uint32_t)Base, ValidationSize)) +      return *this; +  return Optional<StrOffsetsContributionDescriptor>(); +} + +// Look for a DWARF64-formatted contribution to the string offsets table +// starting at a given offset and record it in a descriptor. +static Optional<StrOffsetsContributionDescriptor> +parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { +  if (!DA.isValidOffsetForDataOfSize(Offset, 16)) +    return Optional<StrOffsetsContributionDescriptor>(); + +  if (DA.getU32(&Offset) != 0xffffffff) +    return Optional<StrOffsetsContributionDescriptor>(); + +  uint64_t Size = DA.getU64(&Offset); +  uint8_t Version = DA.getU16(&Offset); +  (void)DA.getU16(&Offset); // padding +  return StrOffsetsContributionDescriptor(Offset, Size, Version, DWARF64); +  //return Optional<StrOffsetsContributionDescriptor>(Descriptor); +} + +// Look for a DWARF32-formatted contribution to the string offsets table +// starting at a given offset and record it in a descriptor. +static Optional<StrOffsetsContributionDescriptor> +parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { +  if (!DA.isValidOffsetForDataOfSize(Offset, 8)) +    return Optional<StrOffsetsContributionDescriptor>(); +  uint32_t ContributionSize = DA.getU32(&Offset); +  if (ContributionSize >= 0xfffffff0) +    return Optional<StrOffsetsContributionDescriptor>(); +  uint8_t Version = DA.getU16(&Offset); +  (void)DA.getU16(&Offset); // padding +  return StrOffsetsContributionDescriptor(Offset, ContributionSize, Version, DWARF32); +  //return Optional<StrOffsetsContributionDescriptor>(Descriptor); +} + +Optional<StrOffsetsContributionDescriptor> +DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA, +                                                   uint64_t Offset) { +  Optional<StrOffsetsContributionDescriptor> Descriptor; +  // Attempt to find a DWARF64 contribution 16 bytes before the base. +  if (Offset >= 16) +    Descriptor = +        parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset - 16); +  // Try to find a DWARF32 contribution 8 bytes before the base. +  if (!Descriptor && Offset >= 8) +    Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset - 8); +  return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor; +} + +Optional<StrOffsetsContributionDescriptor> +DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA, +                                                      uint64_t Offset) { +  if (getVersion() >= 5) { +    // Look for a valid contribution at the given offset. 
+    auto Descriptor = +        parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset); +    if (!Descriptor) +      Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset); +    return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor; +  } +  // Prior to DWARF v5, we derive the contribution size from the +  // index table (in a package file). In a .dwo file it is simply +  // the length of the string offsets section. +  uint64_t Size = 0; +  if (!IndexEntry) +    Size = StringOffsetSection.Data.size(); +  else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) +    Size = C->Length; +  // Return a descriptor with the given offset as base, version 4 and +  // DWARF32 format. +  //return Optional<StrOffsetsContributionDescriptor>( +      //StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32)); +  return StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32); +} diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index 34f4017d9828..9c2258f5b933 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -8,6 +8,7 @@  //===----------------------------------------------------------------------===//  #include "llvm/Demangle/Demangle.h" +#include "llvm/Support/Compiler.h"  // This file exports a single function: llvm::itanium_demangle.  // It also has no dependencies on the rest of llvm. It is implemented this way @@ -1947,7 +1948,7 @@ static const char *parse_type(const char *first, const char *last, C &db) {                break;              }            } -        // falls through +          LLVM_FALLTHROUGH;          default:            // must check for builtin-types before class-enum-types to avoid            // ambiguities with operator-names diff --git a/lib/FuzzMutate/IRMutator.cpp b/lib/FuzzMutate/IRMutator.cpp index 15e7f86d1cdf..00b558ac4dcb 100644 --- a/lib/FuzzMutate/IRMutator.cpp +++ b/lib/FuzzMutate/IRMutator.cpp @@ -8,15 +8,17 @@  //===----------------------------------------------------------------------===//  #include "llvm/FuzzMutate/IRMutator.h" +#include "llvm/ADT/Optional.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/FuzzMutate/Operations.h"  #include "llvm/FuzzMutate/Random.h"  #include "llvm/FuzzMutate/RandomIRBuilder.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h"  #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h"  #include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h"  #include "llvm/Transforms/Scalar/DCE.h"  using namespace llvm; @@ -90,14 +92,14 @@ std::vector<fuzzerop::OpDescriptor> InjectorIRStrategy::getDefaultOps() {    return Ops;  } -fuzzerop::OpDescriptor +Optional<fuzzerop::OpDescriptor>  InjectorIRStrategy::chooseOperation(Value *Src, RandomIRBuilder &IB) {    auto OpMatchesPred = [&Src](fuzzerop::OpDescriptor &Op) {      return Op.SourcePreds[0].matches({}, Src);    };    auto RS = makeSampler(IB.Rand, make_filter_range(Operations, OpMatchesPred));    if (RS.isEmpty()) -    report_fatal_error("No available operations for src type"); +    return None;    return *RS;  } @@ -120,10 +122,15 @@ void InjectorIRStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) {    // Choose an operation that's constrained to be valid for the type of the    // source, collect any other sources it needs, and then build it. 
-  fuzzerop::OpDescriptor OpDesc = chooseOperation(Srcs[0], IB); -  for (const auto &Pred : makeArrayRef(OpDesc.SourcePreds).slice(1)) +  auto OpDesc = chooseOperation(Srcs[0], IB); +  // Bail if no operation was found +  if (!OpDesc) +    return; + +  for (const auto &Pred : makeArrayRef(OpDesc->SourcePreds).slice(1))      Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); -  if (Value *Op = OpDesc.BuilderFunc(Srcs, Insts[IP])) { + +  if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP])) {      // Find a sink and wire up the results of the operation.      IB.connectToSink(BB, InstsAfter, Op);    } diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 90b10309b58b..59818a1425f1 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -1674,6 +1674,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,            }          }        } +      break;      }      default:        break; diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 1fff912ecf2f..7063f6f40a30 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -1333,7 +1333,9 @@ Optional<uint64_t> Function::getEntryCount() const {        if (MDS->getString().equals("function_entry_count")) {          ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));          uint64_t Count = CI->getValue().getZExtValue(); -        if (Count == 0) +        // A value of -1 is used for SamplePGO when there were no samples. +        // Treat this the same as unknown. +        if (Count == (uint64_t)-1)            return None;          return Count;        } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index eae697b2e4b9..163c785f5d76 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -627,9 +627,10 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,    CanBeNull = false;    if (const Argument *A = dyn_cast<Argument>(this)) {      DerefBytes = A->getDereferenceableBytes(); -    if (DerefBytes == 0 && A->hasByValAttr()) { +    if (DerefBytes == 0 && (A->hasByValAttr() || A->hasStructRetAttr())) {        Type *PT = cast<PointerType>(A->getType())->getElementType(); -      DerefBytes = DL.getTypeStoreSize(PT); +      if (PT->isSized()) +        DerefBytes = DL.getTypeStoreSize(PT);      }      if (DerefBytes == 0) {        DerefBytes = A->getDereferenceableOrNullBytes(); @@ -655,10 +656,8 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,        CanBeNull = true;      }    } else if (auto *AI = dyn_cast<AllocaInst>(this)) { -    const ConstantInt *ArraySize = dyn_cast<ConstantInt>(AI->getArraySize()); -    if (ArraySize && AI->getAllocatedType()->isSized()) { -      DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()) * -        ArraySize->getZExtValue(); +    if (!AI->isArrayAllocation()) { +      DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType());        CanBeNull = false;      }    } else if (auto *GV = dyn_cast<GlobalVariable>(this)) { diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 3357553cf19f..e521b6e7c704 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -405,9 +405,13 @@ void MCAsmStreamer::emitExplicitComments() {  void MCAsmStreamer::ChangeSection(MCSection *Section,                                    const MCExpr *Subsection) {    assert(Section && "Cannot switch to a null section!"); -  Section->PrintSwitchToSection( -      *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS, -      Subsection); +  if (MCTargetStreamer *TS = 
getTargetStreamer()) { +    TS->changeSection(getCurrentSectionOnly(), Section, Subsection, OS); +  } else { +    Section->PrintSwitchToSection( +        *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS, +        Subsection); +  }  }  void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) { @@ -796,10 +800,15 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {           "Cannot emit contents before setting section!");    if (Data.empty()) return; -  if (Data.size() == 1) { -    OS << MAI->getData8bitsDirective(); -    OS << (unsigned)(unsigned char)Data[0]; -    EmitEOL(); +  // If only single byte is provided or no ascii or asciz directives is +  // supported, emit as vector of 8bits data. +  if (Data.size() == 1 || +      !(MAI->getAscizDirective() || MAI->getAsciiDirective())) { +    const char *Directive = MAI->getData8bitsDirective(); +    for (const unsigned char C : Data.bytes()) { +      OS << Directive << (unsigned)C; +      EmitEOL(); +    }      return;    } @@ -884,8 +893,12 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,    assert(Directive && "Invalid size for machine code value!");    OS << Directive; -  Value->print(OS, MAI); -  EmitEOL(); +  if (MCTargetStreamer *TS = getTargetStreamer()) { +    TS->emitValue(Value); +  } else { +    Value->print(OS, MAI); +    EmitEOL(); +  }  }  void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { @@ -1097,13 +1110,19 @@ unsigned MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,      }    } -  OS << "\t.file\t" << FileNo << ' '; +  SmallString<128> Str; +  raw_svector_ostream OS1(Str); +  OS1 << "\t.file\t" << FileNo << ' ';    if (!Directory.empty()) { -    PrintQuotedString(Directory, OS); -    OS << ' '; +    PrintQuotedString(Directory, OS1); +    OS1 << ' '; +  } +  PrintQuotedString(Filename, OS1); +  if (MCTargetStreamer *TS = getTargetStreamer()) { +    TS->emitDwarfFileDirective(OS1.str()); +  } else { +    EmitRawText(OS1.str());    } -  PrintQuotedString(Filename, OS); -  EmitEOL();    return FileNo;  } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 6f3647d61932..6e801ed8777c 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -49,6 +49,28 @@ void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}  void MCTargetStreamer::finish() {} +void MCTargetStreamer::changeSection(const MCSection *CurSection, +                                     MCSection *Section, +                                     const MCExpr *Subsection, +                                     raw_ostream &OS) { +  Section->PrintSwitchToSection( +      *Streamer.getContext().getAsmInfo(), +      Streamer.getContext().getObjectFileInfo()->getTargetTriple(), OS, +      Subsection); +} + +void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) { +  Streamer.EmitRawText(Directive); +} + +void MCTargetStreamer::emitValue(const MCExpr *Value) { +  SmallString<128> Str; +  raw_svector_ostream OS(Str); + +  Value->print(OS, Streamer.getContext().getAsmInfo()); +  Streamer.EmitRawText(OS.str()); +} +  void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}  MCStreamer::MCStreamer(MCContext &Ctx) diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 6e76c5fac35f..0f0b645492ee 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -553,7 +553,7 @@ uint32_t WasmObjectWriter::getRelocationIndexValue(    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:      if 
(!IndirectSymbolIndices.count(RelEntry.Symbol)) -      report_fatal_error("symbol not found table index space: " + +      report_fatal_error("symbol not found in table index space: " +                           RelEntry.Symbol->getName());      return IndirectSymbolIndices[RelEntry.Symbol];    case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: @@ -562,7 +562,7 @@ uint32_t WasmObjectWriter::getRelocationIndexValue(    case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:    case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:      if (!SymbolIndices.count(RelEntry.Symbol)) -      report_fatal_error("symbol not found function/global index space: " + +      report_fatal_error("symbol not found in function/global index space: " +                           RelEntry.Symbol->getName());      return SymbolIndices[RelEntry.Symbol];    case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: @@ -994,33 +994,10 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,    SmallVector<WasmExport, 4> Exports;    SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags;    SmallVector<std::pair<uint16_t, uint32_t>, 2> InitFuncs; -  SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken;    unsigned NumFuncImports = 0;    SmallVector<WasmDataSegment, 4> DataSegments;    uint32_t DataSize = 0; -  // Populate the IsAddressTaken set. -  for (const WasmRelocationEntry &RelEntry : CodeRelocations) { -    switch (RelEntry.Type) { -    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: -    case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: -      IsAddressTaken.insert(RelEntry.Symbol); -      break; -    default: -      break; -    } -  } -  for (const WasmRelocationEntry &RelEntry : DataRelocations) { -    switch (RelEntry.Type) { -    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: -    case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: -      IsAddressTaken.insert(RelEntry.Symbol); -      break; -    default: -      break; -    } -  } -    // In the special .global_variables section, we've encoded global    // variables used by the function. Translate them into the Globals    // list. @@ -1116,7 +1093,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,        continue;      // If the symbol is not defined in this translation unit, import it. -    if (!WS.isDefined(/*SetUsed=*/false)) { +    if (!WS.isDefined(/*SetUsed=*/false) || WS.isVariable()) {        WasmImport Import;        Import.ModuleName = WS.getModuleName();        Import.FieldName = WS.getName(); @@ -1132,8 +1109,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,          Import.IsMutable = false;          SymbolIndices[&WS] = NumGlobalImports; -        // If this global is the stack pointer, make it mutable and remember it -        // so that we can emit metadata for it. +        // If this global is the stack pointer, make it mutable.          if (WS.getName() == "__stack_pointer")            Import.IsMutable = true; @@ -1218,14 +1194,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,        }        DEBUG(dbgs() << "  -> function index: " << Index << "\n"); - -      // If needed, prepare the function to be called indirectly. 
-      if (IsAddressTaken.count(&WS) != 0) {
-        IndirectSymbolIndices[&WS] = TableElems.size();
-        DEBUG(dbgs() << "  -> adding to table: " << TableElems.size() << "\n");
-        TableElems.push_back(Index);
-      }
-    } else {
+   } else {
       if (WS.isTemporary() && !WS.getSize())
         continue;
@@ -1289,7 +1258,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
     uint32_t Index = SymbolIndices.find(ResolvedSym)->second;
     DEBUG(dbgs() << "  -> index:" << Index << "\n");
-    SymbolIndices[&WS] = Index;
     WasmExport Export;
     Export.FieldName = WS.getName();
     Export.Index = Index;
@@ -1304,12 +1272,34 @@
       SymbolFlags.emplace_back(WS.getName(), wasm::WASM_SYMBOL_BINDING_LOCAL);
   }
-  // Add types for indirect function calls.
-  for (const WasmRelocationEntry &Fixup : CodeRelocations) {
-    if (Fixup.Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB)
-      continue;
+  {
+    auto HandleReloc = [&](const WasmRelocationEntry &Rel) {
+      // Functions referenced by a relocation need to be prepared to be called
+      // indirectly.
+      const MCSymbolWasm& WS = *Rel.Symbol;
+      if (WS.isFunction() && IndirectSymbolIndices.count(&WS) == 0) {
+        switch (Rel.Type) {
+        case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+        case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+        case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+        case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: {
+          uint32_t Index = SymbolIndices.find(&WS)->second;
+          IndirectSymbolIndices[&WS] = TableElems.size();
+          DEBUG(dbgs() << "  -> adding to table: " << TableElems.size() << "\n");
+          TableElems.push_back(Index);
+          registerFunctionType(WS);
+          break;
+        }
+        default:
+          break;
+        }
+      }
+    };
-    registerFunctionType(*Fixup.Symbol);
+    for (const WasmRelocationEntry &RelEntry : CodeRelocations)
+      HandleReloc(RelEntry);
+    for (const WasmRelocationEntry &RelEntry : DataRelocations)
+      HandleReloc(RelEntry);
   }
   // Translate .init_array section contents into start functions.
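For readers following the WasmObjectWriter change above: the patch drops the IsAddressTaken pre-pass and instead walks both code and data relocations once, giving each address-taken function exactly one slot in the indirection table. The sketch below is a simplified, standalone model of that deduplicating slot assignment; it uses only standard containers and hypothetical names (Reloc, RelocKind, assignTableSlots) rather than LLVM's actual types, and is meant purely as an illustration of the idea.

// Illustrative only: a minimal model of assigning table slots to
// address-taken functions, deduplicated by symbol name. All names here are
// hypothetical and not part of the patch.
#include <cstdint>
#include <map>
#include <string>
#include <vector>

enum class RelocKind { TableIndex, MemoryAddr, Other };

struct Reloc {
  std::string Symbol; // referenced symbol
  RelocKind Kind;     // kind of relocation referencing it
  bool IsFunction;    // whether the symbol is a function
};

// Walks all relocations and assigns each address-taken function a single
// slot, appending its name to TableElems in slot order. Later relocations
// against the same symbol reuse the existing slot.
static std::map<std::string, uint32_t>
assignTableSlots(const std::vector<Reloc> &Relocs,
                 std::vector<std::string> &TableElems) {
  std::map<std::string, uint32_t> TableSlots;
  for (const Reloc &R : Relocs) {
    if (!R.IsFunction)
      continue;
    if (R.Kind != RelocKind::TableIndex && R.Kind != RelocKind::MemoryAddr)
      continue;
    // First reference wins; subsequent relocations see the existing entry.
    if (TableSlots.count(R.Symbol))
      continue;
    TableSlots[R.Symbol] = static_cast<uint32_t>(TableElems.size());
    TableElems.push_back(R.Symbol);
  }
  return TableSlots;
}

Under this scheme a function referenced by both a code relocation and a data relocation still receives a single table slot, which is why the patch processes DataRelocations in addition to CodeRelocations.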
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index c72a1258c1ee..5906dc5f5307 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -138,6 +138,7 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,      default:        break;      } +    break;    case ELF::EM_BPF:      switch (Type) {  #include "llvm/BinaryFormat/ELFRelocs/BPF.def" diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 7a0c05ed8a15..48f98df6f34d 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -303,7 +303,6 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {  void WasmObjectFile::populateSymbolTable() {    // Add imports to symbol table -  size_t ImportIndex = 0;    size_t GlobalIndex = 0;    size_t FunctionIndex = 0;    for (const wasm::WasmImport& Import : Imports) { @@ -312,7 +311,7 @@ void WasmObjectFile::populateSymbolTable() {        assert(Import.Global.Type == wasm::WASM_TYPE_I32);        SymbolMap.try_emplace(Import.Field, Symbols.size());        Symbols.emplace_back(Import.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT, -                           ImportSection, GlobalIndex++, ImportIndex); +                           ImportSection, GlobalIndex++);        DEBUG(dbgs() << "Adding import: " << Symbols.back()                     << " sym index:" << Symbols.size() << "\n");        break; @@ -320,14 +319,13 @@ void WasmObjectFile::populateSymbolTable() {        SymbolMap.try_emplace(Import.Field, Symbols.size());        Symbols.emplace_back(Import.Field,                             WasmSymbol::SymbolType::FUNCTION_IMPORT, -                           ImportSection, FunctionIndex++, ImportIndex); +                           ImportSection, FunctionIndex++, Import.SigIndex);        DEBUG(dbgs() << "Adding import: " << Symbols.back()                     << " sym index:" << Symbols.size() << "\n");        break;      default:        break;      } -    ImportIndex++;    }    // Add exports to symbol table @@ -338,11 +336,22 @@ void WasmObjectFile::populateSymbolTable() {            Export.Kind == wasm::WASM_EXTERNAL_FUNCTION                ? 
WasmSymbol::SymbolType::FUNCTION_EXPORT                : WasmSymbol::SymbolType::GLOBAL_EXPORT; -      SymbolMap.try_emplace(Export.Name, Symbols.size()); -      Symbols.emplace_back(Export.Name, ExportType, -                           ExportSection, Export.Index); -      DEBUG(dbgs() << "Adding export: " << Symbols.back() -                   << " sym index:" << Symbols.size() << "\n"); +      auto Pair = SymbolMap.try_emplace(Export.Name, Symbols.size()); +      if (Pair.second) { +        Symbols.emplace_back(Export.Name, ExportType, +                             ExportSection, Export.Index); +        DEBUG(dbgs() << "Adding export: " << Symbols.back() +                     << " sym index:" << Symbols.size() << "\n"); +      } else { +        uint32_t SymIndex = Pair.first->second; +        const WasmSymbol &OldSym = Symbols[SymIndex]; +        WasmSymbol NewSym(Export.Name, ExportType, ExportSection, Export.Index); +        NewSym.setAltIndex(OldSym.ElementIndex); +        Symbols[SymIndex] = NewSym; + +        DEBUG(dbgs() << "Replacing existing symbol:  " << NewSym +                     << " sym index:" << SymIndex << "\n"); +      }      }    }  } @@ -1017,7 +1026,7 @@ void WasmObjectFile::getRelocationTypeName(      break;    switch (Rel.Type) { -#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def" +#include "llvm/BinaryFormat/WasmRelocs.def"    }  #undef WASM_RELOC diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index 9ca584a4a1ae..271224ec6312 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -14,6 +14,7 @@  #include "llvm/Object/WindowsResource.h"  #include "llvm/Object/COFF.h"  #include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/FormatVariadic.h"  #include "llvm/Support/MathExtras.h"  #include <ctime>  #include <queue> @@ -560,10 +561,9 @@ void WindowsResourceCOFFWriter::writeSymbolTable() {    // Now write a symbol for each relocation.    
for (unsigned i = 0; i < Data.size(); i++) { -    char RelocationName[9]; -    sprintf(RelocationName, "$R%06X", DataOffsets[i]); +    auto RelocationName = formatv("$R{0:X-6}", i & 0xffffff).sstr<COFF::NameSize>();      Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); -    strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize); +    memcpy(Symbol->Name.ShortName, RelocationName.data(), (size_t) COFF::NameSize);      Symbol->Value = DataOffsets[i];      Symbol->SectionNumber = 2;      Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 8687f22949a2..b2411395dc0f 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -439,7 +439,7 @@ void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(  void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration(      IO &IO, WasmYAML::RelocType &Type) {  #define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name); -#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def" +#include "llvm/BinaryFormat/WasmRelocs.def"  #undef WASM_RELOC  } diff --git a/lib/Passes/LLVMBuild.txt b/lib/Passes/LLVMBuild.txt index 4d8c7f85d3aa..e2378a84328e 100644 --- a/lib/Passes/LLVMBuild.txt +++ b/lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@  type = Library  name = Passes  parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index f7fb0cef16bf..3489feb93a02 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -2546,12 +2546,12 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {  }  bool IEEEFloat::convertFromStringSpecials(StringRef str) { -  if (str.equals("inf") || str.equals("INFINITY")) { +  if (str.equals("inf") || str.equals("INFINITY") || str.equals("+Inf")) {      makeInf(false);      return true;    } -  if (str.equals("-inf") || str.equals("-INFINITY")) { +  if (str.equals("-inf") || str.equals("-INFINITY") || str.equals("-Inf")) {      makeInf(true);      return true;    } diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp index 3e97c991f504..141573c2a1c7 100644 --- a/lib/Support/CachePruning.cpp +++ b/lib/Support/CachePruning.cpp @@ -165,12 +165,14 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {        return false;      }    } else { +    if (!Policy.Interval) +      return false;      if (Policy.Interval != seconds(0)) {        // Check whether the time stamp is older than our pruning interval.        // If not, do nothing.        
const auto TimeStampModTime = FileStatus.getLastModificationTime();        auto TimeStampAge = CurrentTime - TimeStampModTime; -      if (TimeStampAge <= Policy.Interval) { +      if (TimeStampAge <= *Policy.Interval) {          DEBUG(dbgs() << "Timestamp file too recent ("                       << duration_cast<seconds>(TimeStampAge).count()                       << "s old), do not prune.\n"); diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 85e782b2c048..c709fc416df6 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -80,10 +80,12 @@ void *operator new(size_t N, const NamedBufferAlloc &Alloc) {  namespace {  /// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory. -class MemoryBufferMem : public MemoryBuffer { +template<typename MB> +class MemoryBufferMem : public MB {  public:    MemoryBufferMem(StringRef InputData, bool RequiresNullTerminator) { -    init(InputData.begin(), InputData.end(), RequiresNullTerminator); +    MemoryBuffer::init(InputData.begin(), InputData.end(), +                       RequiresNullTerminator);    }    /// Disable sized deallocation for MemoryBufferMem, because it has @@ -95,21 +97,22 @@ public:      return StringRef(reinterpret_cast<const char *>(this + 1));    } -  BufferKind getBufferKind() const override { -    return MemoryBuffer_Malloc; +  MemoryBuffer::BufferKind getBufferKind() const override { +    return MemoryBuffer::MemoryBuffer_Malloc;    }  };  } -static ErrorOr<std::unique_ptr<MemoryBuffer>> -getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,  +template <typename MB> +static ErrorOr<std::unique_ptr<MB>> +getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,             uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile);  std::unique_ptr<MemoryBuffer>  MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,                             bool RequiresNullTerminator) {    auto *Ret = new (NamedBufferAlloc(BufferName)) -      MemoryBufferMem(InputData, RequiresNullTerminator); +      MemoryBufferMem<MemoryBuffer>(InputData, RequiresNullTerminator);    return std::unique_ptr<MemoryBuffer>(Ret);  } @@ -119,50 +122,30 @@ MemoryBuffer::getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator) {        Ref.getBuffer(), Ref.getBufferIdentifier(), RequiresNullTerminator));  } -std::unique_ptr<MemoryBuffer> -MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) { -  std::unique_ptr<MemoryBuffer> Buf = -      getNewUninitMemBuffer(InputData.size(), BufferName); +static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> +getMemBufferCopyImpl(StringRef InputData, const Twine &BufferName) { +  auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(InputData.size(), BufferName);    if (!Buf) -    return nullptr; -  memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(), -         InputData.size()); -  return Buf; +    return make_error_code(errc::not_enough_memory); +  memcpy(Buf->getBufferStart(), InputData.data(), InputData.size()); +  return std::move(Buf);  }  std::unique_ptr<MemoryBuffer> -MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) { -  // Allocate space for the MemoryBuffer, the data and the name. It is important -  // that MemoryBuffer and data are aligned so PointerIntPair works with them. -  // TODO: Is 16-byte alignment enough?  We copy small object files with large -  // alignment expectations into this buffer. 
-  SmallString<256> NameBuf; -  StringRef NameRef = BufferName.toStringRef(NameBuf); -  size_t AlignedStringLen = -      alignTo(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16); -  size_t RealLen = AlignedStringLen + Size + 1; -  char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow)); -  if (!Mem) -    return nullptr; - -  // The name is stored after the class itself. -  CopyStringRef(Mem + sizeof(MemoryBufferMem), NameRef); - -  // The buffer begins after the name and must be aligned. -  char *Buf = Mem + AlignedStringLen; -  Buf[Size] = 0; // Null terminate buffer. - -  auto *Ret = new (Mem) MemoryBufferMem(StringRef(Buf, Size), true); -  return std::unique_ptr<MemoryBuffer>(Ret); +MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) { +  auto Buf = getMemBufferCopyImpl(InputData, BufferName); +  if (Buf) +    return std::move(*Buf); +  return nullptr;  }  std::unique_ptr<MemoryBuffer>  MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { -  std::unique_ptr<MemoryBuffer> SB = getNewUninitMemBuffer(Size, BufferName); +  auto SB = WritableMemoryBuffer::getNewUninitMemBuffer(Size, BufferName);    if (!SB)      return nullptr; -  memset(const_cast<char*>(SB->getBufferStart()), 0, Size); -  return SB; +  memset(SB->getBufferStart(), 0, Size); +  return std::move(SB);  }  ErrorOr<std::unique_ptr<MemoryBuffer>> @@ -179,10 +162,10 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,  ErrorOr<std::unique_ptr<MemoryBuffer>>  MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,                              uint64_t Offset, bool IsVolatile) { -  return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile); +  return getFileAux<MemoryBuffer>(FilePath, -1, MapSize, Offset, false, +                                  IsVolatile);  } -  //===----------------------------------------------------------------------===//  // MemoryBuffer::getFile implementation.  //===----------------------------------------------------------------------===// @@ -191,7 +174,8 @@ namespace {  /// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.  ///  /// This handles converting the offset into a legal offset on the platform. -class MemoryBufferMMapFile : public MemoryBuffer { +template<typename MB> +class MemoryBufferMMapFile : public MB {    sys::fs::mapped_file_region MFR;    static uint64_t getLegalMapOffset(uint64_t Offset) { @@ -209,11 +193,13 @@ class MemoryBufferMMapFile : public MemoryBuffer {  public:    MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,                         uint64_t Offset, std::error_code &EC) -      : MFR(FD, sys::fs::mapped_file_region::readonly, +      : MFR(FD, +            MB::Writable ? 
sys::fs::mapped_file_region::priv +                         : sys::fs::mapped_file_region::readonly,              getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {      if (!EC) {        const char *Start = getStart(Len, Offset); -      init(Start, Start + Len, RequiresNullTerminator); +      MemoryBuffer::init(Start, Start + Len, RequiresNullTerminator);      }    } @@ -226,13 +212,13 @@ public:      return StringRef(reinterpret_cast<const char *>(this + 1));    } -  BufferKind getBufferKind() const override { -    return MemoryBuffer_MMap; +  MemoryBuffer::BufferKind getBufferKind() const override { +    return MemoryBuffer::MemoryBuffer_MMap;    }  };  } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>  getMemoryBufferForStream(int FD, const Twine &BufferName) {    const ssize_t ChunkSize = 4096*4;    SmallString<ChunkSize> Buffer; @@ -246,37 +232,80 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {      Buffer.set_size(Buffer.size() + ReadBytes);    } while (ReadBytes != 0); -  return MemoryBuffer::getMemBufferCopy(Buffer, BufferName); +  return getMemBufferCopyImpl(Buffer, BufferName);  }  ErrorOr<std::unique_ptr<MemoryBuffer>>  MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,                        bool RequiresNullTerminator, bool IsVolatile) { -  return getFileAux(Filename, FileSize, FileSize, 0, -                    RequiresNullTerminator, IsVolatile); +  return getFileAux<MemoryBuffer>(Filename, FileSize, FileSize, 0, +                                  RequiresNullTerminator, IsVolatile);  } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>>  getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,                  uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,                  bool IsVolatile); -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>>  getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,             uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {    int FD;    std::error_code EC = sys::fs::openFileForRead(Filename, FD); +    if (EC)      return EC; -  ErrorOr<std::unique_ptr<MemoryBuffer>> Ret = -      getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset, -                      RequiresNullTerminator, IsVolatile); +  auto Ret = getOpenFileImpl<MB>(FD, Filename, FileSize, MapSize, Offset, +                                 RequiresNullTerminator, IsVolatile);    close(FD);    return Ret;  } +ErrorOr<std::unique_ptr<WritableMemoryBuffer>> +WritableMemoryBuffer::getFile(const Twine &Filename, int64_t FileSize, +                              bool IsVolatile) { +  return getFileAux<WritableMemoryBuffer>(Filename, FileSize, FileSize, 0, +                                          /*RequiresNullTerminator*/ false, +                                          IsVolatile); +} + +ErrorOr<std::unique_ptr<WritableMemoryBuffer>> +WritableMemoryBuffer::getFileSlice(const Twine &Filename, uint64_t MapSize, +                                   uint64_t Offset, bool IsVolatile) { +  return getFileAux<WritableMemoryBuffer>(Filename, -1, MapSize, Offset, false, +                                          IsVolatile); +} + +std::unique_ptr<WritableMemoryBuffer> +WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) { +  using MemBuffer = MemoryBufferMem<WritableMemoryBuffer>; +  // Allocate space for 
the MemoryBuffer, the data and the name. It is important +  // that MemoryBuffer and data are aligned so PointerIntPair works with them. +  // TODO: Is 16-byte alignment enough?  We copy small object files with large +  // alignment expectations into this buffer. +  SmallString<256> NameBuf; +  StringRef NameRef = BufferName.toStringRef(NameBuf); +  size_t AlignedStringLen = alignTo(sizeof(MemBuffer) + NameRef.size() + 1, 16); +  size_t RealLen = AlignedStringLen + Size + 1; +  char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow)); +  if (!Mem) +    return nullptr; + +  // The name is stored after the class itself. +  CopyStringRef(Mem + sizeof(MemBuffer), NameRef); + +  // The buffer begins after the name and must be aligned. +  char *Buf = Mem + AlignedStringLen; +  Buf[Size] = 0; // Null terminate buffer. + +  auto *Ret = new (Mem) MemBuffer(StringRef(Buf, Size), true); +  return std::unique_ptr<WritableMemoryBuffer>(Ret); +} +  static bool shouldUseMmap(int FD,                            size_t FileSize,                            size_t MapSize, @@ -332,7 +361,8 @@ static bool shouldUseMmap(int FD,    return true;  } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>>  getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,                  uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,                  bool IsVolatile) { @@ -364,22 +394,21 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,    if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,                      PageSize, IsVolatile)) {      std::error_code EC; -    std::unique_ptr<MemoryBuffer> Result( -        new (NamedBufferAlloc(Filename)) -        MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC)); +    std::unique_ptr<MB> Result( +        new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>( +            RequiresNullTerminator, FD, MapSize, Offset, EC));      if (!EC)        return std::move(Result);    } -  std::unique_ptr<MemoryBuffer> Buf = -      MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename); +  auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);    if (!Buf) {      // Failed to create a buffer. The only way it can fail is if      // new(std::nothrow) returns 0.      
return make_error_code(errc::not_enough_memory);    } -  char *BufPtr = const_cast<char *>(Buf->getBufferStart()); +  char *BufPtr = Buf.get()->getBufferStart();    size_t BytesLeft = MapSize;  #ifndef HAVE_PREAD @@ -412,7 +441,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,  ErrorOr<std::unique_ptr<MemoryBuffer>>  MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,                            bool RequiresNullTerminator, bool IsVolatile) { -  return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0, +  return getOpenFileImpl<MemoryBuffer>(FD, Filename, FileSize, FileSize, 0,                           RequiresNullTerminator, IsVolatile);  } @@ -420,7 +449,8 @@ ErrorOr<std::unique_ptr<MemoryBuffer>>  MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,                                 int64_t Offset, bool IsVolatile) {    assert(MapSize != uint64_t(-1)); -  return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile); +  return getOpenFileImpl<MemoryBuffer>(FD, Filename, -1, MapSize, Offset, false, +                                       IsVolatile);  }  ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() { diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index 90992fce0bcc..9ba7a09f9962 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -586,7 +586,7 @@ bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {    APFloat::opStatus Status =        F.convertFromString(*this, APFloat::rmNearestTiesToEven);    if (Status != APFloat::opOK) { -    if (!AllowInexact || Status != APFloat::opInexact) +    if (!AllowInexact || !(Status & APFloat::opInexact))        return true;    } diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index c59068cb3550..b96ca084e9bf 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -537,7 +537,7 @@ StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) {  }  unsigned llvm::AArch64::checkArchVersion(StringRef Arch) { -  if (Arch[0] == 'v' && std::isdigit(Arch[1])) +  if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1]))      return (Arch[1] - 48);    return 0;  } @@ -633,7 +633,7 @@ StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) {    // Only match non-marketing names    if (offset != StringRef::npos) {      // Must start with 'vN'. -    if (A[0] != 'v' || !std::isdigit(A[1])) +    if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1])))        return Error;      // Can't have an extra 'eb'.      if (A.find("eb") != StringRef::npos) @@ -739,7 +739,6 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {    case ARM::ArchKind::ARMV8_2A:    case ARM::ArchKind::ARMV8_3A:      return ARM::ProfileKind::A; -    LLVM_FALLTHROUGH;    case ARM::ArchKind::ARMV2:    case ARM::ArchKind::ARMV2A:    case ARM::ArchKind::ARMV3: diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 05ca40f03018..f8a80ba87873 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -657,7 +657,12 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {        }        i = j + 1;      } else if (MustQuote == QuotingType::Double && -               !sys::unicode::isPrintable(S[j])) { +               !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) { +      // If we're double quoting non-printable characters, we prefer printing +      // them as "\x" + their hex representation. 
Note that special casing is +      // needed for UTF-8, where a byte may be part of a UTF-8 sequence and +      // appear as non-printable, in which case we want to print the correct +      // unicode character and not its hex representation.        output(StringRef(&Base[i], j - i)); // "flush"        output(StringLiteral("\\x")); diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 67138f41dda8..2ff2ee347f56 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -583,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {    switch (MI->getOpcode()) {    default:      break; +  case AArch64::MOVIv2d_ns: +    // If the target has <rdar://problem/16473581>, lower this +    // instruction to movi.16b instead. +    if (STI->hasZeroCycleZeroingFPWorkaround() && +        MI->getOperand(1).getImm() == 0) { +      MCInst TmpInst; +      TmpInst.setOpcode(AArch64::MOVIv16b_ns); +      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); +      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm())); +      EmitToStreamer(*OutStreamer, TmpInst); +      return; +    } +    break; +    case AArch64::DBG_VALUE: {      if (isVerbose() && OutStreamer->hasRawTextSupport()) {        SmallString<128> TmpStr; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index fd1699fd363d..022200986d2b 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5135,11 +5135,12 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {      return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));    } -  // fall-back to target-independent instruction selection. -  return selectOperator(I, I->getOpcode());    // Silence warnings.    (void)&CC_AArch64_DarwinPCS_VarArg;    (void)&CC_AArch64_Win64_VarArg; + +  // fall-back to target-independent instruction selection. +  return selectOperator(I, I->getOpcode());  }  namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 73944359223a..d66f7b59a4b5 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -97,6 +97,7 @@  #include "AArch64RegisterInfo.h"  #include "AArch64Subtarget.h"  #include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/CodeGen/LivePhysRegs.h" @@ -335,6 +336,22 @@ bool AArch64FrameLowering::canUseAsPrologue(    return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;  } +static bool windowsRequiresStackProbe(MachineFunction &MF, +                                      unsigned StackSizeInBytes) { +  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); +  if (!Subtarget.isTargetWindows()) +    return false; +  const Function &F = MF.getFunction(); +  // TODO: When implementing stack protectors, take that into account +  // for the probe threshold. 
+  unsigned StackProbeSize = 4096; +  if (F.hasFnAttribute("stack-probe-size")) +    F.getFnAttribute("stack-probe-size") +        .getValueAsString() +        .getAsInteger(0, StackProbeSize); +  return StackSizeInBytes >= StackProbeSize; +} +  bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(      MachineFunction &MF, unsigned StackBumpBytes) const {    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -347,7 +364,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(    // 512 is the maximum immediate for stp/ldp that will be used for    // callee-save save/restores -  if (StackBumpBytes >= 512) +  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))      return false;    if (MFI.hasVarSizedObjects()) @@ -478,7 +495,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,      return;    int NumBytes = (int)MFI.getStackSize(); -  if (!AFI->hasStackFrame()) { +  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {      assert(!HasFP && "unexpected function without stack frame but with FP");      // All of the stack allocation is for locals. @@ -550,6 +567,44 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,                      MachineInstr::FrameSetup);    } +  if (windowsRequiresStackProbe(MF, NumBytes)) { +    uint32_t NumWords = NumBytes >> 4; + +    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) +        .addImm(NumWords) +        .setMIFlags(MachineInstr::FrameSetup); + +    switch (MF.getTarget().getCodeModel()) { +    case CodeModel::Small: +    case CodeModel::Medium: +    case CodeModel::Kernel: +      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) +          .addExternalSymbol("__chkstk") +          .addReg(AArch64::X15, RegState::Implicit) +          .setMIFlags(MachineInstr::FrameSetup); +      break; +    case CodeModel::Large: +      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) +          .addReg(AArch64::X16, RegState::Define) +          .addExternalSymbol("__chkstk") +          .addExternalSymbol("__chkstk") +          .setMIFlags(MachineInstr::FrameSetup); + +      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) +          .addReg(AArch64::X16, RegState::Kill) +          .addReg(AArch64::X15, RegState::Implicit | RegState::Define) +          .setMIFlags(MachineInstr::FrameSetup); +      break; +    } + +    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) +        .addReg(AArch64::SP, RegState::Kill) +        .addReg(AArch64::X15, RegState::Kill) +        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) +        .setMIFlags(MachineInstr::FrameSetup); +    NumBytes = 0; +  } +    // Allocate space for the rest of the frame.    if (NumBytes) {      const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1164,18 +1219,32 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,    unsigned UnspilledCSGPR = AArch64::NoRegister;    unsigned UnspilledCSGPRPaired = AArch64::NoRegister; +  MachineFrameInfo &MFI = MF.getFrameInfo(); +  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + +  unsigned BasePointerReg = RegInfo->hasBasePointer(MF) +                                ? 
RegInfo->getBaseRegister() +                                : (unsigned)AArch64::NoRegister; + +  unsigned SpillEstimate = SavedRegs.count(); +  for (unsigned i = 0; CSRegs[i]; ++i) { +    unsigned Reg = CSRegs[i]; +    unsigned PairedReg = CSRegs[i ^ 1]; +    if (Reg == BasePointerReg) +      SpillEstimate++; +    if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) +      SpillEstimate++; +  } +  SpillEstimate += 2; // Conservatively include FP+LR in the estimate +  unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate; +    // The frame record needs to be created by saving the appropriate registers -  if (hasFP(MF)) { +  if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) {      SavedRegs.set(AArch64::FP);      SavedRegs.set(AArch64::LR);    } -  unsigned BasePointerReg = AArch64::NoRegister; -  if (RegInfo->hasBasePointer(MF)) -    BasePointerReg = RegInfo->getBaseRegister(); -    unsigned ExtraCSSpill = 0; -  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);    // Figure out which callee-saved registers to save/restore.    for (unsigned i = 0; CSRegs[i]; ++i) {      const unsigned Reg = CSRegs[i]; @@ -1217,7 +1286,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,    // The CSR spill slots have not been allocated yet, so estimateStackSize    // won't include them. -  MachineFrameInfo &MFI = MF.getFrameInfo();    unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;    DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");    unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1242cf5be188..6f7b2b6fd5b5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -470,10 +470,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,    if (Subtarget->hasPerfMon())      setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); -  if (Subtarget->isTargetMachO()) { -    // For iOS, we don't want to the normal expansion of a libcall to -    // sincos. We want to issue a libcall to __sincos_stret to avoid memory -    // traffic. +  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && +      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { +    // Issue __sincos_stret if available.      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);    } else { @@ -2328,8 +2327,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,    Entry.IsZExt = false;    Args.push_back(Entry); -  const char *LibcallName = -      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; +  RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 +                                        : RTLIB::SINCOS_STRET_F32; +  const char *LibcallName = getLibcallName(LC);    SDValue Callee =        DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index c7c560a81328..abbba7d1d5a9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4963,16 +4963,9 @@ void AArch64InstrInfo::insertOutlinerEpilogue(      MachineBasicBlock &MBB, MachineFunction &MF,      const MachineOutlinerInfo &MInfo) const { -  bool ContainsCalls = false; - -  for (MachineInstr &MI : MBB) { -    if (MI.isCall()) { -      ContainsCalls = true; -      break; -    } -  } - -  if (ContainsCalls) { +  // Is there a call in the outlined range? +  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), +                  [](MachineInstr &MI) { return MI.isCall(); })) {      // Fix up the instructions in the range, since we're going to modify the      // stack.      fixupPostOutline(MBB); diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 7f5507371fa0..a719d47618e5 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -25,11 +25,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(    ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);    const AArch64Subtarget &STI =        DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); -  const char *bzeroEntry = -      (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr; +  const char *bzeroName = (V && V->isNullValue()) +      ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) : nullptr;    // For small size (< 256), it is not beneficial to use bzero    // instead of memset. -  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { +  if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {      const AArch64TargetLowering &TLI = *STI.getTargetLowering();      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); @@ -45,7 +45,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(      CLI.setDebugLoc(dl)          .setChain(Chain)          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), -                      DAG.getExternalSymbol(bzeroEntry, IntPtr), +                      DAG.getExternalSymbol(bzeroName, IntPtr),                        std::move(Args))          .setDiscardResult();      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index e397d585ae77..688bb936d0ca 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -217,19 +217,6 @@ unsigned char AArch64Subtarget::classifyGlobalFunctionReference(    return AArch64II::MO_NO_FLAG;  } -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. Otherwise it -/// returns null. -const char *AArch64Subtarget::getBZeroEntry() const { -  // Prefer bzero on Darwin only. 
-  if(isTargetDarwin()) -    return "bzero"; - -  return nullptr; -} -  void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,                                             unsigned NumRegionInstrs) const {    // LNT run (at least on Cyclone) showed reasonably significant gains for diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 5d9759d363dd..9245b2f396b7 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -309,13 +309,6 @@ public:    unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,                                                  const TargetMachine &TM) const; -  /// This function returns the name of a function which has an interface -  /// like the non-standard bzero function, if such a function exists on -  /// the current subtarget and it is considered prefereable over -  /// memset with zero passed as the second argument. Otherwise it -  /// returns null. -  const char *getBZeroEntry() const; -    void overrideSchedPolicy(MachineSchedPolicy &Policy,                             unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index df939add70fa..66b7e02ceb99 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -322,6 +322,9 @@ def : ROSysReg<"PMCEID0_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b110>;  def : ROSysReg<"PMCEID1_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b111>;  def : ROSysReg<"MIDR_EL1",           0b11, 0b000, 0b0000, 0b0000, 0b000>;  def : ROSysReg<"CCSIDR_EL1",         0b11, 0b001, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CCSIDR2_EL1",        0b11, 0b001, 0b0000, 0b0000, 0b010> { +  let Requires = [{ {AArch64::HasV8_3aOps} }]; +}  def : ROSysReg<"CLIDR_EL1",          0b11, 0b001, 0b0000, 0b0000, 0b001>;  def : ROSysReg<"CTR_EL0",            0b11, 0b011, 0b0000, 0b0000, 0b001>;  def : ROSysReg<"MPIDR_EL1",          0b11, 0b000, 0b0000, 0b0000, 0b101>; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 64583ead73f2..0e6ad944c141 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -346,10 +346,9 @@ public:  } // end anonymous namespace -TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(AArch64TTIImpl(this, F)); -  }); +TargetTransformInfo +AArch64TargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(AArch64TTIImpl(this, F));  }  TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 2bbfb2da3db6..8d28a5e30ebf 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -44,8 +44,7 @@ public:    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  /// \brief Get the TargetIRAnalysis for this target. 
-  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile* getObjFileLowering() const override {      return TLOF.get(); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index aeffbd70fc81..6e63783e5646 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1975,10 +1975,6 @@ static bool isValidSVEKind(StringRef Name) {        .Default(false);  } -static bool isSVERegister(StringRef Name) { -  return Name[0] == 'z' || Name[0] == 'p'; -} -  static void parseValidVectorKind(StringRef Name, unsigned &NumElements,                                   char &ElementKind) {    assert(isValidVectorKind(Name)); @@ -2008,21 +2004,19 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,  // Matches a register name or register alias previously defined by '.req'  unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,                                                    RegKind Kind) { -  unsigned RegNum; -  switch (Kind) { -  case RegKind::Scalar: -    RegNum = MatchRegisterName(Name); -    break; -  case RegKind::NeonVector: -    RegNum = MatchNeonVectorRegName(Name); -    break; -  case RegKind::SVEDataVector: -    RegNum = matchSVEDataVectorRegName(Name); -    break; -  case RegKind::SVEPredicateVector: -    RegNum = matchSVEPredicateVectorRegName(Name); -    break; -  } +  unsigned RegNum = 0; +  if ((RegNum = matchSVEDataVectorRegName(Name))) +    return Kind == RegKind::SVEDataVector ? RegNum : 0; + +  if ((RegNum = matchSVEPredicateVectorRegName(Name))) +    return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + +  if ((RegNum = MatchNeonVectorRegName(Name))) +    return Kind == RegKind::NeonVector ? RegNum : 0; + +  // The parsed register must be of RegKind Scalar +  if ((RegNum = MatchRegisterName(Name))) +    return Kind == RegKind::Scalar ? RegNum : 0;    if (!RegNum) {      // Check for aliases registered via .req. Canonicalize to lower case. @@ -2049,10 +2043,8 @@ int AArch64AsmParser::tryParseRegister() {      return -1;    std::string lowerCase = Tok.getString().lower(); -  if (isSVERegister(lowerCase)) -    return -1; -    unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar); +    // Also handle a few aliases of registers.    
if (RegNum == 0)      RegNum = StringSwitch<unsigned>(lowerCase) diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index bb628b8c558f..fda6252f46e3 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -695,18 +695,24 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(            IsSGPR = false;            Width = 3;          } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { +          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && +            "trap handler registers should not be used");            IsSGPR = true;            Width = 4;          } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {            IsSGPR = false;            Width = 4;          } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { +          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && +            "trap handler registers should not be used");            IsSGPR = true;            Width = 8;          } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {            IsSGPR = false;            Width = 8;          } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { +          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && +            "trap handler registers should not be used");            IsSGPR = true;            Width = 16;          } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 3f8a9b1964ca..5c31bddd9b1a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -202,6 +202,16 @@ public:    const char* getTargetNodeName(unsigned Opcode) const override; +  // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection +  // for AMDGPU. +  // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036 +  // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on +  // MergeConsecutiveStores() before Instruction Selection for all targets. +  // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores() +  // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores() +  // re-merges, etc. ) to warrant turning it off for now. 
+  bool mergeStoresAfterLegalization() const override { return false; } +    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {      return true;    } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6984f4e71613..2042dbf6d5e2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -571,10 +571,9 @@ public:  } // end anonymous namespace -TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(AMDGPUTTIImpl(this, F)); -  }); +TargetTransformInfo +AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(AMDGPUTTIImpl(this, F));  }  void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { @@ -898,4 +897,3 @@ void GCNPassConfig::addPreEmitPass() {  TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {    return new GCNPassConfig(*this, PM);  } - diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 5043e31f6f5b..5f9b2a7fca20 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -55,7 +55,7 @@ public:    const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {      return &IntrinsicInfo;    } -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile *getObjFileLowering() const override {      return TLOF.get(); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2acd7f78faea..ebf656c549ec 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -536,6 +536,10 @@ public:      return EndLoc;    } +  SMRange getLocRange() const { +    return SMRange(StartLoc, EndLoc); +  } +    Modifiers getModifiers() const {      assert(isRegKind() || isImmTy(ImmTyNone));      return isRegKind() ? 
Reg.Mods : Imm.Mods; @@ -1491,6 +1495,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {        case 1: return AMDGPU::TTMP_32RegClassID;        case 2: return AMDGPU::TTMP_64RegClassID;        case 4: return AMDGPU::TTMP_128RegClassID; +      case 8: return AMDGPU::TTMP_256RegClassID; +      case 16: return AMDGPU::TTMP_512RegClassID;      }    } else if (Is == IS_SGPR) {      switch (RegWidth) { @@ -1498,8 +1504,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {        case 1: return AMDGPU::SGPR_32RegClassID;        case 2: return AMDGPU::SGPR_64RegClassID;        case 4: return AMDGPU::SGPR_128RegClassID; -      case 8: return AMDGPU::SReg_256RegClassID; -      case 16: return AMDGPU::SReg_512RegClassID; +      case 8: return AMDGPU::SGPR_256RegClassID; +      case 16: return AMDGPU::SGPR_512RegClassID;      }    }    return -1; @@ -1754,6 +1760,11 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {    // TODO: add syntactic sugar for 1/(2*PI)    bool Minus = false;    if (getLexer().getKind() == AsmToken::Minus) { +    const AsmToken NextToken = getLexer().peekTok(); +    if (!NextToken.is(AsmToken::Integer) && +        !NextToken.is(AsmToken::Real)) { +        return MatchOperand_NoMatch; +    }      Minus = true;      Parser.Lex();    } @@ -1783,7 +1794,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {      return MatchOperand_Success;    }    default: -    return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch; +    return MatchOperand_NoMatch;    }  } @@ -2244,6 +2255,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,    return true;  } +static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS, +                                            unsigned VariantID = 0); +  bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,                                                OperandVector &Operands,                                                MCStreamer &Out, @@ -2286,8 +2300,13 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,    case Match_MissingFeature:      return Error(IDLoc, "instruction not supported on this GPU"); -  case Match_MnemonicFail: -    return Error(IDLoc, "unrecognized instruction mnemonic"); +  case Match_MnemonicFail: { +    uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); +    std::string Suggestion = AMDGPUMnemonicSpellCheck( +        ((AMDGPUOperand &)*Operands[0]).getToken(), FBS); +    return Error(IDLoc, "invalid instruction" + Suggestion, +                 ((AMDGPUOperand &)*Operands[0]).getLocRange()); +  }    case Match_InvalidOperand: {      SMLoc ErrorLoc = IDLoc; @@ -3838,7 +3857,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {      return Ok? MatchOperand_Success : MatchOperand_ParseFail;    } else { -    return MatchOperand_NoMatch; +    // Swizzle "offset" operand is optional. +    // If it is omitted, try parsing other optional operands. 
+    return parseOptionalOperand(Operands);    }  } @@ -4786,6 +4807,7 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {  #define GET_REGISTER_MATCHER  #define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER  #include "AMDGPUGenAsmMatcher.inc"  // This fuction should be defined after auto-generated include so that we have diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4a3f2c975179..47a2d3f2fdc5 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -348,10 +348,12 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,    case AMDGPU::TTMP_128RegClassID:    // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in    // this bundle? -  case AMDGPU::SReg_256RegClassID: -  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in +  case AMDGPU::SGPR_256RegClassID: +  case AMDGPU::TTMP_256RegClassID: +    // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in    // this bundle? -  case AMDGPU::SReg_512RegClassID: +  case AMDGPU::SGPR_512RegClassID: +  case AMDGPU::TTMP_512RegClassID:      shift = 2;      break;    // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in @@ -441,11 +443,11 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {  }  MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const { -  return createSRegOperand(AMDGPU::SReg_256RegClassID, Val); +  return decodeDstOp(OPW256, Val);  }  MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { -  return createSRegOperand(AMDGPU::SReg_512RegClassID, Val); +  return decodeDstOp(OPW512, Val);  }  MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { @@ -593,6 +595,8 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {      return SGPR_32RegClassID;    case OPW64: return SGPR_64RegClassID;    case OPW128: return SGPR_128RegClassID; +  case OPW256: return SGPR_256RegClassID; +  case OPW512: return SGPR_512RegClassID;    }  } @@ -608,6 +612,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {      return TTMP_32RegClassID;    case OPW64: return TTMP_64RegClassID;    case OPW128: return TTMP_128RegClassID; +  case OPW256: return TTMP_256RegClassID; +  case OPW512: return TTMP_512RegClassID;    }  } @@ -659,6 +665,25 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c    }  } +MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const { +  using namespace AMDGPU::EncValues; + +  assert(Val < 128); +  assert(Width == OPW256 || Width == OPW512); + +  if (Val <= SGPR_MAX) { +    assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. 
+    return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); +  } + +  int TTmpIdx = getTTmpIdx(Val); +  if (TTmpIdx >= 0) { +    return createSRegOperand(getTtmpClassId(Width), TTmpIdx); +  } + +  llvm_unreachable("unknown dst register"); +} +  MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {    using namespace AMDGPU; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index ce396eb68c4c..75cfc5e11282 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -95,6 +95,8 @@ public:      OPW32,      OPW64,      OPW128, +    OPW256, +    OPW512,      OPW16,      OPWV216,      OPW_LAST_, @@ -110,6 +112,7 @@ public:    MCOperand decodeLiteralConstant() const;    MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; +  MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;    MCOperand decodeSpecialReg32(unsigned Val) const;    MCOperand decodeSpecialReg64(unsigned Val) const; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index 67663d39967c..bf57f88bef91 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -335,13 +335,13 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,    } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {      O << 'v';      NumRegs = 8; -  } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) { +  } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {      O << 's';      NumRegs = 8;    } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {      O << 'v';      NumRegs = 16; -  } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) { +  } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {      O << 's';      NumRegs = 16;    } else { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index 6b7c3ffb7bb8..dd0efef7f91b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -8,6 +8,26 @@  //===----------------------------------------------------------------------===//  //===----------------------------------------------------------------------===// +//  Helpers +//===----------------------------------------------------------------------===// + +class getSubRegs<int size> { +  list<SubRegIndex> ret2 = [sub0, sub1]; +  list<SubRegIndex> ret3 = [sub0, sub1, sub2]; +  list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3]; +  list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; +  list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3, +                             sub4, sub5, sub6, sub7, +                             sub8, sub9, sub10, sub11, +                             sub12, sub13, sub14, sub15]; + +  list<SubRegIndex> ret = !if(!eq(size, 2), ret2, +                              !if(!eq(size, 3), ret3, +                                  !if(!eq(size, 4), ret4, +                                      !if(!eq(size, 8), ret8, ret16)))); +} + +//===----------------------------------------------------------------------===//  //  Declarations that describe the SI registers  //===----------------------------------------------------------------------===//  class SIReg <string n, bits<16> regIdx = 0> : Register<n>, @@ -141,19 +161,19 
@@ def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,  }  // SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples<[sub0, sub1], +def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,                               [(add (decimate SGPR_32, 2)),                                (add (decimate (shl SGPR_32, 1), 2))]>;  // SGPR 128-bit registers -def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], +def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,                                [(add (decimate SGPR_32, 4)),                                 (add (decimate (shl SGPR_32, 1), 4)),                                 (add (decimate (shl SGPR_32, 2), 4)),                                 (add (decimate (shl SGPR_32, 3), 4))]>;  // SGPR 256-bit registers -def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], +def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,                                [(add (decimate SGPR_32, 4)),                                 (add (decimate (shl SGPR_32, 1), 4)),                                 (add (decimate (shl SGPR_32, 2), 4)), @@ -164,8 +184,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],                                 (add (decimate (shl SGPR_32, 7), 4))]>;  // SGPR 512-bit registers -def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, -                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], +def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,                                [(add (decimate SGPR_32, 4)),                                 (add (decimate (shl SGPR_32, 1), 4)),                                 (add (decimate (shl SGPR_32, 2), 4)), @@ -190,47 +209,125 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,  }  // Trap handler TMP 64-bit registers -def TTMP_64Regs : RegisterTuples<[sub0, sub1], +def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,                               [(add (decimate TTMP_32, 2)),                                (add (decimate (shl TTMP_32, 1), 2))]>;  // Trap handler TMP 128-bit registers -def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], +def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,                                [(add (decimate TTMP_32, 4)),                                 (add (decimate (shl TTMP_32, 1), 4)),                                 (add (decimate (shl TTMP_32, 2), 4)),                                 (add (decimate (shl TTMP_32, 3), 4))]>; -class TmpRegTuples <string tgt, -                    bit Is64Bit, -                    int Index0, -                    int Index1 = !add(Index0, 1), -                    int Index2 = !add(Index0, !if(Is64Bit, 1, 2)), -                    int Index3 = !add(Index0, !if(Is64Bit, 1, 3)), -                    string name = "ttmp["#Index0#":"#Index3#"]", -                    Register r0 = !cast<Register>("TTMP"#Index0#tgt), -                    Register r1 = !cast<Register>("TTMP"#Index1#tgt), -                    Register r2 = !cast<Register>("TTMP"#Index2#tgt), -                    Register r3 = !cast<Register>("TTMP"#Index3#tgt)> : -  RegisterWithSubRegs<name, !if(Is64Bit, [r0, r1], [r0, r1, r2, r3])> { -  let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]); -  let HWEncoding = r0.HWEncoding; -} +def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret, +                              [(add (decimate TTMP_32, 4)), +                               (add (decimate (shl TTMP_32, 1), 4)), +         
                      (add (decimate (shl TTMP_32, 2), 4)), +                               (add (decimate (shl TTMP_32, 3), 4)), +                               (add (decimate (shl TTMP_32, 4), 4)), +                               (add (decimate (shl TTMP_32, 5), 4)), +                               (add (decimate (shl TTMP_32, 6), 4)), +                               (add (decimate (shl TTMP_32, 7), 4))]>; + +def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret, +                              [(add (decimate TTMP_32, 4)), +                               (add (decimate (shl TTMP_32, 1), 4)), +                               (add (decimate (shl TTMP_32, 2), 4)), +                               (add (decimate (shl TTMP_32, 3), 4)), +                               (add (decimate (shl TTMP_32, 4), 4)), +                               (add (decimate (shl TTMP_32, 5), 4)), +                               (add (decimate (shl TTMP_32, 6), 4)), +                               (add (decimate (shl TTMP_32, 7), 4)), +                               (add (decimate (shl TTMP_32, 8), 4)), +                               (add (decimate (shl TTMP_32, 9), 4)), +                               (add (decimate (shl TTMP_32, 10), 4)), +                               (add (decimate (shl TTMP_32, 11), 4)), +                               (add (decimate (shl TTMP_32, 12), 4)), +                               (add (decimate (shl TTMP_32, 13), 4)), +                               (add (decimate (shl TTMP_32, 14), 4)), +                               (add (decimate (shl TTMP_32, 15), 4))]>; + +class TmpRegTuplesBase<int index, int size, +                       list<Register> subRegs, +                       list<SubRegIndex> indices = getSubRegs<size>.ret, +                       int index1 = !add(index, !add(size, -1)), +                       string name = "ttmp["#index#":"#index1#"]"> : +  RegisterWithSubRegs<name, subRegs> { +  let HWEncoding = subRegs[0].HWEncoding; +  let SubRegIndices = indices; +} + +class TmpRegTuples<string tgt, +                   int size, +                   int index0, +                   int index1 = !add(index0, 1), +                   int index2 = !add(index0, !if(!eq(size, 2), 1, 2)), +                   int index3 = !add(index0, !if(!eq(size, 2), 1, 3)), +                   int index4 = !add(index0, !if(!eq(size, 8), 4, 1)), +                   int index5 = !add(index0, !if(!eq(size, 8), 5, 1)), +                   int index6 = !add(index0, !if(!eq(size, 8), 6, 1)), +                   int index7 = !add(index0, !if(!eq(size, 8), 7, 1)), +                   Register r0 = !cast<Register>("TTMP"#index0#tgt), +                   Register r1 = !cast<Register>("TTMP"#index1#tgt), +                   Register r2 = !cast<Register>("TTMP"#index2#tgt), +                   Register r3 = !cast<Register>("TTMP"#index3#tgt), +                   Register r4 = !cast<Register>("TTMP"#index4#tgt), +                   Register r5 = !cast<Register>("TTMP"#index5#tgt), +                   Register r6 = !cast<Register>("TTMP"#index6#tgt), +                   Register r7 = !cast<Register>("TTMP"#index7#tgt)> : +  TmpRegTuplesBase<index0, size, +                   !if(!eq(size, 2), [r0, r1], +                       !if(!eq(size, 4), [r0, r1, r2, r3], +                                         [r0, r1, r2, r3, r4, r5, r6, r7])), +                   getSubRegs<size>.ret>;  foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { -  def TTMP#Index#_TTMP#!add(Index,1)#_vi   : TmpRegTuples<"_vi",   1, Index>; -  def 
TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>; +  def TTMP#Index#_TTMP#!add(Index,1)#_vi   : TmpRegTuples<"_vi",   2, Index>; +  def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;  }  foreach Index = {0, 4, 8, 12} in {    def TTMP#Index#_TTMP#!add(Index,1)#                   _TTMP#!add(Index,2)# -                 _TTMP#!add(Index,3)#_vi   : TmpRegTuples<"_vi",   0, Index>; +                 _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi",   4, Index>;    def TTMP#Index#_TTMP#!add(Index,1)#                   _TTMP#!add(Index,2)# -                 _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>; +                 _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;  } +foreach Index = {0, 4, 8} in { +  def TTMP#Index#_TTMP#!add(Index,1)# +                 _TTMP#!add(Index,2)# +                 _TTMP#!add(Index,3)# +                 _TTMP#!add(Index,4)# +                 _TTMP#!add(Index,5)# +                 _TTMP#!add(Index,6)# +                 _TTMP#!add(Index,7)#_vi : TmpRegTuples<"_vi",   8, Index>; +  def TTMP#Index#_TTMP#!add(Index,1)# +                 _TTMP#!add(Index,2)# +                 _TTMP#!add(Index,3)# +                 _TTMP#!add(Index,4)# +                 _TTMP#!add(Index,5)# +                 _TTMP#!add(Index,6)# +                 _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; +} + +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : +  TmpRegTuplesBase<0, 16, +                   [TTMP0_vi, TTMP1_vi, TTMP2_vi, TTMP3_vi, +                    TTMP4_vi, TTMP5_vi, TTMP6_vi, TTMP7_vi, +                    TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi, +                    TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; + +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +  TmpRegTuplesBase<0, 16, +                   [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, +                    TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, +                    TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, +                    TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; + +  // VGPR 32-bit registers  // i16/f16 only on VI+  def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -240,25 +337,25 @@ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,  }  // VGPR 64-bit registers -def VGPR_64 : RegisterTuples<[sub0, sub1], +def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,                               [(add (trunc VGPR_32, 255)),                                (add (shl VGPR_32, 1))]>;  // VGPR 96-bit registers -def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], +def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,                               [(add (trunc VGPR_32, 254)),                                (add (shl VGPR_32, 1)),                                (add (shl VGPR_32, 2))]>;  // VGPR 128-bit registers -def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], +def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,                                [(add (trunc VGPR_32, 253)),                                 (add (shl VGPR_32, 1)),                                 (add (shl VGPR_32, 2)),                                 (add (shl VGPR_32, 3))]>;  // VGPR 256-bit registers -def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], +def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,                                [(add (trunc 
VGPR_32, 249)),                                 (add (shl VGPR_32, 1)),                                 (add (shl VGPR_32, 2)), @@ -269,8 +366,7 @@ def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],                                 (add (shl VGPR_32, 7))]>;  // VGPR 512-bit registers -def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, -                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], +def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,                                [(add (trunc VGPR_32, 241)),                                 (add (shl VGPR_32, 1)),                                 (add (shl VGPR_32, 2)), @@ -368,13 +464,31 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,  } // End CopyCost = 2 -def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { +  let AllocationPriority = 11; +} + +def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { +  let isAllocatable = 0; +} + +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, +  (add SGPR_256, TTMP_256)> {    // Requires 4 s_mov_b64 to copy    let CopyCost = 4;    let AllocationPriority = 11;  } -def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> { +def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> { +  let AllocationPriority = 12; +} + +def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> { +  let isAllocatable = 0; +} + +def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, +  (add SGPR_512, TTMP_512)> {    // Requires 8 s_mov_b64 to copy    let CopyCost = 8;    let AllocationPriority = 12; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 819a7add0be4..125a3b22d0cf 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -667,6 +667,10 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {    CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \    CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \    CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ +  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ +  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ +  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ +  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \    }  #define CASE_CI_VI(node) \ diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index d2512c281a61..1acae3a88870 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -88,8 +88,7 @@ extern "C" void LLVMInitializeARCTarget() {    RegisterTargetMachine<ARCTargetMachine> X(getTheARCTarget());  } -TargetIRAnalysis ARCTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(ARCTTIImpl(this, F)); -  }); +TargetTransformInfo +ARCTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(ARCTTIImpl(this, F));  } diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h index 98021b3dc1d5..18117e3409af 100644 --- a/lib/Target/ARC/ARCTargetMachine.h +++ b/lib/Target/ARC/ARCTargetMachine.h @@ -40,7 +40,7 @@ public:    // Pass Pipeline 
Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile *getObjFileLowering() const override {      return TLOF.get();    } diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 3aac689c6310..9ffb4c2055f9 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -61,6 +61,7 @@ void initializeARMLoadStoreOptPass(PassRegistry &);  void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);  void initializeARMConstantIslandsPass(PassRegistry &);  void initializeARMExpandPseudoPass(PassRegistry &); +void initializeThumb2SizeReducePass(PassRegistry &);  } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index c1a3f639461d..c9766aa2161a 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -83,6 +83,9 @@ def FeatureDB             : SubtargetFeature<"db", "HasDataBarrier", "true",  def FeatureV7Clrex        : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",                                               "Has v7 clrex instruction">; +def FeatureDFB  : SubtargetFeature<"dfb", "HasFullDataBarrier", "true", +                                   "Has full data barrier (dfb) instruction">; +  def FeatureAcquireRelease : SubtargetFeature<"acquire-release",                                               "HasAcquireRelease", "true",                                               "Has v8 acquire/release (lda/ldaex " @@ -617,6 +620,7 @@ def ARMv83a   : Architecture<"armv8.3-a", "ARMv83a",  [HasV8_3aOps,  def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,                                                         FeatureRClass,                                                         FeatureDB, +                                                       FeatureDFB,                                                         FeatureDSP,                                                         FeatureCRC,                                                         FeatureMP, diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 0ea435062ec0..60048d4453d8 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1416,7 +1416,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,      case MVT::i8:      case MVT::i16:        needsExt = true; -    // Intentional fall-through. +      LLVM_FALLTHROUGH;      case MVT::i32:        if (isThumb2) {          if (!UseImm) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 1b4d7ff50848..aeda7c06a27a 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1041,7 +1041,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,    if (!Subtarget->isThumb1Only())      setOperationAction(ISD::SETCCE, MVT::i32, Custom); -  setOperationAction(ISD::BRCOND,    MVT::Other, Expand); +  setOperationAction(ISD::BRCOND,    MVT::Other, Custom);    setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);    setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);    setOperationAction(ISD::BR_CC,     MVT::f64,   Custom); @@ -1084,20 +1084,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,      }    } -  // Combine sin / cos into one node or libcall if possible. 
-  if (Subtarget->hasSinCos()) { -    setLibcallName(RTLIB::SINCOS_F32, "sincosf"); -    setLibcallName(RTLIB::SINCOS_F64, "sincos"); -    if (Subtarget->isTargetWatchABI()) { -      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); -      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); -    } -    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { -      // For iOS, we don't want to the normal expansion of a libcall to -      // sincos. We want to issue a libcall to __sincos_stret. -      setOperationAction(ISD::FSINCOS, MVT::f64, Custom); -      setOperationAction(ISD::FSINCOS, MVT::f32, Custom); -    } +  // Use __sincos_stret if available. +  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && +      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { +    setOperationAction(ISD::FSINCOS, MVT::f64, Custom); +    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);    }    // FP-ARMv8 implements a lot of rounding-like FP operations. @@ -1255,6 +1246,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {    case ARMISD::CMOV:          return "ARMISD::CMOV";    case ARMISD::SSAT:          return "ARMISD::SSAT"; +  case ARMISD::USAT:          return "ARMISD::USAT";    case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";    case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG"; @@ -3902,6 +3894,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {    return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);  } +// This function returns three things: the arithmetic computation itself +// (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The +// comparison and the condition code define the case in which the arithmetic +// computation *does not* overflow.  std::pair<SDValue, SDValue>  ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,                                   SDValue &ARMcc) const { @@ -3927,7 +3923,11 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,      break;    case ISD::UADDO:      ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); -    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); +    // We use ADDC here to correspond to its use in LowerUnsignedALUO. +    // We do not use it in the USUBO case as Value may not be used. +    Value = DAG.getNode(ARMISD::ADDC, dl, +                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) +                .getValue(0);      OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);      break;    case ISD::SSUBO: @@ -4205,7 +4205,7 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,            ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));  } -// Check if two chained conditionals could be converted into SSAT. +// Check if two chained conditionals could be converted into SSAT or USAT.  //  // SSAT can replace a set of two conditional selectors that bound a number to an  // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: @@ -4216,10 +4216,14 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,  //     x < k ? (x < -k ? -k : x) : k  //     etc.  // +// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is +// a power of 2. +//  // It returns true if the conversion can be done, false otherwise. -// Additionally, the variable is returned in parameter V and the constant in K. 
+// Additionally, the variable is returned in parameter V, the constant in K and +// usat is set to true if the conditional represents an unsigned saturation  static bool isSaturatingConditional(const SDValue &Op, SDValue &V, -                                    uint64_t &K) { +                                    uint64_t &K, bool &usat) {    SDValue LHS1 = Op.getOperand(0);    SDValue RHS1 = Op.getOperand(1);    SDValue TrueVal1 = Op.getOperand(2); @@ -4286,13 +4290,23 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,    int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();    int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();    int64_t PosVal = std::max(Val1, Val2); +  int64_t NegVal = std::min(Val1, Val2);    if (((Val1 > Val2 && UpperCheckOp == &Op) ||         (Val1 < Val2 && UpperCheckOp == &Op2)) && -      Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) { +      isPowerOf2_64(PosVal + 1)) { + +    // Handle the difference between USAT (unsigned) and SSAT (signed) saturation +    if (Val1 == ~Val2) +      usat = false; +    else if (NegVal == 0) +      usat = true; +    else +      return false;      V = V2;      K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive +      return true;    } @@ -4306,10 +4320,16 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {    // Try to convert two saturating conditional selects into a single SSAT    SDValue SatValue;    uint64_t SatConstant; +  bool SatUSat;    if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && -      isSaturatingConditional(Op, SatValue, SatConstant)) -    return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, -                       DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); +      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { +    if (SatUSat) +      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, +                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); +    else +      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, +                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); +  }    SDValue LHS = Op.getOperand(0);    SDValue RHS = Op.getOperand(1); @@ -4506,6 +4526,39 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {    return SDValue();  } +SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { +  SDValue Chain = Op.getOperand(0); +  SDValue Cond = Op.getOperand(1); +  SDValue Dest = Op.getOperand(2); +  SDLoc dl(Op); + +  // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction. +  unsigned Opc = Cond.getOpcode(); +  if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || +                               Opc == ISD::SSUBO || Opc == ISD::USUBO)) { +    // Only lower legal XALUO ops. +    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) +      return SDValue(); + +    // The actual operation with overflow check. +    SDValue Value, OverflowCmp; +    SDValue ARMcc; +    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + +    // Reverse the condition code. 
+    ARMCC::CondCodes CondCode = +        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); +    CondCode = ARMCC::getOppositeCondition(CondCode); +    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); +    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + +    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, +                       OverflowCmp); +  } + +  return SDValue(); +} +  SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {    SDValue Chain = Op.getOperand(0);    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); @@ -4526,6 +4579,33 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {      }    } +  // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction. +  unsigned Opc = LHS.getOpcode(); +  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && +      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || +       Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { +    // Only lower legal XALUO ops. +    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) +      return SDValue(); + +    // The actual operation with overflow check. +    SDValue Value, OverflowCmp; +    SDValue ARMcc; +    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); + +    if ((CC == ISD::SETNE) != isOneConstant(RHS)) { +      // Reverse the condition code. +      ARMCC::CondCodes CondCode = +          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); +      CondCode = ARMCC::getOppositeCondition(CondCode); +      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); +    } +    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + +    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, +                       OverflowCmp); +  } +    if (LHS.getValueType() == MVT::i32) {      SDValue ARMcc;      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); @@ -7523,10 +7603,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {    Entry.IsZExt = false;    Args.push_back(Entry); -  const char *LibcallName = -      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";    RTLIB::Libcall LC = -      (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; +      (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; +  const char *LibcallName = getLibcallName(LC);    CallingConv::ID CC = getLibcallCallingConv(LC);    SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); @@ -7782,6 +7861,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {    case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);    case ISD::SELECT:        return LowerSELECT(Op, DAG);    case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG); +  case ISD::BRCOND:        return LowerBRCOND(Op, DAG);    case ISD::BR_CC:         return LowerBR_CC(Op, DAG);    case ISD::BR_JT:         return LowerBR_JT(Op, DAG);    case ISD::VASTART:       return LowerVASTART(Op, DAG); @@ -13751,7 +13831,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,    case AtomicOrdering::SequentiallyConsistent:      if (!Inst->hasAtomicStore())        return nullptr; // Nothing to do -    /*FALLTHROUGH*/ +    LLVM_FALLTHROUGH;    case AtomicOrdering::Release:    case AtomicOrdering::AcquireRelease:      if (Subtarget->preferISHSTBarriers()) diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 0a1af8d89f9b..bf63dfae4407 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -87,6 +87,7 @@ class VectorType;        CMOV,         // ARM conditional move instructions.        SSAT,         // Signed saturation +      USAT,         // Unsigned saturation        BCC_i64, @@ -643,6 +644,7 @@ class VectorType;      SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; +    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 4e13af596300..eb8526bfeadf 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -139,6 +139,8 @@ def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,  def ARMssatnoshift   : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; +def ARMusatnoshift   : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>; +  def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -278,6 +280,9 @@ def HasDSP           : Predicate<"Subtarget->hasDSP()">,  def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,                                   AssemblerPredicate<"FeatureDB",                                                      "data-barriers">; +def HasDFB           : Predicate<"Subtarget->hasFullDataBarrier()">, +                                 AssemblerPredicate<"FeatureDFB", +                                                    "full-data-barrier">;  def HasV7Clrex  : Predicate<"Subtarget->hasV7Clrex()">,                              AssemblerPredicate<"FeatureV7Clrex",                                                 "v7 clrex">; @@ -3832,6 +3837,8 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),                 (USAT imm0_31:$pos, GPRnopc:$a, 0)>;  def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),               (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; +def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm), +             (USAT 
imm0_31:$imm, GPRnopc:$Rn, 0)>;  def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),                 (SSAT16 imm1_16:$pos, GPRnopc:$a)>;  def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos), @@ -5846,6 +5853,8 @@ include "ARMInstrNEON.td"  def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;  def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;  def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>; +// Armv8-R 'Data Full Barrier' +def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>;  // System instructions  def : MnemonicAlias<"swi", "svc">; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 670ed127da7e..4592249f5795 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2336,6 +2336,8 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),  def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),               (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; +def : T2Pat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm), +             (t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;  def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos),              (t2SSAT imm1_32:$pos, GPR:$a, 0)>;  def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), @@ -4506,6 +4508,8 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",  def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;  def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;  def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; +// Armv8-R 'Data Full Barrier' +def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;  // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional  // width specifier. diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 6bbeae2e1151..b0fd0b476920 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -669,13 +669,22 @@ bool ARMInstructionSelector::select(MachineInstr &I,      return true;    } +  using namespace TargetOpcode; +  if (I.getOpcode() == G_CONSTANT) { +    // Pointer constants should be treated the same as 32-bit integer constants. +    // Change the type and let TableGen handle it. +    unsigned ResultReg = I.getOperand(0).getReg(); +    LLT Ty = MRI.getType(ResultReg); +    if (Ty.isPointer()) +      MRI.setType(ResultReg, LLT::scalar(32)); +  } +    if (selectImpl(I, CoverageInfo))      return true;    MachineInstrBuilder MIB{MF, I};    bool isSExt = false; -  using namespace TargetOpcode;    switch (I.getOpcode()) {    case G_SEXT:      isSExt = true; @@ -741,6 +750,31 @@ bool ARMInstructionSelector::select(MachineInstr &I,      const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);      const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); +    if (SrcRegBank.getID() == ARM::FPRRegBankID) { +      // This should only happen in the obscure case where we have put a 64-bit +      // integer into a D register. Get it out of there and keep only the +      // interesting part. 
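+      // VMOVRRD copies a D register into a pair of core registers: the first
+      // GPR def receives the low word and the second the high word. For a
+      // G_TRUNC to 32 bits or less only the low word (DstReg) is needed; the
+      // high word lands in IgnoredBits and is simply left unused.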
+      assert(I.getOpcode() == G_TRUNC && "Unsupported operand for G_ANYEXT"); +      assert(DstRegBank.getID() == ARM::GPRRegBankID && +             "Unsupported combination of register banks"); +      assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size"); +      assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size"); + +      unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); +      auto InsertBefore = std::next(I.getIterator()); +      auto MovI = +          BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD)) +              .addDef(DstReg) +              .addDef(IgnoredBits) +              .addUse(SrcReg) +              .add(predOps(ARMCC::AL)); +      if (!constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI)) +        return false; + +      MIB->eraseFromParent(); +      return true; +    } +      if (SrcRegBank.getID() != DstRegBank.getID()) {        DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");        return false; @@ -754,6 +788,28 @@ bool ARMInstructionSelector::select(MachineInstr &I,      I.setDesc(TII.get(COPY));      return selectCopy(I, TII, MRI, TRI, RBI);    } +  case G_INTTOPTR: +  case G_PTRTOINT: { +    auto SrcReg = I.getOperand(1).getReg(); +    auto DstReg = I.getOperand(0).getReg(); + +    const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); +    const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + +    if (SrcRegBank.getID() != DstRegBank.getID()) { +      DEBUG(dbgs() +            << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n"); +      return false; +    } + +    if (SrcRegBank.getID() != ARM::GPRRegBankID) { +      DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n"); +      return false; +    } + +    I.setDesc(TII.get(COPY)); +    return selectCopy(I, TII, MRI, TRI, RBI); +  }    case G_SELECT:      return selectSelect(MIB, MRI);    case G_ICMP: { diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 2dd1dff64e87..8cff1f0869d0 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -126,6 +126,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {      setAction({Op, s32}, Legal);    } +  setAction({G_INTTOPTR, p0}, Legal); +  setAction({G_INTTOPTR, 1, s32}, Legal); + +  setAction({G_PTRTOINT, s32}, Legal); +  setAction({G_PTRTOINT, 1, p0}, Legal); +    for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})      setAction({Op, s32}, Legal); @@ -139,6 +145,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {    setAction({G_BRCOND, s1}, Legal);    setAction({G_CONSTANT, s32}, Legal); +  setAction({G_CONSTANT, p0}, Legal);    setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);    setAction({G_ICMP, s1}, Legal); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index b32bfd449544..fad0e98285e6 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -226,12 +226,30 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {    case G_SEXT:    case G_ZEXT:    case G_ANYEXT: -  case G_TRUNC:    case G_GEP: +  case G_INTTOPTR: +  case G_PTRTOINT:      // FIXME: We're abusing the fact that everything lives in a GPR for now; in      // the real world we would use different mappings.      
OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];      break; +  case G_TRUNC: { +    // In some cases we may end up with a G_TRUNC from a 64-bit value to a +    // 32-bit value. This isn't a real floating point trunc (that would be a +    // G_FPTRUNC). Instead it is an integer trunc in disguise, which can appear +    // because the legalizer doesn't distinguish between integer and floating +    // point values so it may leave some 64-bit integers un-narrowed. Until we +    // have a more principled solution that doesn't let such things sneak all +    // the way to this point, just map the source to a DPR and the destination +    // to a GPR. +    LLT LargeTy = MRI.getType(MI.getOperand(1).getReg()); +    OperandsMapping = +        LargeTy.getSizeInBits() <= 32 +            ? &ARM::ValueMappings[ARM::GPR3OpsIdx] +            : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], +                                  &ARM::ValueMappings[ARM::DPR3OpsIdx]}); +    break; +  }    case G_LOAD:    case G_STORE: {      LLT Ty = MRI.getType(MI.getOperand(0).getReg()); diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 4d4a88126ce6..23027e92481f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -348,11 +348,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {    return SchedModel.MispredictPenalty;  } -bool ARMSubtarget::hasSinCos() const { -  return isTargetWatchOS() || -    (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0)); -} -  bool ARMSubtarget::enableMachineScheduler() const {    // Enable the MachineScheduler before register allocation for subtargets    // with the use-misched feature. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 9301197e1387..eedb675a3304 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -236,6 +236,10 @@ protected:    /// instructions.    bool HasDataBarrier = false; +  /// HasFullDataBarrier - True if the subtarget supports DFB data barrier +  /// instruction. +  bool HasFullDataBarrier = false; +    /// HasV7Clrex - True if the subtarget supports CLREX instructions    bool HasV7Clrex = false; @@ -544,6 +548,7 @@ public:    bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; }    bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }    bool hasDataBarrier() const { return HasDataBarrier; } +  bool hasFullDataBarrier() const { return HasFullDataBarrier; }    bool hasV7Clrex() const { return HasV7Clrex; }    bool hasAcquireRelease() const { return HasAcquireRelease; } @@ -712,10 +717,6 @@ public:    unsigned getMispredictionPenalty() const; -  /// This function returns true if the target has sincos() routine in its -  /// compiler runtime or math libraries. -  bool hasSinCos() const; -    /// Returns true if machine scheduler should be enabled.    
bool enableMachineScheduler() const override; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 51982b2dab14..0f6d1eddc985 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -92,6 +92,7 @@ extern "C" void LLVMInitializeARMTarget() {    initializeARMConstantIslandsPass(Registry);    initializeARMExecutionDepsFixPass(Registry);    initializeARMExpandPseudoPass(Registry); +  initializeThumb2SizeReducePass(Registry);  }  static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -282,10 +283,9 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {    return I.get();  } -TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(ARMTTIImpl(this, F)); -  }); +TargetTransformInfo +ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(ARMTTIImpl(this, F));  }  ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 655ec3202bfb..2072bb731f0a 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -53,8 +53,7 @@ public:    const ARMSubtarget *getSubtargetImpl() const = delete;    bool isLittleEndian() const { return isLittle; } -  /// \brief Get the TargetIRAnalysis for this target. -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index cae01e415eff..43d7888075b5 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -394,25 +394,6 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,    return 1;  } -int ARMTTIImpl::getFPOpCost(Type *Ty) { -  // Use similar logic that's in ARMISelLowering: -  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access -  // to VFP. - -  if (ST->hasVFP2() && !ST->isThumb1Only()) { -    if (Ty->isFloatTy()) { -      return TargetTransformInfo::TCC_Basic; -    } - -    if (Ty->isDoubleTy()) { -      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive : -        TargetTransformInfo::TCC_Basic; -    } -  } - -  return TargetTransformInfo::TCC_Expensive; -} -  int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,                                 Type *SubTp) {    // We only handle costs of reverse and alternate shuffles for now. 
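The getTargetIRAnalysis overrides dropped from the ARM (and, further down, Hexagon and Lanai) target machines are replaced by a single virtual getTargetTransformInfo(F) hook; wrapping the result into a TargetIRAnalysis now happens once in the base TargetMachine. A rough sketch of the shape this takes, simplified and not the verbatim upstream definitions:

  // Base class: build the IR analysis from the per-target hook.
  TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
    return TargetIRAnalysis(
        [this](const Function &F) { return getTargetTransformInfo(F); });
  }

  // Each backend now only overrides the hook, as the ARM change above does:
  TargetTransformInfo
  ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) {
    return TargetTransformInfo(ARMTTIImpl(this, F));
  }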
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 99353a3219a0..cd9fa0709020 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -156,8 +156,6 @@ public:    int getAddressComputationCost(Type *Val, ScalarEvolution *SE,                                   const SCEV *Ptr); -  int getFPOpCost(Type *Ty); -    int getArithmeticInstrCost(        unsigned Opcode, Type *Ty,        TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 26fda5f22b4f..97b642c99f80 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -5581,11 +5581,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,      CanAcceptPredicationCode =          Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&          Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" && -        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" && -        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" && -        Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" && -        Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") && -        !Mnemonic.startswith("srs"); +        Mnemonic != "dmb" && Mnemonic != "dfb" && Mnemonic != "dsb" && +        Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" && +        Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" && +        Mnemonic != "stc2" && Mnemonic != "stc2l" && +        !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");    } else if (isThumbOne()) {      if (hasV6MOps())        CanAcceptPredicationCode = Mnemonic != "movs"; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index a29a2eeccfe8..53c635877675 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -2386,6 +2386,7 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,      case ARM::VLD4q32_UPD:        if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))          return MCDisassembler::Fail; +      break;      default:        break;    } @@ -3326,6 +3327,7 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,    case ARM::t2STRs:      if (Rn == 15)        return MCDisassembler::Fail; +    break;    default:      break;    } @@ -3391,6 +3393,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,        break;      case ARM::t2LDRSBs:        Inst.setOpcode(ARM::t2PLIs); +      break;      default:        break;      } @@ -3854,6 +3857,7 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,    case ARM::t2STRHi12:      if (Rn == 15)        return MCDisassembler::Fail; +    break;    default:      break;    } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 3920c73fba6a..5357e26856ea 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -45,6 +45,7 @@  using namespace llvm;  #define DEBUG_TYPE "t2-reduce-size" +#define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass"  STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");  STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones"); @@ -162,7 +163,7 @@ 
namespace {      const Thumb2InstrInfo *TII;      const ARMSubtarget *STI; -    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor); +    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor = nullptr);      bool runOnMachineFunction(MachineFunction &MF) override; @@ -172,7 +173,7 @@ namespace {      }      StringRef getPassName() const override { -      return "Thumb2 instruction size reduction pass"; +      return THUMB2_SIZE_REDUCE_NAME;      }    private: @@ -237,6 +238,9 @@ namespace {  } // end anonymous namespace +INITIALIZE_PASS(Thumb2SizeReduce, DEBUG_TYPE, THUMB2_SIZE_REDUCE_NAME, false, +                false) +  Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)      : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {    OptimizeSize = MinimizeSize = false; diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp index 6f81e020b996..1f4ef098403d 100644 --- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp +++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp @@ -56,7 +56,7 @@ void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,    if (Op.isReg()) {      O << getRegisterName(Op.getReg());    } else if (Op.isImm()) { -    O << (int32_t)Op.getImm(); +    O << formatImm((int32_t)Op.getImm());    } else {      assert(Op.isExpr() && "Expected an expression");      printExpr(Op.getExpr(), O); @@ -76,9 +76,9 @@ void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,    if (OffsetOp.isImm()) {      auto Imm = OffsetOp.getImm();      if (Imm >= 0) -      O << " + " << formatDec(Imm); +      O << " + " << formatImm(Imm);      else -      O << " - " << formatDec(-Imm); +      O << " - " << formatImm(-Imm);    } else {      assert(0 && "Expected an immediate");    } @@ -88,7 +88,7 @@ void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,                                         raw_ostream &O) {    const MCOperand &Op = MI->getOperand(OpNo);    if (Op.isImm()) -    O << (uint64_t)Op.getImm(); +    O << formatImm(Op.getImm());    else if (Op.isExpr())      printExpr(Op.getExpr(), O);    else @@ -100,7 +100,7 @@ void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo,    const MCOperand &Op = MI->getOperand(OpNo);    if (Op.isImm()) {      int16_t Imm = Op.getImm(); -    O << ((Imm >= 0) ? "+" : "") << Imm; +    O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);    } else if (Op.isExpr()) {      printExpr(Op.getExpr(), O);    } else { diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 537f97c9a987..8b6c571dee02 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -756,11 +756,11 @@ struct ShuffleMask {    ShuffleMask lo() const {      size_t H = Mask.size()/2; -    return ShuffleMask({Mask.data(), H}); +    return ShuffleMask(Mask.take_front(H));    }    ShuffleMask hi() const {      size_t H = Mask.size()/2; -    return ShuffleMask({Mask.data()+H, H}); +    return ShuffleMask(Mask.take_back(H));    }  }; @@ -836,15 +836,6 @@ namespace llvm {    };  } -// Return a submask of A that is shorter than A by |C| elements: -// - if C > 0, return a submask of A that starts at position C, -// - if C <= 0, return a submask of A that starts at 0 (reduce A by |C|). 
-static ArrayRef<int> subm(ArrayRef<int> A, int C) { -  if (C > 0) -    return { A.data()+C, A.size()-C }; -  return { A.data(), A.size()+C }; -} -  static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,                        MutableArrayRef<int> MaskR) {    unsigned VecLen = Mask.size(); @@ -910,21 +901,38 @@ bool HvxSelector::selectVectorConstants(SDNode *N) {    // Since they are generated during the selection process, the main    // selection algorithm is not aware of them. Select them directly    // here. -  if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) { -    SDValue Addr = cast<LoadSDNode>(N)->getBasePtr(); -    unsigned AddrOpc = Addr.getOpcode(); -    if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) { -      if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) { -        ISel.Select(N); -        return true; -      } +  SmallVector<SDNode*,4> Loads; +  SmallVector<SDNode*,16> WorkQ; + +  // The DAG can change (due to CSE) during selection, so cache all the +  // unselected nodes first to avoid traversing a mutating DAG. + +  auto IsLoadToSelect = [] (SDNode *N) { +    if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) { +      SDValue Addr = cast<LoadSDNode>(N)->getBasePtr(); +      unsigned AddrOpc = Addr.getOpcode(); +      if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) +        if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) +          return true;      } +    return false; +  }; + +  WorkQ.push_back(N); +  for (unsigned i = 0; i != WorkQ.size(); ++i) { +    SDNode *W = WorkQ[i]; +    if (IsLoadToSelect(W)) { +      Loads.push_back(W); +      continue; +    } +    for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j) +      WorkQ.push_back(W->getOperand(j).getNode());    } -  bool Selected = false; -  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) -    Selected = selectVectorConstants(N->getOperand(I).getNode()) || Selected; -  return Selected; +  for (SDNode *L : Loads) +    ISel.Select(L); + +  return !Loads.empty();  }  void HvxSelector::materialize(const ResultStack &Results) { @@ -1159,8 +1167,8 @@ OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,                           ResultStack &Results) {    DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});    size_t S = Bytes.size() / 2; -  OpRef L = vmuxs({Bytes.data(),   S}, OpRef::lo(Va), OpRef::lo(Vb), Results); -  OpRef H = vmuxs({Bytes.data()+S, S}, OpRef::hi(Va), OpRef::hi(Vb), Results); +  OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results); +  OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results);    return concat(L, H, Results);  } @@ -1435,7 +1443,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,        return OpRef::fail();      // Examine the rest of the mask.      for (int I = L; I < N; I += L) { -      auto S = findStrip(subm(SM.Mask,I), 1, N-I); +      auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);        // Check whether the mask element at the beginning of each strip        // increases by 2L each time.        
if (S.first - Strip.first != 2*I) @@ -1465,7 +1473,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,    std::pair<int,unsigned> PrevS = Strip;    bool Flip = false;    for (int I = L; I < N; I += L) { -    auto S = findStrip(subm(SM.Mask,I), 1, N-I); +    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);      if (S.second != PrevS.second)        return OpRef::fail();      int Diff = Flip ? PrevS.first - S.first + 2*L @@ -1524,7 +1532,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {    // First, check the non-ignored strips.    for (int I = 2*L; I < 2*N; I += 2*L) { -    auto S = findStrip(subm(SM.Mask,I), 1, N-I); +    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);      if (S.second != unsigned(L))        return OpRef::fail();      if (2*S.first != I) @@ -1532,7 +1540,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {    }    // Check the -1s.    for (int I = L; I < 2*N; I += 2*L) { -    auto S = findStrip(subm(SM.Mask,I), 0, N-I); +    auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);      if (S.first != -1 || S.second != unsigned(L))        return OpRef::fail();    } @@ -1666,8 +1674,8 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {      if (!isPowerOf2_32(X))        return OpRef::fail();      // Check the other segments of Mask. -    for (int J = 0; J < VecLen; J += I) { -      if (XorPow2(subm(SM.Mask, -J), I) != X) +    for (int J = I; J < VecLen; J += I) { +      if (XorPow2(SM.Mask.slice(J, I), I) != X)          return OpRef::fail();      }      Perm[Log2_32(X)] = Log2_32(I)-1; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 586363335df1..0e0da2ddc400 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -761,11 +761,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,      // Promote the value if needed.      switch (VA.getLocInfo()) {        default: -        // Loc info must be one of Full, SExt, ZExt, or AExt. +        // Loc info must be one of Full, BCvt, SExt, ZExt, or AExt.          
llvm_unreachable("Unknown loc info!"); -      case CCValAssign::BCvt:        case CCValAssign::Full:          break; +      case CCValAssign::BCvt: +        Arg = DAG.getBitcast(VA.getLocVT(), Arg); +        break;        case CCValAssign::SExt:          Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);          break; @@ -1135,6 +1137,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(          unsigned VReg =            RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);          RegInfo.addLiveIn(VA.getLocReg(), VReg); +        if (VA.getLocInfo() == CCValAssign::BCvt) +          RegVT = VA.getValVT();          SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);          // Treat values of type MVT::i1 specially: they are passed in          // registers of type i32, but they need to remain as values of @@ -1155,6 +1159,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(          unsigned VReg =            RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);          RegInfo.addLiveIn(VA.getLocReg(), VReg); +        if (VA.getLocInfo() == CCValAssign::BCvt) +          RegVT = VA.getValVT();          InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));        // Single Vector @@ -1715,8 +1721,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,    addRegisterClass(MVT::v4i1,  &Hexagon::PredRegsRegClass);  // ddccbbaa    addRegisterClass(MVT::v8i1,  &Hexagon::PredRegsRegClass);  // hgfedcba    addRegisterClass(MVT::i32,   &Hexagon::IntRegsRegClass); -  addRegisterClass(MVT::v4i8,  &Hexagon::IntRegsRegClass);    addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass); +  addRegisterClass(MVT::v4i8,  &Hexagon::IntRegsRegClass);    addRegisterClass(MVT::i64,   &Hexagon::DoubleRegsRegClass);    addRegisterClass(MVT::v8i8,  &Hexagon::DoubleRegsRegClass);    addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass); @@ -1735,6 +1741,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,        addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);        addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);        addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass); +      // These "short" boolean vector types should be legal because +      // they will appear as results of vector compares. If they were +      // not legal, type legalization would try to make them legal +      // and that would require using operations that do not use or +      // produce such types. That, in turn, would imply using custom +      // nodes, which would be unoptimizable by the DAG combiner. +      // The idea is to rely on target-independent operations as much +      // as possible.        
addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);        addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);        addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass); @@ -1964,9 +1978,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);    // Types natively supported: -  for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1, -                       MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32, -                       MVT::v2i32, MVT::v1i64}) { +  for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16, +                       MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {      setOperationAction(ISD::BUILD_VECTOR,       NativeVT, Custom);      setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);      setOperationAction(ISD::INSERT_VECTOR_ELT,  NativeVT, Custom); @@ -1992,63 +2005,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,      AddPromotedToType(Opc, FromTy, ToTy);    }; -  if (Subtarget.useHVXOps()) { -    bool Use64b = Subtarget.useHVX64BOps(); -    ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128; -    ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128; -    MVT ByteV = Use64b ?  MVT::v64i8 : MVT::v128i8; -    MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8; - -    setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); -    setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); -    setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal); -    setOperationAction(ISD::AND,            ByteV, Legal); -    setOperationAction(ISD::OR,             ByteV, Legal); -    setOperationAction(ISD::XOR,            ByteV, Legal); - -    for (MVT T : LegalV) { -      setIndexedLoadAction(ISD::POST_INC,  T, Legal); -      setIndexedStoreAction(ISD::POST_INC, T, Legal); - -      setOperationAction(ISD::ADD,     T, Legal); -      setOperationAction(ISD::SUB,     T, Legal); -      setOperationAction(ISD::VSELECT, T, Legal); -      if (T != ByteV) { -        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); -        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); -      } - -      setOperationAction(ISD::MUL,                T, Custom); -      setOperationAction(ISD::SETCC,              T, Custom); -      setOperationAction(ISD::BUILD_VECTOR,       T, Custom); -      setOperationAction(ISD::INSERT_SUBVECTOR,   T, Custom); -      setOperationAction(ISD::INSERT_VECTOR_ELT,  T, Custom); -      setOperationAction(ISD::EXTRACT_SUBVECTOR,  T, Custom); -      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); -      if (T != ByteV) -        setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom); -    } - -    for (MVT T : LegalV) { -      if (T == ByteV) -        continue; -      // Promote all shuffles and concats to operate on vectors of bytes. -      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); -      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV); -      setPromoteTo(ISD::AND,            T, ByteV); -      setPromoteTo(ISD::OR,             T, ByteV); -      setPromoteTo(ISD::XOR,            T, ByteV); -    } - -    for (MVT T : LegalW) { -      if (T == ByteW) -        continue; -      // Promote all shuffles and concats to operate on vectors of bytes. -      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW); -      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW); -    } -  } -    // Subtarget-specific operation actions.    
//    if (Subtarget.hasV5TOps()) { @@ -2110,6 +2066,67 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,      setIndexedStoreAction(ISD::POST_INC, VT, Legal);    } +  if (Subtarget.useHVXOps()) { +    bool Use64b = Subtarget.useHVX64BOps(); +    ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128; +    ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128; +    MVT ByteV = Use64b ?  MVT::v64i8 : MVT::v128i8; +    MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8; + +    setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); +    setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); +    setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal); +    setOperationAction(ISD::AND,            ByteV, Legal); +    setOperationAction(ISD::OR,             ByteV, Legal); +    setOperationAction(ISD::XOR,            ByteV, Legal); + +    for (MVT T : LegalV) { +      setIndexedLoadAction(ISD::POST_INC,  T, Legal); +      setIndexedStoreAction(ISD::POST_INC, T, Legal); + +      setOperationAction(ISD::ADD,     T, Legal); +      setOperationAction(ISD::SUB,     T, Legal); +      if (T != ByteV) { +        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); +        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); +      } + +      setOperationAction(ISD::MUL,                T, Custom); +      setOperationAction(ISD::SETCC,              T, Custom); +      setOperationAction(ISD::BUILD_VECTOR,       T, Custom); +      setOperationAction(ISD::INSERT_SUBVECTOR,   T, Custom); +      setOperationAction(ISD::INSERT_VECTOR_ELT,  T, Custom); +      setOperationAction(ISD::EXTRACT_SUBVECTOR,  T, Custom); +      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); +      if (T != ByteV) +        setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom); +    } + +    for (MVT T : LegalV) { +      if (T == ByteV) +        continue; +      // Promote all shuffles and concats to operate on vectors of bytes. +      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); +      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV); +      setPromoteTo(ISD::AND,            T, ByteV); +      setPromoteTo(ISD::OR,             T, ByteV); +      setPromoteTo(ISD::XOR,            T, ByteV); +    } + +    for (MVT T : LegalW) { +      // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- +      // independent) handling of it would convert it to a load, which is +      // not always the optimal choice. +      setOperationAction(ISD::BUILD_VECTOR, T, Custom); + +      if (T == ByteW) +        continue; +      // Promote all shuffles and concats to operate on vectors of bytes. +      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW); +      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW); +    } +  } +    computeRegisterProperties(&HRI);    // @@ -2256,6 +2273,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {    case HexagonISD::VINSERTW0:     return "HexagonISD::VINSERTW0";    case HexagonISD::VROR:          return "HexagonISD::VROR";    case HexagonISD::READCYCLE:     return "HexagonISD::READCYCLE"; +  case HexagonISD::VZERO:         return "HexagonISD::VZERO";    case HexagonISD::OP_END:        break;    }    return nullptr; @@ -2331,14 +2349,27 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,  TargetLoweringBase::LegalizeTypeAction  HexagonTargetLowering::getPreferredVectorAction(EVT VT) const { +  if (VT.getVectorNumElements() == 1) +    return TargetLoweringBase::TypeScalarizeVector; + +  // Always widen vectors of i1. 
+  MVT ElemTy = VT.getSimpleVT().getVectorElementType(); +  if (ElemTy == MVT::i1) +    return TargetLoweringBase::TypeWidenVector; +    if (Subtarget.useHVXOps()) {      // If the size of VT is at least half of the vector length,      // widen the vector. Note: the threshold was not selected in      // any scientific way. -    if (VT.getSizeInBits() >= Subtarget.getVectorLength()*8/2) -      return TargetLoweringBase::TypeWidenVector; +    ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes(); +    if (llvm::find(Tys, ElemTy) != Tys.end()) { +      unsigned HwWidth = 8*Subtarget.getVectorLength(); +      unsigned VecWidth = VT.getSizeInBits(); +      if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) +        return TargetLoweringBase::TypeWidenVector; +    }    } -  return TargetLowering::getPreferredVectorAction(VT); +  return TargetLoweringBase::TypeSplitVector;  }  // Lower a vector shuffle (V1, V2, V3).  V1 and V2 are the two vectors @@ -2463,21 +2494,43 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {    return DAG.getNode(ISD::BITCAST, dl, VT, Result);  } +bool +HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values, +      MVT VecTy, SelectionDAG &DAG, +      MutableArrayRef<ConstantInt*> Consts) const { +  MVT ElemTy = VecTy.getVectorElementType(); +  unsigned ElemWidth = ElemTy.getSizeInBits(); +  IntegerType *IntTy = IntegerType::get(*DAG.getContext(), ElemWidth); +  bool AllConst = true; + +  for (unsigned i = 0, e = Values.size(); i != e; ++i) { +    SDValue V = Values[i]; +    if (V.isUndef()) { +      Consts[i] = ConstantInt::get(IntTy, 0); +      continue; +    } +    if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) { +      const ConstantInt *CI = CN->getConstantIntValue(); +      Consts[i] = const_cast<ConstantInt*>(CI); +    } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) { +      const ConstantFP *CF = CN->getConstantFPValue(); +      APInt A = CF->getValueAPF().bitcastToAPInt(); +      Consts[i] = ConstantInt::get(IntTy, A.getZExtValue()); +    } else { +      AllConst = false; +    } +  } +  return AllConst; +} +  SDValue  HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,                                       MVT VecTy, SelectionDAG &DAG) const {    MVT ElemTy = VecTy.getVectorElementType();    assert(VecTy.getVectorNumElements() == Elem.size()); -  SmallVector<ConstantSDNode*,4> Consts; -  bool AllConst = true; -  for (SDValue V : Elem) { -    if (isUndef(V)) -      V = DAG.getConstant(0, dl, ElemTy); -    auto *C = dyn_cast<ConstantSDNode>(V.getNode()); -    Consts.push_back(C); -    AllConst = AllConst && C != nullptr; -  } +  SmallVector<ConstantInt*,4> Consts(Elem.size()); +  bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);    unsigned First, Num = Elem.size();    for (First = 0; First != Num; ++First) @@ -2486,6 +2539,10 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,    if (First == Num)      return DAG.getUNDEF(VecTy); +  if (AllConst && +      llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) +    return getZero(dl, VecTy, DAG); +    if (ElemTy == MVT::i16) {      assert(Elem.size() == 2);      if (AllConst) { @@ -2498,45 +2555,55 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,      return DAG.getBitcast(MVT::v2i16, N);    } -  // First try generating a constant. 
-  assert(ElemTy == MVT::i8 && Num == 4); -  if (AllConst) { -    int32_t V = (Consts[0]->getZExtValue() & 0xFF) | -                (Consts[1]->getZExtValue() & 0xFF) << 8 | -                (Consts[1]->getZExtValue() & 0xFF) << 16 | -                Consts[2]->getZExtValue() << 24; -    return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32)); -  } +  if (ElemTy == MVT::i8) { +    // First try generating a constant. +    if (AllConst) { +      int32_t V = (Consts[0]->getZExtValue() & 0xFF) | +                  (Consts[1]->getZExtValue() & 0xFF) << 8 | +                  (Consts[1]->getZExtValue() & 0xFF) << 16 | +                  Consts[2]->getZExtValue() << 24; +      return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32)); +    } -  // Then try splat. -  bool IsSplat = true; -  for (unsigned i = 0; i != Num; ++i) { -    if (i == First) -      continue; -    if (Elem[i] == Elem[First] || isUndef(Elem[i])) -      continue; -    IsSplat = false; -    break; -  } -  if (IsSplat) -    return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]); +    // Then try splat. +    bool IsSplat = true; +    for (unsigned i = 0; i != Num; ++i) { +      if (i == First) +        continue; +      if (Elem[i] == Elem[First] || isUndef(Elem[i])) +        continue; +      IsSplat = false; +      break; +    } +    if (IsSplat) { +      // Legalize the operand to VSPLAT. +      SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32); +      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext); +    } -  // Generate -  //   (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) | -  //   (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16 -  SDValue S8 = DAG.getConstant(8, dl, MVT::i32); -  SDValue V0 = DAG.getZeroExtendInReg(Elem[0], dl, MVT::i8); -  SDValue V1 = DAG.getZeroExtendInReg(Elem[1], dl, MVT::i8); -  SDValue V2 = DAG.getZeroExtendInReg(Elem[2], dl, MVT::i8); -  SDValue V3 = DAG.getZeroExtendInReg(Elem[3], dl, MVT::i8); +    // Generate +    //   (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) | +    //   (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16 +    assert(Elem.size() == 4); +    SDValue Vs[4]; +    for (unsigned i = 0; i != 4; ++i) { +      Vs[i] = DAG.getZExtOrTrunc(Elem[i], dl, MVT::i32); +      Vs[i] = DAG.getZeroExtendInReg(Vs[i], dl, MVT::i8); +    } +    SDValue S8 = DAG.getConstant(8, dl, MVT::i32); +    SDValue T0 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[1], S8}); +    SDValue T1 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[3], S8}); +    SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0}); +    SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1}); -  SDValue V4 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V1, S8}); -  SDValue V5 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V3, S8}); -  SDValue V6 = DAG.getNode(ISD::OR, dl, MVT::i32, {V0, V4}); -  SDValue V7 = DAG.getNode(ISD::OR, dl, MVT::i32, {V2, V5}); +    SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG); +    return DAG.getBitcast(MVT::v4i8, R); +  } -  SDValue T0 = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {V7, V6}, DAG); -  return DAG.getBitcast(MVT::v4i8, T0); +#ifndef NDEBUG +  dbgs() << "VecTy: " << EVT(VecTy).getEVTString() << '\n'; +#endif +  llvm_unreachable("Unexpected vector element type");  }  SDValue @@ -2545,15 +2612,8 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,    MVT ElemTy = VecTy.getVectorElementType();    assert(VecTy.getVectorNumElements() == Elem.size()); -  SmallVector<ConstantSDNode*,8> Consts; -  bool AllConst = true; -  for (SDValue V : 
Elem) { -    if (isUndef(V)) -      V = DAG.getConstant(0, dl, ElemTy); -    auto *C = dyn_cast<ConstantSDNode>(V.getNode()); -    Consts.push_back(C); -    AllConst = AllConst && C != nullptr; -  } +  SmallVector<ConstantInt*,8> Consts(Elem.size()); +  bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);    unsigned First, Num = Elem.size();    for (First = 0; First != Num; ++First) @@ -2562,6 +2622,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,    if (First == Num)      return DAG.getUNDEF(VecTy); +  if (AllConst && +      llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) +    return getZero(dl, VecTy, DAG); +    // First try splat if possible.    if (ElemTy == MVT::i16) {      bool IsSplat = true; @@ -2573,8 +2637,11 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,        IsSplat = false;        break;      } -    if (IsSplat) -      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]); +    if (IsSplat) { +      // Legalize the operand to VSPLAT. +      SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32); +      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext); +    }    }    // Then try constant. @@ -2593,10 +2660,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,    MVT HalfTy = MVT::getVectorVT(ElemTy, Num/2);    SDValue L = (ElemTy == MVT::i32)                  ? Elem[0] -                : buildVector32({Elem.data(), Num/2}, dl, HalfTy, DAG); +                : buildVector32(Elem.take_front(Num/2), dl, HalfTy, DAG);    SDValue H = (ElemTy == MVT::i32)                  ? Elem[1] -                : buildVector32({Elem.data()+Num/2, Num/2}, dl, HalfTy, DAG); +                : buildVector32(Elem.drop_front(Num/2), dl, HalfTy, DAG);    return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, {H, L});  } @@ -2696,21 +2763,41 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,  }  SDValue +HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) +      const { +  if (Ty.isVector()) { +    assert(Ty.isInteger() && "Only integer vectors are supported here"); +    unsigned W = Ty.getSizeInBits(); +    if (W <= 64) +      return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W))); +    return DAG.getNode(HexagonISD::VZERO, dl, Ty); +  } + +  if (Ty.isInteger()) +    return DAG.getConstant(0, dl, Ty); +  if (Ty.isFloatingPoint()) +    return DAG.getConstantFP(0.0, dl, Ty); +  llvm_unreachable("Invalid type for zero"); +} + +SDValue  HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {    MVT VecTy = ty(Op);    unsigned BW = VecTy.getSizeInBits(); + +  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true)) +    return LowerHvxBuildVector(Op, DAG); +    if (BW == 32 || BW == 64) { +    const SDLoc &dl(Op);      SmallVector<SDValue,8> Ops;      for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)        Ops.push_back(Op.getOperand(i));      if (BW == 32) -      return buildVector32(Ops, SDLoc(Op), VecTy, DAG); -    return buildVector64(Ops, SDLoc(Op), VecTy, DAG); +      return buildVector32(Ops, dl, VecTy, DAG); +    return buildVector64(Ops, dl, VecTy, DAG);    } -  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy)) -    return LowerHvxBuildVector(Op, DAG); -    return SDValue();  } @@ -2822,7 +2909,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {  #ifndef NDEBUG        
Op.getNode()->dumpr(&DAG);        if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END) -        errs() << "Check for a non-legal type in this operation\n"; +        errs() << "Error: check for a non-legal type in this operation\n";  #endif        llvm_unreachable("Should not custom lower this!");      case ISD::CONCAT_VECTORS:       return LowerCONCAT_VECTORS(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 0619e2e4e7f9..732834b464b4 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -70,6 +70,7 @@ namespace HexagonISD {        EH_RETURN,        DCFETCH,        READCYCLE, +      VZERO,        OP_END      }; @@ -283,6 +284,9 @@ namespace HexagonISD {      }    private: +    bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy, +                                 SelectionDAG &DAG, +                                 MutableArrayRef<ConstantInt*> Consts) const;      SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,                            SelectionDAG &DAG) const;      SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, @@ -301,6 +305,7 @@ namespace HexagonISD {        SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);        return SDValue(N, 0);      } +    SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const;      using VectorPair = std::pair<SDValue, SDValue>;      using TypePair = std::pair<MVT, MVT>; @@ -344,6 +349,13 @@ namespace HexagonISD {      SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,                             ArrayRef<int> Mask, SelectionDAG &DAG) const; +    MVT getVecBoolVT() const; + +    SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl, +                                 MVT VecTy, SelectionDAG &DAG) const; +    SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl, +                               MVT VecTy, SelectionDAG &DAG) const; +      SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;      SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index c1d44cb0e7de..51480d09d734 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -141,49 +141,50 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0,                                opCastElem(Op1, MVT::i8, DAG), ByteMask);  } +MVT +HexagonTargetLowering::getVecBoolVT() const { +  return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength()); +} +  SDValue -HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) -      const { -  const SDLoc &dl(Op); -  BuildVectorSDNode *BN = cast<BuildVectorSDNode>(Op.getNode()); -  bool IsConst = BN->isConstant(); +HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values, +                                            const SDLoc &dl, MVT VecTy, +                                            SelectionDAG &DAG) const { +  unsigned VecLen = Values.size();    MachineFunction &MF = DAG.getMachineFunction(); -  MVT VecTy = ty(Op); +  MVT ElemTy = VecTy.getVectorElementType(); +  unsigned ElemWidth = ElemTy.getSizeInBits(); +  unsigned HwLen = Subtarget.getVectorLength(); -  if (IsConst) { -    SmallVector<Constant*, 128> Elems; 
-    for (SDValue V : BN->op_values()) { -      if (auto *C = dyn_cast<ConstantSDNode>(V.getNode())) -        Elems.push_back(const_cast<ConstantInt*>(C->getConstantIntValue())); -    } -    Constant *CV = ConstantVector::get(Elems); -    unsigned Align = VecTy.getSizeInBits() / 8; +  SmallVector<ConstantInt*, 128> Consts(VecLen); +  bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts); +  if (AllConst) { +    if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) +      return getZero(dl, VecTy, DAG); + +    ArrayRef<Constant*> Tmp((Constant**)Consts.begin(), +                            (Constant**)Consts.end()); +    Constant *CV = ConstantVector::get(Tmp); +    unsigned Align = HwLen;      SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);      return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,                         MachinePointerInfo::getConstantPool(MF), Align);    } -  unsigned NumOps = Op.getNumOperands(); -  unsigned HwLen = Subtarget.getVectorLength(); -  unsigned ElemSize = VecTy.getVectorElementType().getSizeInBits() / 8; -  assert(ElemSize*NumOps == HwLen); - +  unsigned ElemSize = ElemWidth / 8; +  assert(ElemSize*VecLen == HwLen);    SmallVector<SDValue,32> Words; -  SmallVector<SDValue,32> Ops; -  for (unsigned i = 0; i != NumOps; ++i) -    Ops.push_back(Op.getOperand(i));    if (VecTy.getVectorElementType() != MVT::i32) { -    assert(ElemSize < 4 && "vNi64 should have been promoted to vNi32");      assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");      unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;      MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord); -    for (unsigned i = 0; i != NumOps; i += OpsPerWord) { -      SDValue W = buildVector32({&Ops[i], OpsPerWord}, dl, PartVT, DAG); +    for (unsigned i = 0; i != VecLen; i += OpsPerWord) { +      SDValue W = buildVector32(Values.slice(i, OpsPerWord), dl, PartVT, DAG);        Words.push_back(DAG.getBitcast(MVT::i32, W));      }    } else { -    Words.assign(Ops.begin(), Ops.end()); +    Words.assign(Values.begin(), Values.end());    }    // Construct two halves in parallel, then or them together. @@ -208,6 +209,83 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)  }  SDValue +HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values, +                                          const SDLoc &dl, MVT VecTy, +                                          SelectionDAG &DAG) const { +  // Construct a vector V of bytes, such that a comparison V >u 0 would +  // produce the required vector predicate. +  unsigned VecLen = Values.size(); +  unsigned HwLen = Subtarget.getVectorLength(); +  assert(VecLen <= HwLen || VecLen == 8*HwLen); +  SmallVector<SDValue,128> Bytes; + +  if (VecLen <= HwLen) { +    // In the hardware, each bit of a vector predicate corresponds to a byte +    // of a vector register. Calculate how many bytes does a bit of VecTy +    // correspond to. +    assert(HwLen % VecLen == 0); +    unsigned BitBytes = HwLen / VecLen; +    for (SDValue V : Values) { +      SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8) +                                 : DAG.getConstant(0, dl, MVT::i8); +      for (unsigned B = 0; B != BitBytes; ++B) +        Bytes.push_back(Ext); +    } +  } else { +    // There are as many i1 values, as there are bits in a vector register. +    // Divide the values into groups of 8 and check that each group consists +    // of the same value (ignoring undefs). 
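+    // Each hardware predicate bit covers one byte of a vector register, so
+    // with 8*HwLen incoming i1 values every group of 8 bits must collapse
+    // onto a single byte. That is only representable when all defined values
+    // within a group agree, which the assert below verifies.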
+    for (unsigned I = 0; I != VecLen; I += 8) { +      unsigned B = 0; +      // Find the first non-undef value in this group. +      for (; B != 8; ++B) { +        if (!Values[I+B].isUndef()) +          break; +      } +      SDValue F = Values[I+B]; +      SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8) +                            : DAG.getConstant(0, dl, MVT::i8); +      Bytes.push_back(Ext); +      // Verify that the rest of values in the group are the same as the +      // first. +      for (; B != 8; ++B) +        assert(Values[I+B].isUndef() || Values[I+B] == F); +    } +  } + +  MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); +  SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG); +  SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG), +                             ISD::SETUGT); +  return Cmp; +} + +SDValue +HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) +      const { +  const SDLoc &dl(Op); +  MVT VecTy = ty(Op); + +  unsigned Size = Op.getNumOperands(); +  SmallVector<SDValue,128> Ops; +  for (unsigned i = 0; i != Size; ++i) +    Ops.push_back(Op.getOperand(i)); + +  if (VecTy.getVectorElementType() == MVT::i1) +    return buildHvxVectorPred(Ops, dl, VecTy, DAG); + +  if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { +    ArrayRef<SDValue> A(Ops); +    MVT SingleTy = typeSplit(VecTy).first; +    SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG); +    SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG); +    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1); +  } + +  return buildHvxVectorSingle(Ops, dl, VecTy, DAG); +} + +SDValue  HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)        const {    // Change the type of the extracted element to i32. @@ -399,6 +477,10 @@ HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const {    //   (negate (swap-op NewCmp)),    // the condition code for the NewCmp should be calculated from the original    // CC by applying these operations in the reverse order. +  // +  // This could also be done through setCondCodeAction, but for negation it +  // uses a xor with a vector of -1s, which it obtains from BUILD_VECTOR. +  // That is far too expensive for what can be done with a single instruction.    
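+  // For example, SETNE(a,b) is handled as the negation of SETEQ(a,b), and
+  // SETLT(a,b) as the operand-swapped SETGT(b,a); the switch below derives
+  // the condition code for the rewritten compare accordingly.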
switch (CC) {      case ISD::SETNE:    // !eq diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index e2120d3de2ef..cdc2085986a5 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2899,6 +2899,8 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,  def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>; +def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>; +  def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,    [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;  def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>; @@ -2920,7 +2922,14 @@ let Predicates = [UseHVX] in {    def: OpR_RR_pat<V6_vpackoh,  pf2<HexagonVPACKO>,    VecI16, HVI16>;  } +def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>; +def vzero: PatFrag<(ops), (HexagonVZERO)>; +  let Predicates = [UseHVX] in { +  def: Pat<(VecI8  vzero), (V6_vd0)>; +  def: Pat<(VecI16 vzero), (V6_vd0)>; +  def: Pat<(VecI32 vzero), (V6_vd0)>; +    def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),             (Combinev HvxVR:$Vt, HvxVR:$Vs)>;    def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)), diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index 2ceed70c2497..1d1e85e7ac7e 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -242,7 +242,7 @@ def VecQ32  // FIXME: the register order should be defined in terms of the preferred  // allocation order...  // -def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, +def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32,    (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),         R10, R11, R29, R30, R31)>; @@ -254,7 +254,8 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,  def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,    (add R7, R6, R5, R4, R3, R2, R1, R0)> ; -def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, +def DoubleRegs : RegisterClass<"Hexagon", +  [i64, f64, v64i1, v8i8, v4i16, v2i32], 64,    (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;  def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64, diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 678ef210d0ae..af93f20d97fc 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -204,14 +204,38 @@ public:      llvm_unreachable("Invalid HVX vector length settings");    } -  bool isHVXVectorType(MVT VecTy) const { +  ArrayRef<MVT> getHVXElementTypes() const { +    static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 }; +    return makeArrayRef(Types); +  } + +  bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const {      if (!VecTy.isVector() || !useHVXOps())        return false; -    unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits(); -    if (ElemWidth < 8 || ElemWidth > 64) +    MVT ElemTy = VecTy.getVectorElementType(); +    if (!IncludeBool && ElemTy == MVT::i1) +      return false; + +    unsigned HwLen = getVectorLength(); +    unsigned NumElems = VecTy.getVectorNumElements(); +    ArrayRef<MVT> ElemTypes = getHVXElementTypes(); + +    if (IncludeBool && ElemTy == MVT::i1) { +      // Special case for the v512i1, etc. 
+      if (8*HwLen == NumElems) +        return true; +      // Boolean HVX vector types are formed from regular HVX vector types +      // by replacing the element type with i1. +      for (MVT T : ElemTypes) +        if (NumElems * T.getSizeInBits() == 8*HwLen) +          return true;        return false; +    } +      unsigned VecWidth = VecTy.getSizeInBits(); -    return VecWidth == 8*getVectorLength() || VecWidth == 16*getVectorLength(); +    if (VecWidth != 8*HwLen && VecWidth != 16*HwLen) +      return false; +    return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });    }    unsigned getL1CacheLineSize() const; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0c40a7b8f382..363b703fef28 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -258,10 +258,9 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {      });  } -TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(HexagonTTIImpl(this, F)); -  }); +TargetTransformInfo +HexagonTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(HexagonTTIImpl(this, F));  } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index acd41f920b53..a7c6a3437fbc 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -39,7 +39,7 @@ public:    void adjustPassManager(PassManagerBuilder &PMB) override;    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    HexagonTargetObjectFile *getObjFileLowering() const override {      return static_cast<HexagonTargetObjectFile*>(TLOF.get()); diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 9a73c95d6516..2c21a53b13bb 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -74,10 +74,9 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,    initAsmInfo();  } -TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(LanaiTTIImpl(this, F)); -  }); +TargetTransformInfo +LanaiTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(LanaiTTIImpl(this, F));  }  namespace { diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 2fb1a0536104..0db286ec13e7 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -42,7 +42,7 @@ public:      return &Subtarget;    } -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index ac81e6207456..2f6dd0035de3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -188,7 +188,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,    // so we have to special check for them.    
unsigned Opcode = TmpInst.getOpcode();    if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && -      (Opcode != Mips::SLL_MM) && !Binary) +      (Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary)      llvm_unreachable("unimplemented opcode in encodeInstruction()");    int NewOpcode = -1; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 50537bed8ff0..c85ee20273c0 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -38,7 +38,7 @@ class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>    let Namespace = "Mips";  } -// Mips CPU Registers +// Mips CPU Registers.  class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;  // Mips 64-bit CPU Registers diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 85193bffef56..fb79a4bf40c5 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -259,17 +259,16 @@ void MipsPassConfig::addPreRegAlloc() {    addPass(createMipsOptimizePICCallPass());  } -TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    if (Subtarget->allowMixed16_32()) { -      DEBUG(errs() << "No Target Transform Info Pass Added\n"); -      // FIXME: This is no longer necessary as the TTI returned is per-function. -      return TargetTransformInfo(F.getParent()->getDataLayout()); -    } - -    DEBUG(errs() << "Target Transform Info Pass Added\n"); -    return TargetTransformInfo(BasicTTIImpl(this, F)); -  }); +TargetTransformInfo +MipsTargetMachine::getTargetTransformInfo(const Function &F) { +  if (Subtarget->allowMixed16_32()) { +    DEBUG(errs() << "No Target Transform Info Pass Added\n"); +    // FIXME: This is no longer necessary as the TTI returned is per-function. 
+    return TargetTransformInfo(F.getParent()->getDataLayout()); +  } + +  DEBUG(errs() << "Target Transform Info Pass Added\n"); +  return TargetTransformInfo(BasicTTIImpl(this, F));  }  // Implemented by targets that want to run passes immediately before diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index ccfc9a938d9c..56e6e5d8daa2 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -44,7 +44,7 @@ public:                      CodeGenOpt::Level OL, bool JIT, bool isLittle);    ~MipsTargetMachine() override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    const MipsSubtarget *getSubtargetImpl() const {      if (Subtarget) diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 85f757878f94..d31e1cb5047b 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -180,10 +180,9 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {      });  } -TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(NVPTXTTIImpl(this, F)); -  }); +TargetTransformInfo +NVPTXTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(NVPTXTTIImpl(this, F));  }  void NVPTXPassConfig::addEarlyCSEOrGVNPass() { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 54a72a688ee3..eeebf64d39c3 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -63,7 +63,7 @@ public:    void adjustPassManager(PassManagerBuilder &) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    bool isMachineVerifierClean() const override {      return false; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c870a2256691..7902da20a010 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1531,11 +1531,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,  void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); -  DebugLoc dl; -  if (MBBI != MBB.end()) -    dl = MBBI->getDebugLoc(); +  // If we got this far a first terminator should exist. +  assert(MBBI != MBB.end() && "Failed to find the first terminator."); +  DebugLoc dl = MBBI->getDebugLoc();    const PPCInstrInfo &TII = *Subtarget.getInstrInfo();    // Create branch instruction for pseudo tail call return instruction diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 18e567fa589c..cea59de3e8a9 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11882,6 +11882,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,    SDLoc dl(N);    SDValue Op(N, 0); +  // Don't handle ppc_fp128 here or i1 conversions. 
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) +    return SDValue(); +  if (Op.getOperand(0).getValueType() == MVT::i1) +    return SDValue(); +    SDValue FirstOperand(Op.getOperand(0));    bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&      (FirstOperand.getValueType() == MVT::i8 || @@ -11910,11 +11916,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,        return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);    } -  // Don't handle ppc_fp128 here or i1 conversions. -  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) -    return SDValue(); -  if (Op.getOperand(0).getValueType() == MVT::i1) -    return SDValue();    // For i32 intermediate values, unfortunately, the conversion functions    // leave the upper 32 bits of the value are undefined. Within the set of diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index a2640727f813..474661aaaee8 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1025,9 +1025,6 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves(  //   bge    0, .LBB0_4  bool PPCMIPeephole::eliminateRedundantCompare(void) { -  // FIXME: this transformation is causing miscompiles. Disabling it for now -  // until we can resolve the issue. -  return false;    bool Simplified = false;    for (MachineBasicBlock &MBB2 : *MF) { @@ -1087,10 +1084,21 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {        // we replace it with a signed comparison if the comparison        // to be merged is a signed comparison.        // In other cases of opcode mismatch, we cannot optimize this. -      if (isEqOrNe(BI2) && + +      // We cannot change opcode when comparing against an immediate +      // if the most significant bit of the immediate is one +      // due to the difference in sign extension. 
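A concrete illustration of the sign-extension issue noted in the comment above (a hypothetical standalone example with arbitrary values): once a 16-bit immediate has its top bit set, the signed and unsigned forms of the same compare disagree, so the opcode cannot be swapped.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Reg = 5;               // value in the compared register
      const int16_t Imm = (int16_t)0x8000;  // 16-bit immediate with MSB set

      // A signed compare sign-extends Imm to -32768, while an unsigned
      // compare effectively uses 32768, so the two orderings disagree.
      bool SignedLess   = (int32_t)Reg < (int32_t)Imm;
      bool UnsignedLess = Reg < (uint32_t)(uint16_t)Imm;
      assert(!SignedLess && UnsignedLess);
      return 0;
    }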
+      auto CmpAgainstImmWithSignBit = [](MachineInstr *I) { +        if (!I->getOperand(2).isImm()) +          return false; +        int16_t Imm = (int16_t)I->getOperand(2).getImm(); +        return Imm < 0; +      }; + +      if (isEqOrNe(BI2) && !CmpAgainstImmWithSignBit(CMPI2) &&            CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode()))          NewOpCode = CMPI1->getOpcode(); -      else if (isEqOrNe(BI1) && +      else if (isEqOrNe(BI1) && !CmpAgainstImmWithSignBit(CMPI1) &&                 getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode())          NewOpCode = CMPI2->getOpcode();        else continue; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 491f25ca2c64..20a83c973026 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -451,8 +451,7 @@ void PPCPassConfig::addPreEmitPass() {    addPass(createPPCBranchSelectionPass(), false);  } -TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(PPCTTIImpl(this, F)); -  }); +TargetTransformInfo +PPCTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(PPCTTIImpl(this, F));  } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 102bf7ca59c2..75b98a815ab4 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -49,7 +49,7 @@ public:    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile *getObjFileLowering() const override {      return TLOF.get(); diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index e74d68182949..3a167a6d452a 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -257,8 +257,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {    return new SystemZPassConfig(*this, PM);  } -TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(SystemZTTIImpl(this, F)); -  }); +TargetTransformInfo +SystemZTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(SystemZTTIImpl(this, F));  } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 95ad5e339e0b..52bf8bba55de 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -44,7 +44,7 @@ public:    // Override LLVMTargetMachine    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile *getObjFileLowering() const override {      return TLOF.get(); diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index ad63c7a9cb30..c4c0dd22ee0c 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -219,10 +219,8 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }  void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; } -TargetIRAnalysis 
TargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([](const Function &F) { -    return TargetTransformInfo(F.getParent()->getDataLayout()); -  }); +TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(F.getParent()->getDataLayout());  }  void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, @@ -244,3 +242,10 @@ MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {    getNameWithPrefix(NameStr, GV, TLOF->getMangler());    return TLOF->getContext().getOrCreateSymbol(NameStr);  } + +TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { +  // Since Analysis can't depend on Target, use a std::function to invert the +  // dependency. +  return TargetIRAnalysis( +      [this](const Function &F) { return this->getTargetTransformInfo(F); }); +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 2bdba96ab674..a4bb967f36f6 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -746,6 +746,14 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {    MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();    LiveIntervals &LIS = getAnalysis<LiveIntervals>(); +  // Disable the TEE optimization if we aren't doing direct wasm object +  // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals +  // pass, which is also disabled. +  bool UseTee = true; +  if (MF.getSubtarget<WebAssemblySubtarget>() +        .getTargetTriple().isOSBinFormatELF()) +    UseTee = false; +    // Walk the instructions from the bottom up. Currently we don't look past    // block boundaries, and the blocks aren't ordered so the block visitation    // order isn't significant, but we may want to change this in the future. 
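The recurring TargetIRAnalysis-to-TargetTransformInfo change in the target machine files above follows one pattern: each target now overrides only a per-function getTargetTransformInfo hook, and the base class builds the TargetIRAnalysis wrapper once via a std::function (see the TargetMachine.cpp hunk). A self-contained sketch of that dependency inversion, with all names invented for illustration:

    #include <functional>
    #include <string>

    struct TTIStub { std::string Desc; };

    // Stand-in for TargetIRAnalysis: the analysis side only sees a callback,
    // so it never has to depend on any target headers.
    struct IRAnalysisStub {
      std::function<TTIStub(const std::string &Func)> Callback;
    };

    struct TargetMachineStub {
      virtual ~TargetMachineStub() = default;

      // Targets override just this hook (mirrors getTargetTransformInfo).
      virtual TTIStub getTTI(const std::string &Func) {
        return {"default TTI for " + Func};
      }

      // The base class wraps the hook exactly once (mirrors TargetMachine.cpp).
      IRAnalysisStub getIRAnalysis() {
        return {[this](const std::string &F) { return getTTI(F); }};
      }
    };

Each target-specific override (Hexagon, Lanai, Mips, NVPTX, and so on) then reduces to the one-line constructor call shown in the hunks above.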
@@ -811,7 +819,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {            Insert =                RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),                                      LIS, MFI, MRI, TII, TRI); -        } else if (CanMove && +        } else if (UseTee && CanMove &&                     OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {            Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,                                           MRI, TII); diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 2599064334ee..f808c063d7e4 100644 --- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -223,6 +223,8 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {  /* SINCOS_F80 */ unsupported,  /* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,  /* SINCOS_PPCF128 */ unsupported, +/* SINCOS_STRET_F32 */ unsupported, +/* SINCOS_STRET_F64 */ unsupported,  /* POW_F32 */ f32_func_f32_f32,  /* POW_F64 */ f64_func_f64_f64,  /* POW_F80 */ unsupported, @@ -390,8 +392,9 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {  // MEMORY  /* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMSET */ iPTR_func_iPTR_i32_iPTR,  /* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR, +/* MEMSET */ iPTR_func_iPTR_i32_iPTR, +/* BZERO */ unsupported,  // ELEMENT-WISE ATOMIC MEMORY  /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported, @@ -687,6 +690,8 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {  /* SINCOS_F80 */ nullptr,  /* SINCOS_F128 */ "sincosl",  /* SINCOS_PPCF128 */ nullptr, +/* SINCOS_STRET_F32 */ nullptr, +/* SINCOS_STRET_F64 */ nullptr,  /* POW_F32 */ "powf",  /* POW_F64 */ "pow",  /* POW_F80 */ nullptr, @@ -850,6 +855,7 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {  /* MEMCPY */ "memcpy",  /* MEMMOVE */ "memset",  /* MEMSET */ "memmove", +/* BZERO */ nullptr,  /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,  /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,  /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr, diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 3cc19ef5fbab..d38cde74d2ec 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -146,10 +146,9 @@ public:  };  } // end anonymous namespace -TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); -  }); +TargetTransformInfo +WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(WebAssemblyTTIImpl(this, F));  }  TargetPassConfig * diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h index 224849526514..dd826befd117 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h @@ -43,8 +43,7 @@ public:      return TLOF.get();    } -  /// \brief Get the TargetIRAnalysis for this target. 
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   bool usesPhysRegsForPEI() const override { return false; }
 };
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 78385ae1877b..239db2a74b24 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -78,7 +78,7 @@ public:
               CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
               CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
               CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
-              CPU != "c3" && CPU != "c3-2" && CPU != "lakemont";
+              CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != "";
   }
 
   unsigned getNumFixupKinds() const override {
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 08731cd0204c..7e7c35569093 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -137,7 +137,7 @@ def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
 def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
                       "Enable AVX-512 PreFetch Instructions",
                                       [FeatureAVX512]>;
-def FeaturePREFETCHWT1  : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+def FeaturePREFETCHWT1  : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
                                    "true",
                                    "Prefetch with Intent to Write and T1 Hint">;
 def FeatureDQI     : SubtargetFeature<"avx512dq", "HasDQI", "true",
@@ -263,6 +263,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
 def FeatureSoftFloat      : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
+// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
+// using a variable mask over multiple fixed shuffles.
+def FeatureFastVariableShuffle
+    : SubtargetFeature<"fast-variable-shuffle",
+                       "HasFastVariableShuffle",
+                       "true", "Shuffles with variable masks are fast">;
 // On some X86 processors, there is no performance hazard to writing only the
 // lower parts of a YMM or ZMM register without clearing the upper part.
 def FeatureFastPartialYMMorZMMWrite
@@ -620,7 +626,8 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
   FeatureERMSB,
   FeatureFMA,
   FeatureLZCNT,
-  FeatureMOVBE
+  FeatureMOVBE,
+  FeatureFastVariableShuffle
 ]>;
 
 class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
@@ -632,7 +639,8 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [    FeatureADX, -  FeatureRDSEED +  FeatureRDSEED, +  FeaturePRFCHW  ]>;  class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,                                               BDWFeatures.Value, [ @@ -669,7 +677,8 @@ def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [    FeatureLZCNT,    FeatureBMI,    FeatureBMI2, -  FeatureFMA +  FeatureFMA, +  FeaturePRFCHW  ]>;  // FIXME: define KNL model diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index 0a87fb4533c2..ba7280c29cc9 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -301,60 +301,21 @@ typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>  /// different closure that manipulates the loaded or stored value.  class Closure {  private: -  const TargetInstrInfo *TII; -  MachineRegisterInfo *MRI; -    /// Virtual registers in the closure.    DenseSet<unsigned> Edges;    /// Instructions in the closure.    SmallVector<MachineInstr *, 8> Instrs; -  /// A map of available Instruction Converters. -  const InstrConverterBaseMap &Converters; - -  /// The register domain of this closure. -  RegDomain Domain; -    /// Domains which this closure can legally be reassigned to.    std::bitset<NumDomains> LegalDstDomains; -  /// Enqueue \p Reg to be considered for addition to the closure. -  void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist); - -  /// Add \p MI to this closure. -  void encloseInstr(MachineInstr *MI); - -  /// Calculate the total cost of reassigning the closure to \p Domain. -  double calculateCost(RegDomain Domain) const; - -  /// All edges that are included in some closure. -  DenseSet<unsigned> &EnclosedEdges; - -  /// All instructions that are included in some closure. -  DenseMap<MachineInstr *, Closure *> &EnclosedInstrs; -  public: -  Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI, -          const InstrConverterBaseMap &Converters, -          std::initializer_list<RegDomain> LegalDstDomainList, -          DenseSet<unsigned> &EnclosedEdges, -          DenseMap<MachineInstr *, Closure *> &EnclosedInstrs) -      : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain), -        EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) { +  Closure(std::initializer_list<RegDomain> LegalDstDomainList) {      for (RegDomain D : LegalDstDomainList)        LegalDstDomains.set(D);    } -  /// Starting from \Reg, expand the closure as much as possible. -  void buildClosure(unsigned E); - -  /// /returns true if it is profitable to reassign the closure to \p Domain. -  bool isReassignmentProfitable(RegDomain Domain) const; - -  /// Reassign the closure to \p Domain. -  void Reassign(RegDomain Domain) const; -    /// Mark this closure as illegal for reassignment to all domains.    void setAllIllegal() { LegalDstDomains.reset(); } @@ -364,10 +325,41 @@ public:    /// \returns true if is legal to reassign this closure to domain \p RD.    bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; } +  /// Mark this closure as illegal for reassignment to domain \p RD. 
+  void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; } +    bool empty() const { return Edges.empty(); } + +  bool insertEdge(unsigned Reg) { +    return Edges.insert(Reg).second; +  } + +  using const_edge_iterator = DenseSet<unsigned>::const_iterator; +  iterator_range<const_edge_iterator> edges() const { +    return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end()); +  } + +  void addInstruction(MachineInstr *I) { +    Instrs.push_back(I); +  } + +  ArrayRef<MachineInstr *> instructions() const { +    return Instrs; +  } +  };  class X86DomainReassignment : public MachineFunctionPass { +  const X86Subtarget *STI; +  MachineRegisterInfo *MRI; +  const X86InstrInfo *TII; + +  /// All edges that are included in some closure +  DenseSet<unsigned> EnclosedEdges; + +  /// All instructions that are included in some closure. +  DenseMap<MachineInstr *, Closure *> EnclosedInstrs; +  public:    static char ID; @@ -387,22 +379,39 @@ public:    }  private: -  const X86Subtarget *STI; -  MachineRegisterInfo *MRI; -  const X86InstrInfo *TII; -    /// A map of available Instruction Converters.    InstrConverterBaseMap Converters;    /// Initialize Converters map.    void initConverters(); + +  /// Starting from \Reg, expand the closure as much as possible. +  void buildClosure(Closure &, unsigned Reg); + +  /// Enqueue \p Reg to be considered for addition to the closure. +  void visitRegister(Closure &, unsigned Reg, RegDomain &Domain, +                     SmallVectorImpl<unsigned> &Worklist); + +  /// Reassign the closure to \p Domain. +  void reassign(const Closure &C, RegDomain Domain) const; + +  /// Add \p MI to the closure. +  void encloseInstr(Closure &C, MachineInstr *MI); + +  /// /returns true if it is profitable to reassign the closure to \p Domain. +  bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const; + +  /// Calculate the total cost of reassigning the closure to \p Domain. +  double calculateCost(const Closure &C, RegDomain Domain) const;  };  char X86DomainReassignment::ID = 0;  } // End anonymous namespace. -void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) { +void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, +                                          RegDomain &Domain, +                                          SmallVectorImpl<unsigned> &Worklist) {    if (EnclosedEdges.count(Reg))      return; @@ -423,59 +432,61 @@ void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {    Worklist.push_back(Reg);  } -void Closure::encloseInstr(MachineInstr *MI) { +void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {    auto I = EnclosedInstrs.find(MI);    if (I != EnclosedInstrs.end()) { -    if (I->second != this) +    if (I->second != &C)        // Instruction already belongs to another closure, avoid conflicts between        // closure and mark this closure as illegal. -      setAllIllegal(); +      C.setAllIllegal();      return;    } -  EnclosedInstrs[MI] = this; -  Instrs.push_back(MI); +  EnclosedInstrs[MI] = &C; +  C.addInstruction(MI);    // Mark closure as illegal for reassignment to domains, if there is no    // converter for the instruction or if the converter cannot convert the    // instruction. 
-  for (unsigned i = 0; i != LegalDstDomains.size(); ++i) { -    if (LegalDstDomains[i]) { +  for (int i = 0; i != NumDomains; ++i) { +    if (C.isLegal((RegDomain)i)) {        InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});        if (!IC || !IC->isLegal(MI, TII)) -        LegalDstDomains[i] = false; +        C.setIllegal((RegDomain)i);      }    }  } -double Closure::calculateCost(RegDomain DstDomain) const { -  assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure"); +double X86DomainReassignment::calculateCost(const Closure &C, +                                            RegDomain DstDomain) const { +  assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure");    double Cost = 0.0; -  for (auto MI : Instrs) +  for (auto *MI : C.instructions())      Cost +=          Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);    return Cost;  } -bool Closure::isReassignmentProfitable(RegDomain Domain) const { -  return calculateCost(Domain) < 0.0; +bool X86DomainReassignment::isReassignmentProfitable(const Closure &C, +                                                     RegDomain Domain) const { +  return calculateCost(C, Domain) < 0.0;  } -void Closure::Reassign(RegDomain Domain) const { -  assert(isLegal(Domain) && "Cannot convert illegal closure"); +void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { +  assert(C.isLegal(Domain) && "Cannot convert illegal closure");    // Iterate all instructions in the closure, convert each one using the    // appropriate converter.    SmallVector<MachineInstr *, 8> ToErase; -  for (auto MI : Instrs) +  for (auto *MI : C.instructions())      if (Converters.lookup({Domain, MI->getOpcode()})              ->convertInstr(MI, TII, MRI))        ToErase.push_back(MI);    // Iterate all registers in the closure, replace them with registers in the    // destination domain. -  for (unsigned Reg : Edges) { +  for (unsigned Reg : C.edges()) {      MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));      for (auto &MO : MRI->use_operands(Reg)) {        if (MO.isReg()) @@ -512,18 +523,19 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,    return false;  } -void Closure::buildClosure(unsigned Reg) { +void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {    SmallVector<unsigned, 4> Worklist; -  visitRegister(Reg, Worklist); +  RegDomain Domain = NoDomain; +  visitRegister(C, Reg, Domain, Worklist);    while (!Worklist.empty()) {      unsigned CurReg = Worklist.pop_back_val();      // Register already in this closure. -    if (!Edges.insert(CurReg).second) +    if (!C.insertEdge(CurReg))        continue;      MachineInstr *DefMI = MRI->getVRegDef(CurReg); -    encloseInstr(DefMI); +    encloseInstr(C, DefMI);      // Add register used by the defining MI to the worklist.      // Do not add registers which are used in address calculation, they will be @@ -542,7 +554,7 @@ void Closure::buildClosure(unsigned Reg) {        auto &Op = DefMI->getOperand(OpIdx);        if (!Op.isReg() || !Op.isUse())          continue; -      visitRegister(Op.getReg(), Worklist); +      visitRegister(C, Op.getReg(), Domain, Worklist);      }      // Expand closure through register uses. @@ -550,10 +562,10 @@ void Closure::buildClosure(unsigned Reg) {        // We would like to avoid converting closures which calculare addresses,        // as this should remain in GPRs.        
if (usedAsAddr(UseMI, CurReg, TII)) { -        setAllIllegal(); +        C.setAllIllegal();          continue;        } -      encloseInstr(&UseMI); +      encloseInstr(C, &UseMI);        for (auto &DefOp : UseMI.defs()) {          if (!DefOp.isReg()) @@ -561,10 +573,10 @@ void Closure::buildClosure(unsigned Reg) {          unsigned DefReg = DefOp.getReg();          if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { -          setAllIllegal(); +          C.setAllIllegal();            continue;          } -        visitRegister(DefReg, Worklist); +        visitRegister(C, DefReg, Domain, Worklist);        }      }    } @@ -701,8 +713,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {    initConverters();    bool Changed = false; -  DenseSet<unsigned> EnclosedEdges; -  DenseMap<MachineInstr *, Closure *> EnclosedInstrs; +  EnclosedEdges.clear(); +  EnclosedInstrs.clear();    std::vector<Closure> Closures; @@ -719,9 +731,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {        continue;      // Calculate closure starting with Reg. -    Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges, -              EnclosedInstrs); -    C.buildClosure(Reg); +    Closure C({MaskDomain}); +    buildClosure(C, Reg);      // Collect all closures that can potentially be converted.      if (!C.empty() && C.isLegal(MaskDomain)) @@ -729,8 +740,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {    }    for (Closure &C : Closures) -    if (C.isReassignmentProfitable(MaskDomain)) { -      C.Reassign(MaskDomain); +    if (isReassignmentProfitable(C, MaskDomain)) { +      reassign(C, MaskDomain);        ++NumClosuresConverted;        Changed = true;      } diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index a6c7c5f22a3a..660c1eff3c4b 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -106,14 +106,15 @@ namespace {        if (Base_Reg.getNode())          Base_Reg.getNode()->dump();        else -        dbgs() << "nul"; -      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' -             << " Scale" << Scale << '\n' +        dbgs() << "nul\n"; +      if (BaseType == FrameIndexBase) +        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; +      dbgs() << " Scale " << Scale << '\n'               << "IndexReg ";        if (IndexReg.getNode())          IndexReg.getNode()->dump();        else -        dbgs() << "nul"; +        dbgs() << "nul\n";        dbgs() << " Disp " << Disp << '\n'               << "GV ";        if (GV) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a72f4daa5e11..5ac5d0348f8a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -461,7 +461,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,      setOperationAction(ISD::SRL_PARTS, VT, Custom);    } -  if (Subtarget.hasSSE1()) +  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())      setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);    setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom); @@ -1622,16 +1622,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,      setLibcallName(RTLIB::MUL_I128, nullptr);    } -  // Combine sin / cos into one node or libcall if possible. 
-  if (Subtarget.hasSinCos()) { -    setLibcallName(RTLIB::SINCOS_F32, "sincosf"); -    setLibcallName(RTLIB::SINCOS_F64, "sincos"); -    if (Subtarget.isTargetDarwin()) { -      // For MacOSX, we don't want the normal expansion of a libcall to sincos. -      // We want to issue a libcall to __sincos_stret to avoid memory traffic. -      setOperationAction(ISD::FSINCOS, MVT::f64, Custom); -      setOperationAction(ISD::FSINCOS, MVT::f32, Custom); -    } +  // Combine sin / cos into _sincos_stret if it is available. +  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && +      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { +    setOperationAction(ISD::FSINCOS, MVT::f64, Custom); +    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);    }    if (Subtarget.isTargetWin64()) { @@ -7480,9 +7475,9 @@ static bool isAddSub(const BuildVectorSDNode *BV,  }  /// Returns true if is possible to fold MUL and an idiom that has already been -/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1). -/// If (and only if) true is returned, the operands of FMADDSUB are written to -/// parameters \p Opnd0, \p Opnd1, \p Opnd2. +/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into  +/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the +/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.  ///  /// Prior to calling this function it should be known that there is some  /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation @@ -7505,12 +7500,12 @@ static bool isAddSub(const BuildVectorSDNode *BV,  /// recognized ADDSUB idiom with ADDSUB operation is that such replacement  /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit  /// FMADDSUB is. -static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, -                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, -                       unsigned ExpectedUses) { +static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, +                                 SelectionDAG &DAG, +                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, +                                 unsigned ExpectedUses) {    if (Opnd0.getOpcode() != ISD::FMUL || -      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || -      !Subtarget.hasAnyFMA()) +      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())      return false;    // FIXME: These checks must match the similar ones in @@ -7547,7 +7542,7 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,    SDValue Opnd2;    // TODO: According to coverage reports, the FMADDSUB transform is not    // triggered by any tests. 
-  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) +  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))      return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);    // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -11958,6 +11953,19 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,    return 0;  } +static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, +                                           ArrayRef<int> Mask, SDValue V1, +                                           SDValue V2, SelectionDAG &DAG) { +  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); +  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + +  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); +  if (V2.isUndef()) +    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + +  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} +  /// \brief Generic lowering of v16i8 shuffles.  ///  /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -12148,6 +12156,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,        if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(                DL, MVT::v16i8, V1, V2, Mask, DAG))          return Unpack; + +      // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. +      if (Subtarget.hasVBMI() && Subtarget.hasVLX()) +        return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);      }      return PSHUFB; @@ -13048,19 +13060,6 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,                       DAG.getConstant(Immediate, DL, MVT::i8));  } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, -                                           ArrayRef<int> Mask, SDValue V1, -                                           SDValue V2, SelectionDAG &DAG) { -  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); -  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - -  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); -  if (V2.isUndef()) -    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); - -  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); -} -  /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.  ///  /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -13615,6 +13614,10 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,            DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))      return PSHUFB; +  // AVX512VBMIVL can lower to VPERMB. +  if (Subtarget.hasVBMI() && Subtarget.hasVLX()) +    return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); +    // Try to simplify this by merging 128-bit lanes to enable a lane-based    // shuffle.    
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( @@ -14077,6 +14080,10 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,                                                  Zeroable, Subtarget, DAG))      return Blend; +  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( +          DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) +    return PSHUFB; +    return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);  } @@ -14212,7 +14219,9 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,      ExtVT = MVT::v4i32;      break;    case MVT::v8i1: -    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL +    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit +    // shuffle. +    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;      break;    case MVT::v16i1:      ExtVT = MVT::v16i32; @@ -14569,11 +14578,10 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,      unsigned NumElts = VecVT.getVectorNumElements();      // Extending v8i1/v16i1 to 512-bit get better performance on KNL      // than extending to 128/256bit. -    unsigned VecSize = (NumElts <= 4 ? 128 : 512); -    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts); -    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); -    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, -                              ExtVT.getVectorElementType(), Ext, Idx); +    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; +    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); +    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); +    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);      return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);    } @@ -14768,12 +14776,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,      // Non constant index. Extend source and destination,      // insert element and then truncate the result.      unsigned NumElts = VecVT.getVectorNumElements(); -    unsigned VecSize = (NumElts <= 4 ? 128 : 512); -    MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); -    MVT ExtEltVT = ExtVecVT.getVectorElementType(); +    MVT ExtEltVT = (NumElts <= 8) ? 
MVT::getIntegerVT(128 / NumElts) : MVT::i8; +    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);      SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, -      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), -      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); +      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), +      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);      return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);    } @@ -16287,21 +16294,6 @@ static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,    return SelectedVal;  } -static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, -                               SelectionDAG &DAG) { -  SDValue In = Op->getOperand(0); -  MVT InVT = In.getSimpleValueType(); - -  if (InVT.getVectorElementType() == MVT::i1) -    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); - -  if (Subtarget.hasFp256()) -    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) -      return Res; - -  return SDValue(); -} -  static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,                                  SelectionDAG &DAG) {    SDValue In = Op.getOperand(0); @@ -16440,7 +16432,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,      assert((InVT.is256BitVector() || InVT.is128BitVector()) &&             "Unexpected vector type.");      unsigned NumElts = InVT.getVectorNumElements(); -    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); +    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); +    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);      InVT = ExtVT;      ShiftInx = InVT.getScalarSizeInBits() - 1; @@ -18446,6 +18439,21 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,    return V;  } +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, +                               SelectionDAG &DAG) { +  SDValue In = Op->getOperand(0); +  MVT InVT = In.getSimpleValueType(); + +  if (InVT.getVectorElementType() == MVT::i1) +    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); + +  if (Subtarget.hasFp256()) +    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) +      return Res; + +  return SDValue(); +} +  // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.  // For sign extend this needs to handle all vector sizes and SSE4.1 and  // non-SSE4.1 targets. 
For zero extend this should only handle inputs of @@ -21128,7 +21136,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,    // ADC/ADCX/SBB    case ADX: {      SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); -    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32); +    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);      SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),                                  DAG.getConstant(-1, dl, MVT::i8));      SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), @@ -22231,6 +22239,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,                           DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));      } +    assert(VT == MVT::v16i8 && "Unexpected VT"); +      SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);      SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);      SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); @@ -22989,12 +22999,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,        (Subtarget.hasAVX512() && VT == MVT::v16i16) ||        (Subtarget.hasAVX512() && VT == MVT::v16i8) ||        (Subtarget.hasBWI() && VT == MVT::v32i8)) { -    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32); +    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && +           "Unexpected vector type"); +    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;      MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());      unsigned ExtOpc =          Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;      R = DAG.getNode(ExtOpc, dl, ExtVT, R); -    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); +    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);      return DAG.getNode(ISD::TRUNCATE, dl, VT,                         DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));    } @@ -24101,8 +24113,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,    // Only optimize x86_64 for now. i386 is a bit messy. For f32,    // the small struct {f32, f32} is returned in (eax, edx). For f64,    // the results are returned via SRet in memory. -  const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; +  const char *LibcallName = TLI.getLibcallName(LC);    SDValue Callee =        DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -24928,7 +24941,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,    case ISD::BITCAST: {      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");      EVT DstVT = N->getValueType(0); -    EVT SrcVT = N->getOperand(0)->getValueType(0); +    EVT SrcVT = N->getOperand(0).getValueType();      if (SrcVT != MVT::f64 ||          (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) @@ -28407,8 +28420,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,    // TODO - attempt to narrow Mask back to writemask size.    bool IsEVEXShuffle =        RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); -  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) -    return SDValue();    // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. 
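On the LowerShift hunk above, where the widened shift amount switches from ANY_EXTEND to ZERO_EXTEND: presumably the point is that only the low bits of the original amount are meaningful, and any-extending would let undefined high bits push the widened shift count out of range. A scalar sketch of the widen-shift-truncate sequence (illustrative only, helper name invented):

    #include <cassert>
    #include <cstdint>

    // Widen an i8 shift to i16, shift, then truncate back. The amount must
    // be zero-extended: if its high bits were undefined, the i16 shift count
    // could exceed 15 and the result before truncation would be undefined.
    uint8_t shlViaWidening(uint8_t V, uint8_t Amt) {
      uint16_t WideV = V;
      uint16_t WideAmt = Amt;              // zero-extension keeps Amt unchanged
      return (uint8_t)(WideV << WideAmt);  // truncate back to the narrow type
    }

    int main() {
      assert(shlViaWidening(0x81, 1) == 0x02);
      return 0;
    }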
@@ -28491,11 +28502,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,      if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,                                  V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, -                                ShuffleVT)) { +                                ShuffleVT) && +        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {        if (Depth == 1 && Root.getOpcode() == Shuffle)          return SDValue(); // Nothing to do! -      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) -        return SDValue(); // AVX512 Writemask clash.        Res = DAG.getBitcast(ShuffleSrcVT, V1);        DCI.AddToWorklist(Res.getNode());        Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -28505,11 +28515,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,      if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,                                         AllowIntDomain, Subtarget, Shuffle, -                                       ShuffleVT, PermuteImm)) { +                                       ShuffleVT, PermuteImm) && +        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {        if (Depth == 1 && Root.getOpcode() == Shuffle)          return SDValue(); // Nothing to do! -      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) -        return SDValue(); // AVX512 Writemask clash.        Res = DAG.getBitcast(ShuffleVT, V1);        DCI.AddToWorklist(Res.getNode());        Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, @@ -28520,12 +28529,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,    }    if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, -                               V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, -                               ShuffleVT, UnaryShuffle)) { +                               V1, V2, DL, DAG, Subtarget, Shuffle, +                               ShuffleSrcVT, ShuffleVT, UnaryShuffle) && +      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {      if (Depth == 1 && Root.getOpcode() == Shuffle)        return SDValue(); // Nothing to do! -    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) -      return SDValue(); // AVX512 Writemask clash.      V1 = DAG.getBitcast(ShuffleSrcVT, V1);      DCI.AddToWorklist(V1.getNode());      V2 = DAG.getBitcast(ShuffleSrcVT, V2); @@ -28538,11 +28546,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,    if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,                                        AllowIntDomain, V1, V2, DL, DAG,                                        Subtarget, Shuffle, ShuffleVT, -                                      PermuteImm)) { +                                      PermuteImm) && +      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {      if (Depth == 1 && Root.getOpcode() == Shuffle)        return SDValue(); // Nothing to do! -    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) -      return SDValue(); // AVX512 Writemask clash.      
V1 = DAG.getBitcast(ShuffleVT, V1);      DCI.AddToWorklist(V1.getNode());      V2 = DAG.getBitcast(ShuffleVT, V2); @@ -28594,8 +28601,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,      return SDValue();    // Depth threshold above which we can efficiently use variable mask shuffles. -  // TODO This should probably be target specific. -  bool AllowVariableMask = (Depth >= 3) || HasVariableMask; +  int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; +  bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;    bool MaskContainsZeros =        any_of(Mask, [](int M) { return M == SM_SentinelZero; }); @@ -29698,17 +29705,18 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,    return SDValue();  } -/// Returns true iff the shuffle node \p N can be replaced with ADDSUB -/// operation. If true is returned then the operands of ADDSUB operation +/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) +/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation  /// are written to the parameters \p Opnd0 and \p Opnd1.  /// -/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes +/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes  /// so it is easier to generically match. We also insert dummy vector shuffle  /// nodes for the operands which explicitly discard the lanes which are unused  /// by this operation to try to flow through the rest of the combiner  /// the fact that they're unused. -static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, -                     SDValue &Opnd0, SDValue &Opnd1) { +static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, +                             SDValue &Opnd0, SDValue &Opnd1,  +                             bool matchSubAdd = false) {    EVT VT = N->getValueType(0);    if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && @@ -29728,12 +29736,15 @@ static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,    SDValue V1 = N->getOperand(0);    SDValue V2 = N->getOperand(1); -  // We require the first shuffle operand to be the FSUB node, and the second to -  // be the FADD node. -  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { +  unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB; +  unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD; + +  // We require the first shuffle operand to be the ExpectedOpcode node, +  // and the second to be the NextExpectedOpcode node. +  if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {      ShuffleVectorSDNode::commuteMask(Mask);      std::swap(V1, V2); -  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) +  } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)      return false;    // If there are other uses of these operations we can't fold them. 
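For reference on the isAddSubOrSubAdd change above: the blend being matched alternates subtraction and addition across lanes, either subtract-on-even/add-on-odd (ADDSUB) or the reverse (SUBADD); when an FMUL feeds the pattern it becomes FMADDSUB or FMSUBADD. A scalar sketch of the lane pattern (illustrative, not the DAG code):

    #include <array>
    #include <cstddef>

    // ADDSUB: even lanes compute A - B, odd lanes compute A + B.
    // SUBADD: even lanes compute A + B, odd lanes compute A - B.
    template <std::size_t N>
    std::array<float, N> addSubOrSubAdd(const std::array<float, N> &A,
                                        const std::array<float, N> &B,
                                        bool SubAdd) {
      std::array<float, N> R{};
      for (std::size_t I = 0; I != N; ++I) {
        bool EvenLane = (I % 2) == 0;
        R[I] = (EvenLane != SubAdd) ? A[I] - B[I] : A[I] + B[I];
      }
      return R;
    }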
@@ -29767,7 +29778,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,                                                  const X86Subtarget &Subtarget,                                                  SelectionDAG &DAG) {    SDValue Opnd0, Opnd1; -  if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) +  if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))      return SDValue();    EVT VT = N->getValueType(0); @@ -29775,7 +29786,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,    // Try to generate X86ISD::FMADDSUB node here.    SDValue Opnd2; -  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) +  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))      return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);    // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -29787,6 +29798,26 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,    return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);  } +/// \brief Try to combine a shuffle into a target-specific +/// mul-sub-add node. +static SDValue combineShuffleToFMSubAdd(SDNode *N, +                                        const X86Subtarget &Subtarget, +                                        SelectionDAG &DAG) { +  SDValue Opnd0, Opnd1; +  if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true)) +    return SDValue(); + +  EVT VT = N->getValueType(0); +  SDLoc DL(N); + +  // Try to generate X86ISD::FMSUBADD node here. +  SDValue Opnd2; +  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) +    return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2); + +  return SDValue(); +} +  // We are looking for a shuffle where both sources are concatenated with undef  // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so  // if we can express this as a single-source shuffle, that's preferable. @@ -29873,11 +29904,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,    EVT VT = N->getValueType(0);    const TargetLowering &TLI = DAG.getTargetLoweringInfo();    // If we have legalized the vector types, look for blends of FADD and FSUB -  // nodes that we can fuse into an ADDSUB node. +  // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.    if (TLI.isTypeLegal(VT)) {      if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))        return AddSub; +    if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG)) +      return FMSubAdd; +      if (SDValue HAddSub = foldShuffleOfHorizOp(N))        return HAddSub;    } @@ -30181,7 +30215,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,      // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))      // sign-extend to a 256-bit operation to avoid truncation.      if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && -        N0->getOperand(0)->getValueType(0).is256BitVector()) { +        N0->getOperand(0).getValueType().is256BitVector()) {        SExtVT = MVT::v4i64;        FPCastVT = MVT::v4f64;      } @@ -30194,8 +30228,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,      // 256-bit because the shuffle is cheaper than sign extending the result of      // the compare.      
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && -        (N0->getOperand(0)->getValueType(0).is256BitVector() || -         N0->getOperand(0)->getValueType(0).is512BitVector())) { +        (N0->getOperand(0).getValueType().is256BitVector() || +         N0->getOperand(0).getValueType().is512BitVector())) {        SExtVT = MVT::v8i32;        FPCastVT = MVT::v8f32;      } @@ -30484,7 +30518,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,    return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);  } -// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW. +// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with +// PHMINPOSUW.  static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,                                               const X86Subtarget &Subtarget) {    // Bail without SSE41. @@ -30492,7 +30527,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,      return SDValue();    EVT ExtractVT = Extract->getValueType(0); -  if (ExtractVT != MVT::i16) +  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)      return SDValue();    // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. @@ -30504,7 +30539,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,    EVT SrcVT = Src.getValueType();    EVT SrcSVT = SrcVT.getScalarType(); -  if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0) +  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)      return SDValue();    SDLoc DL(Extract); @@ -30520,22 +30555,39 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,      SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);      MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);    } -  assert(SrcVT == MVT::v8i16 && "Unexpected value type"); +  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || +          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && +         "Unexpected value type");    // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask    // to flip the value accordingly.    SDValue Mask; +  unsigned MaskEltsBits = ExtractVT.getSizeInBits();    if (BinOp == ISD::SMAX) -    Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT); +    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);    else if (BinOp == ISD::SMIN) -    Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT); +    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);    else if (BinOp == ISD::UMAX) -    Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT); +    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);    if (Mask)      MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); -  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos); +  // For v16i8 cases we need to perform UMIN on pairs of byte elements, +  // shuffling each upper element down and insert zeros. This means that the +  // v16i8 UMIN will leave the upper element as zero, performing zero-extension +  // ready for the PHMINPOS. 
+  if (ExtractVT == MVT::i8) { +    SDValue Upper = DAG.getVectorShuffle( +        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL), +        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); +    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); +  } + +  // Perform the PHMINPOS on a v8i16 vector, +  MinPos = DAG.getBitcast(MVT::v8i16, MinPos); +  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); +  MinPos = DAG.getBitcast(SrcVT, MinPos);    if (Mask)      MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); @@ -30851,7 +30903,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,    if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))      return Cmp; -  // Attempt to replace min/max v8i16 reductions with PHMINPOSUW. +  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.    if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))      return MinMax; @@ -32555,7 +32607,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {    // 1. MOVs can write to a register that differs from source    // 2. MOVs accept memory operands -  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || +  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||        N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||        N0.getOperand(1).getOpcode() != ISD::Constant)      return SDValue(); @@ -32569,11 +32621,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {    if (SarConst.isNegative())      return SDValue(); -  for (MVT SVT : MVT::integer_valuetypes()) { +  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {      unsigned ShiftSize = SVT.getSizeInBits();      // skipping types without corresponding sext/zext and      // ShlConst that is not one of [56,48,32,24,16] -    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) +    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)        continue;      SDLoc DL(N);      SDValue NN = @@ -32626,37 +32678,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {    return SDValue();  } -/// \brief Returns a vector of 0s if the node in input is a vector logical -/// shift by a constant amount which is known to be bigger than or equal -/// to the vector element size in bits. -static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, -                                      const X86Subtarget &Subtarget) { -  EVT VT = N->getValueType(0); - -  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && -      (!Subtarget.hasInt256() || -       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) -    return SDValue(); - -  SDValue Amt = N->getOperand(1); -  SDLoc DL(N); -  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) -    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { -      const APInt &ShiftAmt = AmtSplat->getAPIntValue(); -      unsigned MaxAmount = -        VT.getSimpleVT().getScalarSizeInBits(); - -      // SSE2/AVX2 logical shifts always return a vector of 0s -      // if the shift amount is bigger than or equal to -      // the element size. The constant shift amount will be -      // encoded as a 8-bit immediate. 
-      if (ShiftAmt.trunc(8).uge(MaxAmount)) -        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); -    } - -  return SDValue(); -} -  static SDValue combineShift(SDNode* N, SelectionDAG &DAG,                              TargetLowering::DAGCombinerInfo &DCI,                              const X86Subtarget &Subtarget) { @@ -32672,11 +32693,6 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,      if (SDValue V = combineShiftRightLogical(N, DAG))        return V; -  // Try to fold this logical shift into a zero vector. -  if (N->getOpcode() != ISD::SRA) -    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) -      return V; -    return SDValue();  } @@ -32996,21 +33012,20 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {  // register. In most cases we actually compare or select YMM-sized registers  // and mixing the two types creates horrible code. This method optimizes  // some of the transition sequences. +// Even with AVX-512 this is still useful for removing casts around logical +// operations on vXi1 mask types.  static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,                                   TargetLowering::DAGCombinerInfo &DCI,                                   const X86Subtarget &Subtarget) {    EVT VT = N->getValueType(0); -  if (!VT.is256BitVector()) -    return SDValue(); +  assert(VT.isVector() && "Expected vector type");    assert((N->getOpcode() == ISD::ANY_EXTEND ||            N->getOpcode() == ISD::ZERO_EXTEND ||            N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");    SDValue Narrow = N->getOperand(0); -  EVT NarrowVT = Narrow->getValueType(0); -  if (!NarrowVT.is128BitVector()) -    return SDValue(); +  EVT NarrowVT = Narrow.getValueType();    if (Narrow->getOpcode() != ISD::XOR &&        Narrow->getOpcode() != ISD::AND && @@ -33026,12 +33041,12 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,      return SDValue();    // The type of the truncated inputs. -  EVT WideVT = N0->getOperand(0)->getValueType(0); -  if (WideVT != VT) +  if (N0->getOperand(0).getValueType() != VT)      return SDValue();    // The right side has to be a 'trunc' or a constant vector. -  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; +  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && +                  N1.getOperand(0).getValueType() == VT;    ConstantSDNode *RHSConstSplat = nullptr;    if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))      RHSConstSplat = RHSBV->getConstantSplatNode(); @@ -33040,37 +33055,31 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); -  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) +  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))      return SDValue();    // Set N0 and N1 to hold the inputs to the new wide operation.    N0 = N0->getOperand(0);    if (RHSConstSplat) { -    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), +    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(),                       SDValue(RHSConstSplat, 0)); -    N1 = DAG.getSplatBuildVector(WideVT, DL, N1); +    N1 = DAG.getSplatBuildVector(VT, DL, N1);    } else if (RHSTrunc) {      N1 = N1->getOperand(0);    }    // Generate the wide operation. 
-  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); +  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);    unsigned Opcode = N->getOpcode();    switch (Opcode) { +  default: llvm_unreachable("Unexpected opcode");    case ISD::ANY_EXTEND:      return Op; -  case ISD::ZERO_EXTEND: { -    unsigned InBits = NarrowVT.getScalarSizeInBits(); -    APInt Mask = APInt::getAllOnesValue(InBits); -    Mask = Mask.zext(VT.getScalarSizeInBits()); -    return DAG.getNode(ISD::AND, DL, VT, -                       Op, DAG.getConstant(Mask, DL, VT)); -  } +  case ISD::ZERO_EXTEND: +    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());    case ISD::SIGN_EXTEND:      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,                         Op, DAG.getValueType(NarrowVT)); -  default: -    llvm_unreachable("Unexpected opcode");    }  } @@ -33882,16 +33891,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,    if (!Subtarget.hasSSE2())      return SDValue(); -  if (Subtarget.hasBWI()) { -    if (VT.getSizeInBits() > 512) -      return SDValue(); -  } else if (Subtarget.hasAVX2()) { -    if (VT.getSizeInBits() > 256) -      return SDValue(); -  } else { -    if (VT.getSizeInBits() > 128) -      return SDValue(); -  }    // Detect the following pattern:    // @@ -33903,7 +33902,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,    //   %6 = trunc <N x i32> %5 to <N x i8>    //    // In AVX512, the last instruction can also be a trunc store. -    if (In.getOpcode() != ISD::SRL)      return SDValue(); @@ -33924,6 +33922,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,      return true;    }; +  // Split vectors to legal target size and apply AVG. +  auto LowerToAVG = [&](SDValue Op0, SDValue Op1) { +    unsigned NumSubs = 1; +    if (Subtarget.hasBWI()) { +      if (VT.getSizeInBits() > 512) +        NumSubs = VT.getSizeInBits() / 512; +    } else if (Subtarget.hasAVX2()) { +      if (VT.getSizeInBits() > 256) +        NumSubs = VT.getSizeInBits() / 256; +    } else { +      if (VT.getSizeInBits() > 128) +        NumSubs = VT.getSizeInBits() / 128; +    } + +    if (NumSubs == 1) +      return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1); + +    SmallVector<SDValue, 4> Subs; +    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), +                                 VT.getVectorNumElements() / NumSubs); +    for (unsigned i = 0; i != NumSubs; ++i) { +      unsigned Idx = i * SubVT.getVectorNumElements(); +      SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits()); +      SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits()); +      Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS)); +    } +    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); +  }; +    // Check if each element of the vector is left-shifted by one.    
auto LHS = In.getOperand(0);    auto RHS = In.getOperand(1); @@ -33947,8 +33974,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,      SDValue VecOnes = DAG.getConstant(1, DL, InVT);      Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);      Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); -    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), -                       Operands[1]); +    return LowerToAVG(Operands[0].getOperand(0), Operands[1]);    }    if (Operands[0].getOpcode() == ISD::ADD) @@ -33972,8 +33998,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,          return SDValue();      // The pattern is detected, emit X86ISD::AVG instruction. -    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), -                       Operands[1].getOperand(0)); +    return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));    }    return SDValue(); @@ -35872,14 +35897,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,    if (SDValue NewCMov = combineToExtendCMOV(N, DAG))      return NewCMov; -  if (!DCI.isBeforeLegalizeOps()) { -    if (InVT == MVT::i1) { -      SDValue Zero = DAG.getConstant(0, DL, VT); -      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); -      return DAG.getSelect(DL, VT, N0, AllOnes, Zero); -    } +  if (!DCI.isBeforeLegalizeOps())      return SDValue(); -  }    if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&        isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { @@ -35897,7 +35916,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,    if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))      return V; -  if (Subtarget.hasAVX() && VT.is256BitVector()) +  if (VT.isVector())      if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))        return R; @@ -36089,7 +36108,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,    if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))      return V; -  if (VT.is256BitVector()) +  if (VT.isVector())      if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))        return R; @@ -36244,39 +36263,54 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,                                      const X86Subtarget &Subtarget) {    SDLoc DL(N); -  // Pre-shrink oversized index elements to avoid triggering scalarization. -  if (DCI.isBeforeLegalize()) { +  if (DCI.isBeforeLegalizeOps()) {      SDValue Index = N->getOperand(4); -    if (Index.getScalarValueSizeInBits() > 64) { -      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, +    // Remove any sign extends from 32 or smaller to larger than 32. +    // Only do this before LegalizeOps in case we need the sign extend for +    // legalization. 
+    if (Index.getOpcode() == ISD::SIGN_EXTEND) { +      if (Index.getScalarValueSizeInBits() > 32 && +          Index.getOperand(0).getScalarValueSizeInBits() <= 32) { +        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); +        NewOps[4] = Index.getOperand(0); +        DAG.UpdateNodeOperands(N, NewOps); +        // The original sign extend has less users, add back to worklist in case +        // it needs to be removed +        DCI.AddToWorklist(Index.getNode()); +        DCI.AddToWorklist(N); +        return SDValue(N, 0); +      } +    } + +    // Make sure the index is either i32 or i64 +    unsigned ScalarSize = Index.getScalarValueSizeInBits(); +    if (ScalarSize != 32 && ScalarSize != 64) { +      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; +      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,                                     Index.getValueType().getVectorNumElements()); -      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); +      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); -      NewOps[4] = Trunc; +      NewOps[4] = Index;        DAG.UpdateNodeOperands(N, NewOps);        DCI.AddToWorklist(N);        return SDValue(N, 0);      } -  } -  // Try to remove sign extends from i32 to i64 on the index. -  // Only do this before legalize in case we are relying on it for -  // legalization. -  // TODO: We should maybe remove any sign extend once we learn how to sign -  // extend narrow index during lowering. -  if (DCI.isBeforeLegalizeOps()) { -    SDValue Index = N->getOperand(4); -    if (Index.getScalarValueSizeInBits() == 64 && -        Index.getOpcode() == ISD::SIGN_EXTEND && +    // Try to remove zero extends from 32->64 if we know the sign bit of +    // the input is zero. +    if (Index.getOpcode() == ISD::ZERO_EXTEND && +        Index.getScalarValueSizeInBits() == 64 &&          Index.getOperand(0).getScalarValueSizeInBits() == 32) { -      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); -      NewOps[4] = Index.getOperand(0); -      DAG.UpdateNodeOperands(N, NewOps); -      // The original sign extend has less users, add back to worklist in case -      // it needs to be removed. -      DCI.AddToWorklist(Index.getNode()); -      DCI.AddToWorklist(N); -      return SDValue(N, 0); +      if (DAG.SignBitIsZero(Index.getOperand(0))) { +        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); +        NewOps[4] = Index.getOperand(0); +        DAG.UpdateNodeOperands(N, NewOps); +        // The original zero extend has less users, add back to worklist in case +        // it needs to be removed +        DCI.AddToWorklist(Index.getNode()); +        DCI.AddToWorklist(N); +        return SDValue(N, 0); +      }      }    } @@ -36288,6 +36322,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());      NewOps[2] = Mask.getOperand(0);      DAG.UpdateNodeOperands(N, NewOps); +    return SDValue(N, 0);    }    // With AVX2 we only demand the upper bit of the mask. 
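The rewritten combineGatherScatter block above folds three index clean-ups into one place: peel sign extends that widened a narrow index past 32 bits, force the index to i32 or i64, and drop a 32-to-64 zero extend when the sign bit of the narrow value is known to be zero. A rough, self-contained model of the decision order in plain C++ (illustrative names; the real code operates on SDValues and re-queues the node through DCI):

    #include <cstdio>

    enum class Ext { None, Sext, Zext };

    // Returns the index width the combine would leave behind.
    static unsigned canonicalIndexWidth(unsigned Bits, Ext Kind, unsigned SrcBits,
                                        bool SrcSignBitZero) {
      // 1. Peel a sign-extend that widened a <=32-bit index beyond 32 bits;
      //    the next round then legalizes the narrow index.
      if (Kind == Ext::Sext && Bits > 32 && SrcBits <= 32)
        return canonicalIndexWidth(SrcBits, Ext::None, SrcBits, SrcSignBitZero);
      // 2. Force anything that is not i32/i64 to the nearest legal width.
      if (Bits != 32 && Bits != 64)
        return Bits > 32 ? 64 : 32;
      // 3. Drop a 32->64 zero-extend when the sign bit is known zero, since
      //    sign- and zero-extension then agree on the index value.
      if (Kind == Ext::Zext && Bits == 64 && SrcBits == 32 && SrcSignBitZero)
        return 32;
      return Bits;
    }

    int main() {
      printf("%u\n", canonicalIndexWidth(64, Ext::Sext, 16, false)); // -> 32
      printf("%u\n", canonicalIndexWidth(64, Ext::Zext, 32, true));  // -> 32
      printf("%u\n", canonicalIndexWidth(64, Ext::Zext, 32, false)); // -> 64
      return 0;
    }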
@@ -36356,7 +36391,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,    EVT VT = N->getValueType(0);    if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||        N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || -      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) +      VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())      return SDValue();    // Now check that the other operand of the AND is a constant. We could diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 2acd8d17beb2..0d30b7d47f3e 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -116,14 +116,30 @@ defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;  def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",                     [(int_x86_mmx_femms)], IIC_MMX_EMMS>; +// PREFETCHWT1 is supported we want to use it for everything but T0. +def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{ +  return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1(); +}]>; + +// Use PREFETCHWT1 for NTA, T2, T1. +def PrefetchWT1Level : ImmLeaf<i32, [{ +  return Imm < 3; +}]>; +  let SchedRW = [WriteLoad] in { +let Predicates = [Has3DNow, NoSSEPrefetch] in  def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),                        "prefetch\t$addr", -                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))], +                      [(prefetch addr:$addr, imm, imm, (i32 1))],                        IIC_SSE_PREFETCH>; +  def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", -                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))], +                  [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))],                    IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>; + +def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr", +                    [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))], +                    IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>;  }  // "3DNowA" instructions diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 2a6ed02fadab..0b266e5591b4 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -349,8 +349,9 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,    let TSFlags{54}    = hasEVEX_RC;  } -class PseudoI<dag oops, dag iops, list<dag> pattern> -  : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { +class PseudoI<dag oops, dag iops, list<dag> pattern, +              InstrItinClass itin = NoItinerary> +  : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> {    let Pattern = pattern;  } @@ -423,9 +424,8 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,  // FpI_ - Floating Point Pseudo Instruction template. Not Predicated.  
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,             InstrItinClass itin = NoItinerary> -  : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { +  : PseudoI<outs, ins, pattern, itin> {    let FPForm = fp; -  let Pattern = pattern;  }  // Templates for instructions that use a 16- or 32-bit segmented address as diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 42e89cb4831d..fdf3e73e4fcd 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -874,7 +874,10 @@ def HasADX       : Predicate<"Subtarget->hasADX()">;  def HasSHA       : Predicate<"Subtarget->hasSHA()">;  def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;  def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">; +def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; +def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;  def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;  def HasLAHFSAHF  : Predicate<"Subtarget->hasLAHFSAHF()">;  def HasMWAITX    : Predicate<"Subtarget->hasMWAITX()">;  def HasCLZERO    : Predicate<"Subtarget->hasCLZERO()">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a86a0bfc168d..b48fa1841979 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3487,7 +3487,7 @@ let Predicates = [UseSSE2] in {  //===----------------------------------------------------------------------===//  // Prefetch intrinsic. -let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { +let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {  def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),      "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],      IIC_SSE_PREFETCH>, TB; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 1e04997ad294..e131f1a1e4bd 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -89,8 +89,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(      // Check to see if there is a specialized entry-point for memory zeroing.      ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val); -    if (const char *bzeroEntry = ValC && -        ValC->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { +    if (const char *bzeroName = (ValC && ValC->isNullValue()) +        ? 
DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) +        : nullptr) {        const TargetLowering &TLI = DAG.getTargetLoweringInfo();        EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());        Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); @@ -106,7 +107,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(        CLI.setDebugLoc(dl)            .setChain(Chain)            .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), -                        DAG.getExternalSymbol(bzeroEntry, IntPtr), +                        DAG.getExternalSymbol(bzeroName, IntPtr),                          std::move(Args))            .setDiscardResult(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 8b08766b6171..ad023623142f 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -174,28 +174,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,    return X86II::MO_NO_FLAG;  } -/// This function returns the name of a function which has an interface like -/// the non-standard bzero function, if such a function exists on the -/// current subtarget and it is considered preferable over memset with zero -/// passed as the second argument. Otherwise it returns null. -const char *X86Subtarget::getBZeroEntry() const { -  // Darwin 10 has a __bzero entry point for this purpose. -  if (getTargetTriple().isMacOSX() && -      !getTargetTriple().isMacOSXVersionLT(10, 6)) -    return "__bzero"; - -  return nullptr; -} - -bool X86Subtarget::hasSinCos() const { -  if (getTargetTriple().isMacOSX()) { -    return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit(); -  } else if (getTargetTriple().isOSFuchsia()) { -    return true; -  } -  return false; -} -  /// Return true if the subtarget allows calls to immediate address.  bool X86Subtarget::isLegalToCallImmediateAddr() const {    // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 @@ -324,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {    HasVNNI = false;    HasBITALG = false;    HasSHA = false; +  HasPREFETCHWT1 = false;    HasPRFCHW = false;    HasRDSEED = false;    HasLAHFSAHF = false; @@ -342,6 +321,7 @@ void X86Subtarget::initializeEnvironment() {    HasSSEUnalignedMem = false;    HasCmpxchg16b = false;    UseLeaForSP = false; +  HasFastVariableShuffle = false;    HasFastPartialYMMorZMMWrite = false;    HasFastGather = false;    HasFastScalarFSQRT = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index be4d46c470de..c9435890fc1f 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -201,7 +201,7 @@ protected:    bool HasCLZERO;    /// Processor has Prefetch with intent to Write instruction -  bool HasPFPREFETCHWT1; +  bool HasPREFETCHWT1;    /// True if SHLD instructions are slow.    bool IsSHLDSlow; @@ -228,6 +228,10 @@ protected:    /// the stack pointer. This is an optimization for Intel Atom processors.    bool UseLeaForSP; +  /// True if its preferable to combine to a single shuffle using a variable +  /// mask over multiple fixed shuffles. +  bool HasFastVariableShuffle; +    /// True if there is no performance penalty to writing only the lower parts    /// of a YMM or ZMM register without clearing the upper part.    
bool HasFastPartialYMMorZMMWrite; @@ -513,7 +517,14 @@ public:    bool hasRTM() const { return HasRTM; }    bool hasADX() const { return HasADX; }    bool hasSHA() const { return HasSHA; } -  bool hasPRFCHW() const { return HasPRFCHW; } +  bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; } +  bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } +  bool hasSSEPrefetch() const { +    // We implicitly enable these when we have a write prefix supporting cache +    // level OR if we have prfchw, but don't already have a read prefetch from +    // 3dnow. +    return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); +  }    bool hasRDSEED() const { return HasRDSEED; }    bool hasLAHFSAHF() const { return HasLAHFSAHF; }    bool hasMWAITX() const { return HasMWAITX; } @@ -527,6 +538,9 @@ public:    bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }    bool hasCmpxchg16b() const { return HasCmpxchg16b; }    bool useLeaForSP() const { return UseLeaForSP; } +  bool hasFastVariableShuffle() const { +    return HasFastVariableShuffle; +  }    bool hasFastPartialYMMorZMMWrite() const {      return HasFastPartialYMMorZMMWrite;    } @@ -682,17 +696,6 @@ public:    /// Return true if the subtarget allows calls to immediate address.    bool isLegalToCallImmediateAddr() const; -  /// This function returns the name of a function which has an interface -  /// like the non-standard bzero function, if such a function exists on -  /// the current subtarget and it is considered prefereable over -  /// memset with zero passed as the second argument. Otherwise it -  /// returns null. -  const char *getBZeroEntry() const; - -  /// This function returns true if the target has sincos() routine in its -  /// compiler runtime or math libraries. -  bool hasSinCos() const; -    /// Enable the MachineScheduler pass for all X86 subtargets.    bool enableMachineScheduler() const override { return true; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index ea8c9862230e..e95e6ecae091 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -281,10 +281,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,  // X86 TTI query.  //===----------------------------------------------------------------------===// -TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(X86TTIImpl(this, F)); -  }); +TargetTransformInfo +X86TargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(X86TTIImpl(this, F));  }  //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 952bd1321ff9..5b21cd82b5b1 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -45,7 +45,7 @@ public:    // attributes of each function.    const X86Subtarget *getSubtargetImpl() const = delete; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    // Set up the pass pipeline.    
TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 3aa7187e0cd1..38925bfd51b0 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -108,8 +108,7 @@ extern "C" void LLVMInitializeXCoreTarget() {    RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());  } -TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { -  return TargetIRAnalysis([this](const Function &F) { -    return TargetTransformInfo(XCoreTTIImpl(this, F)); -  }); +TargetTransformInfo +XCoreTargetMachine::getTargetTransformInfo(const Function &F) { +  return TargetTransformInfo(XCoreTTIImpl(this, F));  } diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index 5baa3524d2a6..965b9b2c4d65 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -43,7 +43,7 @@ public:    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -  TargetIRAnalysis getTargetIRAnalysis() override; +  TargetTransformInfo getTargetTransformInfo(const Function &F) override;    TargetLoweringObjectFile *getObjFileLowering() const override {      return TLOF.get(); diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 683655f1f68b..a9cfd8ded6fb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -710,7 +710,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {  // Check if there is PGO data or user annoated branch data:  static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { -  if (F->getEntryCount()) +  if (F->hasProfileData())      return true;    // Now check if any of the entry block has MD_prof data:    for (auto *E : OI->Entries) { @@ -863,6 +863,7 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {      case Instruction::GetElementPtr:        if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())          continue; +      break;      default:        break;      } @@ -1273,7 +1274,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {    // Only try to outline cold regions if we have a profile summary, which    // implies we have profiling information. -  if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && +  if (PSI->hasProfileSummary() && F->hasProfileData() &&        !DisableMultiRegionPartialInline) {      std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =          computeOutliningColdRegionsInfo(F); @@ -1379,10 +1380,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {                              Cloner.ClonedFunc->user_end());    DenseMap<User *, uint64_t> CallSiteToProfCountMap; -  if (Cloner.OrigFunc->getEntryCount()) +  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); +  if (CalleeEntryCount)      computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); -  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();    uint64_t CalleeEntryCountV = (CalleeEntryCount ? 
*CalleeEntryCount : 0);    bool AnyInline = false; diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index f0e781b9d923..7086c2eb52c4 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1583,7 +1583,10 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {  }  bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { -  F.setEntryCount(0); +  // Initialize the entry count to -1, which will be treated conservatively +  // by getEntryCount as the same as unknown (None). If we have samples this +  // will be overwritten in emitAnnotations. +  F.setEntryCount(-1);    std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;    if (AM) {      auto &FAM = diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index ec56f0cde25d..5fbb001216a3 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1346,6 +1346,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {        Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,                                       ResByArg.Bit);        applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); +      break;      }      default:        break; diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index aa055121e710..a088d447337f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4394,6 +4394,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,              cast<CallInst>(Caller)->getCallingConv());          cast<CallInst>(NewCaller)->setAttributes(NewPAL);        } +      NewCaller->setDebugLoc(Caller->getDebugLoc());        return NewCaller;      } diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2a25423e04bd..8e2833d22032 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -80,6 +80,11 @@ static cl::opt<bool> ClInstrumentAtomics(      cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,      cl::init(true)); +static cl::opt<bool> ClRecover( +    "hwasan-recover", +    cl::desc("Enable recovery mode (continue-after-error)."), +    cl::Hidden, cl::init(false)); +  namespace {  /// \brief An instrumentation pass implementing detection of addressability bugs @@ -89,7 +94,8 @@ public:    // Pass identification, replacement for typeid.    static char ID; -  HWAddressSanitizer() : FunctionPass(ID) {} +  HWAddressSanitizer(bool Recover = false) +      : FunctionPass(ID), Recover(Recover || ClRecover) {}    StringRef getPassName() const override { return "HWAddressSanitizer"; } @@ -109,6 +115,8 @@ private:    LLVMContext *C;    Type *IntptrTy; +  bool Recover; +    Function *HwasanCtorFunction;    Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes]; @@ -126,8 +134,8 @@ INITIALIZE_PASS_END(      HWAddressSanitizer, "hwasan",      "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false) -FunctionPass *llvm::createHWAddressSanitizerPass() { -  return new HWAddressSanitizer(); +FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) { +  return new HWAddressSanitizer(Recover);  }  /// \brief Module-level initialization. 
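The HWAddressSanitizer hunks just above add a recover (continue-after-error) mode; the hunks that follow thread it into the runtime interface, where recoverable checks call *_noabort entry points and encode an extra bit in the hlt immediate. A toy restatement in plain C++ (the "__hwasan_" prefix is assumed here as the usual callback prefix; it is not shown in this diff):

    #include <cstdio>
    #include <string>

    // Illustrative only: how the recover flag alters the callback name and the
    // immediate baked into the trapping "hlt" instruction.
    static std::string callbackName(bool IsWrite, unsigned AccessSizeIndex,
                                    bool Recover) {
      std::string Name = std::string("__hwasan_") +     // assumed prefix
                         (IsWrite ? "store" : "load") +
                         std::to_string(1u << AccessSizeIndex);
      if (Recover)
        Name += "_noabort";
      return Name;
    }

    static unsigned hltImmediate(bool IsWrite, unsigned AccessSizeIndex,
                                 bool Recover) {
      // Matches 0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex.
      return 0x100 + (Recover ? 0x20 : 0) + (IsWrite ? 0x10 : 0) + AccessSizeIndex;
    }

    int main() {
      // A recoverable 4-byte store check: __hwasan_store4_noabort, hlt #0x132.
      printf("%s  hlt #0x%x\n", callbackName(true, 2, true).c_str(),
             hltImmediate(true, 2, true));
      return 0;
    }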
@@ -156,10 +164,11 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {    IRBuilder<> IRB(*C);    for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {      const std::string TypeStr = AccessIsWrite ? "store" : "load"; +    const std::string EndingStr = Recover ? "_noabort" : "";      HwasanMemoryAccessCallbackSized[AccessIsWrite] =          checkSanitizerInterfaceFunction(M.getOrInsertFunction( -            ClMemoryAccessCallbackPrefix + TypeStr, +            ClMemoryAccessCallbackPrefix + TypeStr + EndingStr,              FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)));      for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; @@ -167,7 +176,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {        HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =            checkSanitizerInterfaceFunction(M.getOrInsertFunction(                ClMemoryAccessCallbackPrefix + TypeStr + -                  itostr(1ULL << AccessSizeIndex), +                  itostr(1ULL << AccessSizeIndex) + EndingStr,                FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));      }    } @@ -246,14 +255,16 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,    Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);    TerminatorInst *CheckTerm = -      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false, +      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,                                  MDBuilder(*C).createBranchWeights(1, 100000));    IRB.SetInsertPoint(CheckTerm);    // The signal handler will find the data address in x0.    InlineAsm *Asm = InlineAsm::get(        FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false), -      "hlt #" + itostr(0x100 + IsWrite * 0x10 + AccessSizeIndex), "{x0}", +      "hlt #" + +          itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex), +      "{x0}",        /*hasSideEffects=*/true);    IRB.CreateCall(Asm, PtrLong);  } diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index d8c408035038..207243231aad 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -13,10 +13,11 @@  // threading, or IPA-CP based function cloning, etc.).  // As of now we support two cases :  // -// 1) If a call site is dominated by an OR condition and if any of its arguments -// are predicated on this OR condition, try to split the condition with more -// constrained arguments. For example, in the code below, we try to split the -// call site since we can predicate the argument(ptr) based on the OR condition. +// 1) Try to a split call-site with constrained arguments, if any constraints +// on any argument can be found by following the single predecessors of the +// all site's predecessors. Currently this pass only handles call-sites with 2 +// predecessors. For example, in the code below, we try to split the call-site +// since we can predicate the argument(ptr) based on the OR condition.  //  // Split from :  //   if (!ptr || c) @@ -200,16 +201,15 @@ static bool canSplitCallSite(CallSite CS) {  }  /// Return true if the CS is split into its new predecessors which are directly -/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2. -/// In OR predicated case, PredBB1 will point the header, and PredBB2 will point -/// to the second compare block. 
CallInst1 and CallInst2 will be the new -/// call-sites placed in the new predecessors split for PredBB1 and PredBB2, -/// repectively. Therefore, CallInst1 will be the call-site placed -/// between Header and Tail, and CallInst2 will be the call-site between TBB and -/// Tail. For example, in the IR below with an OR condition, the call-site can -/// be split +/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. +/// CallInst1 and CallInst2 will be the new call-sites placed in the new +/// predecessors split for PredBB1 and PredBB2, respectively. +/// For example, in the IR below with an OR condition, the call-site can +/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the +/// call-site placed between Header and Tail, and CallInst2 will be the +/// call-site between TBB and Tail.  /// -/// from : +/// From :  ///  ///   Header:  ///     %c = icmp eq i32* %a, null @@ -237,9 +237,9 @@ static bool canSplitCallSite(CallSite CS) {  ///   Tail:  ///    %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]  /// -/// Note that for an OR predicated case, CallInst1 and CallInst2 should be -/// created with more constrained arguments in -/// createCallSitesOnOrPredicatedArgument(). +/// Note that in case any arguments at the call-site are constrained by its +/// predecessors, new call-sites with more constrained arguments will be +/// created in createCallSitesOnPredicatedArgument().  static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,                            Instruction *CallInst1, Instruction *CallInst2) {    Instruction *Instr = CS.getInstruction(); @@ -332,18 +332,10 @@ static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {    splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);    return true;  } -// Check if one of the predecessors is a single predecessors of the other. -// This is a requirement for control flow modeling an OR. HeaderBB points to -// the single predecessor and OrBB points to other node. HeaderBB potentially -// contains the first compare of the OR and OrBB the second. 
-static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) { -  return OrBB->getSinglePredecessor() == HeaderBB && -         HeaderBB->getTerminator()->getNumSuccessors() == 2; -} -static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { +static bool tryToSplitOnPredicatedArgument(CallSite CS) {    auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); -  if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0])) +  if (Preds[0] == Preds[1])      return false;    SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2; @@ -362,7 +354,7 @@ static bool tryToSplitOnOrPredicatedArgument(CallSite CS) {  static bool tryToSplitCallSite(CallSite CS) {    if (!CS.arg_size() || !canSplitCallSite(CS))      return false; -  return tryToSplitOnOrPredicatedArgument(CS) || +  return tryToSplitOnPredicatedArgument(CS) ||           tryToSplitOnPHIPredicatedArgument(CS);  } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 6b0377e0ecb3..1476f7850cf0 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -282,7 +282,7 @@ bool JumpThreading::runOnFunction(Function &F) {    auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    std::unique_ptr<BlockFrequencyInfo> BFI;    std::unique_ptr<BranchProbabilityInfo> BPI; -  bool HasProfileData = F.getEntryCount().hasValue(); +  bool HasProfileData = F.hasProfileData();    if (HasProfileData) {      LoopInfo LI{DominatorTree(F)};      BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); @@ -307,8 +307,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,    std::unique_ptr<BlockFrequencyInfo> BFI;    std::unique_ptr<BranchProbabilityInfo> BPI; -  bool HasProfileData = F.getEntryCount().hasValue(); -  if (HasProfileData) { +  if (F.hasProfileData()) {      LoopInfo LI{DominatorTree(F)};      BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));      BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); @@ -1333,6 +1332,20 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {    // code size.    BasicBlock *UnavailablePred = nullptr; +  // If the value is unavailable in one of predecessors, we will end up +  // inserting a new instruction into them. It is only valid if all the +  // instructions before LI are guaranteed to pass execution to its successor, +  // or if LI is safe to speculate. +  // TODO: If this logic becomes more complex, and we will perform PRE insertion +  // farther than to a predecessor, we need to reuse the code from GVN's PRE. +  // It requires domination tree analysis, so for this simple case it is an +  // overkill. +  if (PredsScanned.size() != AvailablePreds.size() && +      !isSafeToSpeculativelyExecute(LI)) +    for (auto I = LoadBB->begin(); &*I != LI; ++I) +      if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) +        return false; +    // If there is exactly one predecessor where the value is unavailable, the    // already computed 'OneUnavailablePred' block is it.  If it ends in an    // unconditional branch, we know that it isn't a critical edge. diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index c9d55b4594fe..430a7085d93f 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -247,7 +247,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,    // Enable LoopSink only when runtime profile is available.    // With static profile, the sinking decision may be sub-optimal. 
-  if (!Preheader->getParent()->getEntryCount()) +  if (!Preheader->getParent()->hasProfileData())      return false;    const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 7b1d6446a24a..15e7da5e1a7a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -882,7 +882,7 @@ static bool computeUnrollCount(    }    // Check if the runtime trip count is too small when profile is available. -  if (L->getHeader()->getParent()->getEntryCount()) { +  if (L->getHeader()->getParent()->hasProfileData()) {      if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {        if (*ProfileTripCount < FlatLoopTripCountThreshold)          return false; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9c870b42a747..6af3fef963dc 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -476,22 +476,33 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,        Alignment = DL.getABITypeAlignment(EltType);      } -    AMemSet = -      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); +    // Remember the debug location. +    DebugLoc Loc; +    if (!Range.TheStores.empty()) +      Loc = Range.TheStores[0]->getDebugLoc();      DEBUG(dbgs() << "Replace stores:\n";            for (Instruction *SI : Range.TheStores) -            dbgs() << *SI << '\n'; -          dbgs() << "With: " << *AMemSet << '\n'); - -    if (!Range.TheStores.empty()) -      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); +            dbgs() << *SI << '\n');      // Zap all the stores.      for (Instruction *SI : Range.TheStores) {        MD->removeInstruction(SI);        SI->eraseFromParent();      } + +    // Create the memset after removing the stores, so that if there any cached +    // non-local dependencies on the removed instructions in +    // MemoryDependenceAnalysis, the cache entries are updated to "dirty" +    // entries pointing below the memset, so subsequent queries include the +    // memset. +    AMemSet = +      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); +    if (!Range.TheStores.empty()) +      AMemSet->setDebugLoc(Loc); + +    DEBUG(dbgs() << "With: " << *AMemSet << '\n'); +      ++NumMemSetInfer;    } @@ -1031,9 +1042,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,    //    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy. 
-  MemDepResult SourceDep = -      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, -                                   M->getIterator(), M->getParent()); +  MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep); +  MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false, +                                                        M->getIterator(), M->getParent()); + +  if (SourceDep.isNonLocal()) { +    SmallVector<NonLocalDepResult, 2> NonLocalDepResults; +    MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false, +                                         NonLocalDepResults); +    if (NonLocalDepResults.size() == 1) { +      SourceDep = NonLocalDepResults[0].getResult(); +      assert((!SourceDep.getInst() || +              LookupDomTree().dominates(SourceDep.getInst(), M)) && +             "when memdep returns exactly one result, it should dominate"); +    } +  } +    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; @@ -1235,6 +1259,18 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {    MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(        SrcLoc, true, M->getIterator(), M->getParent()); +  if (SrcDepInfo.isNonLocal()) { +    SmallVector<NonLocalDepResult, 2> NonLocalDepResults; +    MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true, +                                         NonLocalDepResults); +    if (NonLocalDepResults.size() == 1) { +      SrcDepInfo = NonLocalDepResults[0].getResult(); +      assert((!SrcDepInfo.getInst() || +              LookupDomTree().dominates(SrcDepInfo.getInst(), M)) && +             "when memdep returns exactly one result, it should dominate"); +    } +  } +    if (SrcDepInfo.isClobber()) {      if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))        return processMemCpyMemCpyDependence(M, MDep); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index e5866b4718da..66608ec631f6 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1929,9 +1929,32 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,          if (!I) continue;          bool Folded = ConstantFoldTerminator(I->getParent()); -        assert(Folded && -              "Expect TermInst on constantint or blockaddress to be folded"); -        (void) Folded; +        if (!Folded) { +          // The constant folder may not have been able to fold the terminator +          // if this is a branch or switch on undef.  Fold it manually as a +          // branch to the first successor. +#ifndef NDEBUG +          if (auto *BI = dyn_cast<BranchInst>(I)) { +            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) && +                   "Branch should be foldable!"); +          } else if (auto *SI = dyn_cast<SwitchInst>(I)) { +            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold"); +          } else { +            llvm_unreachable("Didn't fold away reference to block!"); +          } +#endif + +          // Make this an uncond branch to the first successor. +          TerminatorInst *TI = I->getParent()->getTerminator(); +          BranchInst::Create(TI->getSuccessor(0), TI); + +          // Remove entries in successor phi nodes to remove edges. +          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) +            TI->getSuccessor(i)->removePredecessor(TI->getParent()); + +          // Remove the old terminator. 
+          TI->eraseFromParent(); +        }        }        // Finally, delete the basic block. diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 209821ff21d7..8fa9ffb6d014 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -97,7 +97,7 @@  //    load %p2  //    ...  // -// We can not do CSE for to the common part related to index "i64 %i". Lowering +// We can not do CSE to the common part related to index "i64 %i". Lowering  // GEPs can achieve such goals.  // If the target does not use alias analysis in codegen, this pass will  // lower a GEP with multiple indices into arithmetic operations: diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index eb3139ce4293..8825f77555e7 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -23,10 +23,30 @@ using namespace llvm;  /// Fix-up phi nodes in an invoke instruction's normal destination.  ///  /// After versioning an invoke instruction, values coming from the original -/// block will now either be coming from the original block or the "else" block. +/// block will now be coming from the "merge" block. For example, in the code +/// below: +/// +///   then_bb: +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   merge_bb: +///     %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ] +///     br %normal_dst +/// +///   normal_dst: +///     %t3 = phi i32 [ %x, %orig_bb ], ... +/// +/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in +/// "normal_dst" must be fixed to refer to "merge_bb": +/// +///    normal_dst: +///      %t3 = phi i32 [ %x, %merge_bb ], ... +///  static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, -                                      BasicBlock *ElseBlock, -                                      Instruction *NewInst) { +                                      BasicBlock *MergeBlock) {    for (auto &I : *Invoke->getNormalDest()) {      auto *Phi = dyn_cast<PHINode>(&I);      if (!Phi) @@ -34,13 +54,7 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,      int Idx = Phi->getBasicBlockIndex(OrigBlock);      if (Idx == -1)        continue; -    Value *V = Phi->getIncomingValue(Idx); -    if (dyn_cast<Instruction>(V) == Invoke) { -      Phi->setIncomingBlock(Idx, ElseBlock); -      Phi->addIncoming(NewInst, OrigBlock); -      continue; -    } -    Phi->addIncoming(V, ElseBlock); +    Phi->setIncomingBlock(Idx, MergeBlock);    }  } @@ -48,6 +62,23 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,  ///  /// After versioning an invoke instruction, values coming from the original  /// block will now be coming from either the "then" block or the "else" block. +/// For example, in the code below: +/// +///   then_bb: +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   unwind_dst: +///     %t3 = phi i32 [ %x, %orig_bb ], ... 
+/// +/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in +/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb": +/// +///   unwind_dst: +///     %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ... +///  static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,                                        BasicBlock *ThenBlock,                                        BasicBlock *ElseBlock) { @@ -64,44 +95,26 @@ static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,    }  } -/// Get the phi node having the returned value of a call or invoke instruction -/// as it's operand. -static bool getRetPhiNode(Instruction *Inst, BasicBlock *Block) { -  BasicBlock *FromBlock = Inst->getParent(); -  for (auto &I : *Block) { -    PHINode *PHI = dyn_cast<PHINode>(&I); -    if (!PHI) -      break; -    int Idx = PHI->getBasicBlockIndex(FromBlock); -    if (Idx == -1) -      continue; -    auto *V = PHI->getIncomingValue(Idx); -    if (V == Inst) -      return true; -  } -  return false; -} -  /// Create a phi node for the returned value of a call or invoke instruction.  ///  /// After versioning a call or invoke instruction that returns a value, we have  /// to merge the value of the original and new instructions. We do this by  /// creating a phi node and replacing uses of the original instruction with this  /// phi node. -static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) { +/// +/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is +/// defined in "then_bb", we create the following phi node: +/// +///   ; Uses of the original instruction are replaced by uses of the phi node. +///   %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ], +/// +static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst, +                             BasicBlock *MergeBlock, IRBuilder<> &Builder) {    if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())      return; -  BasicBlock *RetValBB = NewInst->getParent(); -  if (auto *Invoke = dyn_cast<InvokeInst>(NewInst)) -    RetValBB = Invoke->getNormalDest(); -  BasicBlock *PhiBB = RetValBB->getSingleSuccessor(); - -  if (getRetPhiNode(OrigInst, PhiBB)) -    return; - -  IRBuilder<> Builder(&PhiBB->front()); +  Builder.SetInsertPoint(&MergeBlock->front());    PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);    SmallVector<User *, 16> UsersToUpdate;    for (User *U : OrigInst->users()) @@ -109,7 +122,7 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {    for (User *U : UsersToUpdate)      U->replaceUsesOfWith(OrigInst, Phi);    Phi->addIncoming(OrigInst, OrigInst->getParent()); -  Phi->addIncoming(NewInst, RetValBB); +  Phi->addIncoming(NewInst, NewInst->getParent());  }  /// Cast a call or invoke instruction to the given type. @@ -118,7 +131,41 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {  /// that of the callee. If this is the case, we have to cast the returned value  /// to the correct type. The location of the cast depends on if we have a call  /// or invoke instruction. -Instruction *createRetBitCast(CallSite CS, Type *RetTy) { +/// +/// For example, if the call instruction below requires a bitcast after +/// promotion: +/// +///   orig_bb: +///     %t0 = call i32 @func() +///     ... +/// +/// The bitcast is placed after the call instruction: +/// +///   orig_bb: +///     ; Uses of the original return value are replaced by uses of the bitcast. 
+///     %t0 = call i32 @func() +///     %t1 = bitcast i32 %t0 to ... +///     ... +/// +/// A similar transformation is performed for invoke instructions. However, +/// since invokes are terminating, a new block is created for the bitcast. For +/// example, if the invoke instruction below requires a bitcast after promotion: +/// +///   orig_bb: +///     %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst +/// +/// The edge between the original block and the invoke's normal destination is +/// split, and the bitcast is placed there: +/// +///   orig_bb: +///     %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst +/// +///   split_bb: +///     ; Uses of the original return value are replaced by uses of the bitcast. +///     %t1 = bitcast i32 %t0 to ... +///     br label %normal_dst +/// +static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {    // Save the users of the calling instruction. These uses will be changed to    // use the bitcast after we create it. @@ -130,19 +177,20 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {    // value. The location depends on if we have a call or invoke instruction.    Instruction *InsertBefore = nullptr;    if (auto *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) -    InsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt(); +    InsertBefore = +        &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();    else      InsertBefore = &*std::next(CS.getInstruction()->getIterator());    // Bitcast the return value to the correct type.    auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),                                  RetTy, "", InsertBefore); +  if (RetBitCast) +    *RetBitCast = Cast;    // Replace all the original uses of the calling instruction with the bitcast.    for (User *U : UsersToUpdate)      U->replaceUsesOfWith(CS.getInstruction(), Cast); - -  return Cast;  }  /// Predicate and clone the given call site. @@ -152,21 +200,78 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {  /// callee. The original call site is moved into the "else" block, and a clone  /// of the call site is placed in the "then" block. The cloned instruction is  /// returned. +/// +/// For example, the call instruction below: +/// +///   orig_bb: +///     %t0 = call i32 %ptr() +///     ... +/// +/// Is replaced by the following: +/// +///   orig_bb: +///     %cond = icmp eq i32 ()* %ptr, @func +///     br i1 %cond, %then_bb, %else_bb +/// +///   then_bb: +///     ; The clone of the original call instruction is placed in the "then" +///     ; block. It is not yet promoted. +///     %t1 = call i32 %ptr() +///     br merge_bb +/// +///   else_bb: +///     ; The original call instruction is moved to the "else" block. +///     %t0 = call i32 %ptr() +///     br merge_bb +/// +///   merge_bb: +///     ; Uses of the original call instruction are replaced by uses of the phi +///     ; node. +///     %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] +///     ... +/// +/// A similar transformation is performed for invoke instructions. However, +/// since invokes are terminating, more work is required.
For example, the +/// invoke instruction below: +/// +///   orig_bb: +///     %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst +/// +/// Is replaced by the following: +/// +///   orig_bb: +///     %cond = icmp eq i32 ()* %ptr, @func +///     br i1 %cond, %then_bb, %else_bb +/// +///   then_bb: +///     ; The clone of the original invoke instruction is placed in the "then" +///     ; block, and its normal destination is set to the "merge" block. It is +///     ; not yet promoted. +///     %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   else_bb: +///     ; The original invoke instruction is moved into the "else" block, and +///     ; its normal destination is set to the "merge" block. +///     %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +///   merge_bb: +///     ; Uses of the original invoke instruction are replaced by uses of the +///     ; phi node, and the merge block branches to the normal destination. +///     %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] +///     br %normal_dst +///  static Instruction *versionCallSite(CallSite CS, Value *Callee, -                                    MDNode *BranchWeights, -                                    BasicBlock *&ThenBlock, -                                    BasicBlock *&ElseBlock, -                                    BasicBlock *&MergeBlock) { +                                    MDNode *BranchWeights) {    IRBuilder<> Builder(CS.getInstruction());    Instruction *OrigInst = CS.getInstruction(); +  BasicBlock *OrigBlock = OrigInst->getParent();    // Create the compare. The called value and callee must have the same type to    // be compared. -  auto *LHS = -      Builder.CreateBitCast(CS.getCalledValue(), Builder.getInt8PtrTy()); -  auto *RHS = Builder.CreateBitCast(Callee, Builder.getInt8PtrTy()); -  auto *Cond = Builder.CreateICmpEQ(LHS, RHS); +  if (CS.getCalledValue()->getType() != Callee->getType()) +    Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType()); +  auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee);    // Create an if-then-else structure. The original instruction is moved into    // the "else" block, and a clone of the original instruction is placed in the @@ -175,9 +280,9 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,    TerminatorInst *ElseTerm = nullptr;    SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,                                  BranchWeights); -  ThenBlock = ThenTerm->getParent(); -  ElseBlock = ElseTerm->getParent(); -  MergeBlock = OrigInst->getParent(); +  BasicBlock *ThenBlock = ThenTerm->getParent(); +  BasicBlock *ElseBlock = ElseTerm->getParent(); +  BasicBlock *MergeBlock = OrigInst->getParent();    ThenBlock->setName("if.true.direct_targ");    ElseBlock->setName("if.false.orig_indirect"); @@ -188,7 +293,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,    NewInst->insertBefore(ThenTerm);    // If the original call site is an invoke instruction, we have extra work to -  // do since invoke instructions are terminating. +  // do since invoke instructions are terminating. We have to fix-up phi nodes +  // in the invoke's normal and unwind destinations.    
if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {      auto *NewInvoke = cast<InvokeInst>(NewInst); @@ -201,11 +307,19 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,      Builder.SetInsertPoint(MergeBlock);      Builder.CreateBr(OrigInvoke->getNormalDest()); -    // Now set the normal destination of new the invoke instruction to be the +    // Fix-up phi nodes in the original invoke's normal and unwind destinations. +    fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock); +    fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); + +    // Now set the normal destinations of the invoke instructions to be the      // "merge" block. +    OrigInvoke->setNormalDest(MergeBlock);      NewInvoke->setNormalDest(MergeBlock);    } +  // Create a phi node for the returned value of the call site. +  createRetPHINode(OrigInst, NewInst, MergeBlock, Builder); +    return NewInst;  } @@ -253,7 +367,8 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,    return true;  } -static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) { +Instruction *llvm::promoteCall(CallSite CS, Function *Callee, +                               CastInst **RetBitCast) {    assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");    // Set the called function of the call site to be the given callee. @@ -268,7 +383,7 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {    // If the function type of the call site matches that of the callee, no    // additional work is required.    if (CS.getFunctionType() == Callee->getFunctionType()) -    return; +    return CS.getInstruction();    // Save the return types of the call site and callee.    Type *CallSiteRetTy = CS.getInstruction()->getType(); @@ -294,7 +409,9 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {    // If the return type of the call site doesn't match that of the callee, cast    // the returned value to the appropriate type.    if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) -    Cast = createRetBitCast(CS, CallSiteRetTy); +    createRetBitCast(CS, CallSiteRetTy, RetBitCast); + +  return CS.getInstruction();  }  Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee, @@ -303,26 +420,10 @@ Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee,    // Version the indirect call site. If the called value is equal to the given    // callee, 'NewInst' will be executed, otherwise the original call site will    // be executed. -  BasicBlock *ThenBlock, *ElseBlock, *MergeBlock; -  Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights, ThenBlock, -                                         ElseBlock, MergeBlock); +  Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights);    // Promote 'NewInst' so that it directly calls the desired function. -  Instruction *Cast = NewInst; -  promoteCall(CallSite(NewInst), Callee, Cast); - -  // If the original call site is an invoke instruction, we have to fix-up phi -  // nodes in the invoke's normal and unwind destinations. -  if (auto *OrigInvoke = dyn_cast<InvokeInst>(CS.getInstruction())) { -    fixupPHINodeForNormalDest(OrigInvoke, MergeBlock, ElseBlock, Cast); -    fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); -  } - -  // Create a phi node for the returned value of the call site. -  createRetPHINode(CS.getInstruction(), Cast ? Cast : NewInst); - -  // Return the new direct call. 
-  return NewInst; +  return promoteCall(CallSite(NewInst), Callee);  }  #undef DEBUG_TYPE diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp index 4273ce0b6200..c84ae7d693d7 100644 --- a/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -203,7 +203,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,    // hit the peeled section.    // We only do this in the presence of profile information, since otherwise    // our estimates of the trip count are not reliable enough. -  if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) { +  if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) {      Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);      if (!PeelCount)        return; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f02f80cc1b78..b3c80424c8b9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationDepth(      cl::desc("Limit maximum recursion depth when calculating costs of "               "speculatively executed instructions")); +static cl::opt<unsigned> DependenceChainLatency( +    "dependence-chain-latency", cl::Hidden, cl::init(8), +    cl::desc("Limit the maximum latency of dependence chain containing cmp " +             "for if conversion")); + +static cl::opt<unsigned> SmallBBSize( +    "small-bb-size", cl::Hidden, cl::init(40), +    cl::desc("Check dependence chain latency only in basic block smaller than " +             "this number")); +  STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");  STATISTIC(NumLinearMaps,            "Number of switch instructions turned into linear mapping"); @@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,    return true;  } +/// Estimate the code size of the specified BB. +static unsigned CountBBCodeSize(BasicBlock *BB, +                                const TargetTransformInfo &TTI) { +  unsigned Size = 0; +  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II) +    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize); +  return Size; +} + +/// Find out the latency of the longest dependence chain in the BB if +/// LongestChain is true, or the dependence chain containing the compare +/// instruction feeding the block's conditional branch. +static unsigned FindDependenceChainLatency(BasicBlock *BB, +                            DenseMap<Instruction *, unsigned> &Instructions, +                            const TargetTransformInfo &TTI, +                            bool LongestChain) { +  unsigned MaxLatency = 0; + +  BasicBlock::iterator II; +  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) { +    unsigned Latency = 0; +    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) { +      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O)); +      if (Op && Instructions.count(Op)) { +        auto OpLatency = Instructions[Op]; +        if (OpLatency > Latency) +          Latency = OpLatency; +      } +    } +    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency); +    Instructions[&(*II)] = Latency; + +    if (Latency > MaxLatency) +      MaxLatency = Latency; +  } + +  if (LongestChain) +    return MaxLatency; + +  // The length of the dependence chain containing the compare instruction is +  // wanted, so the terminator must be a BranchInst. 
+  assert(isa<BranchInst>(II)); +  BranchInst* Br = cast<BranchInst>(II); +  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition()); +  if (Cmp && Instructions.count(Cmp)) +    return Instructions[Cmp]; +  else +    return 0; +} + +/// Instructions in BB2 may depend on instructions in BB1, and instructions +/// in BB1 may have users in BB2. If the last (in terms of latency) such kind +/// of instruction in BB1 is I, then the instructions after I can be executed +/// in parallel with instructions in BB2. +/// This function returns the latency of I. +static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2, +                        BasicBlock *IfBlock1, BasicBlock *IfBlock2, +                        DenseMap<Instruction *, unsigned> &BB1Instructions) { +  unsigned LastLatency = 0; +  SmallVector<Instruction *, 16> Worklist; +  BasicBlock::iterator II; +  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) { +    if (PHINode *PN = dyn_cast<PHINode>(II)) { +      // Look for users in BB2. +      bool InBBUser = false; +      for (User *U : PN->users()) { +        if (cast<Instruction>(U)->getParent() == BB2) { +          InBBUser = true; +          break; +        } +      } +      // No such user, we don't care about this instruction and its operands. +      if (!InBBUser) +        break; +    } +    Worklist.push_back(&(*II)); +  } + +  while (!Worklist.empty()) { +    Instruction *I = Worklist.pop_back_val(); +    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) { +      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) { +        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2) +          Worklist.push_back(Op); +        else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) { +          if (BB1Instructions[Op] > LastLatency) +            LastLatency = BB1Instructions[Op]; +        } +      } +    } +  } + +  return LastLatency; +} + +/// If after if conversion, most of the instructions in this new BB construct a +/// long and slow dependence chain, it may be slower than cmp/branch, even +/// if the branch has a high miss rate, because the control dependence is +/// transformed into data dependence, and control dependence can be speculated, +/// and thus, the second part can execute in parallel with the first part on +/// modern OOO processor. +/// +/// To check this condition, this function finds the length of the dependence +/// chain in BB1 (only the part that can be executed in parallel with code after +/// branch in BB2) containing cmp, and if the length is longer than a threshold, +/// don't perform if conversion. +/// +/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion. +/// SpeculationSize contains the code size of IfBlock1 and IfBlock2. +static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2, +                             BasicBlock *IfBlock1, BasicBlock *IfBlock2, +                             unsigned SpeculationSize, +                             const TargetTransformInfo &TTI) { +  // Accumulated latency of each instruction in their BBs. +  DenseMap<Instruction *, unsigned> BB1Instructions; +  DenseMap<Instruction *, unsigned> BB2Instructions; + +  if (!TTI.isOutOfOrder()) +    return false; + +  unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) +                         + SpeculationSize; + +  // We check small BB only since it is more difficult to find unrelated +  // instructions to fill functional units in a small BB. 
+  if (NewBBSize > SmallBBSize) +    return false; + +  auto BB1Chain = +         FindDependenceChainLatency(BB1, BB1Instructions, TTI, false); +  auto BB2Chain = +         FindDependenceChainLatency(BB2, BB2Instructions, TTI, true); + +  // If there are many unrelated instructions in the new BB, there will be +  // other instructions for the processor to issue regardless of the length +  // of this new dependence chain. +  // Modern processors can issue 3 or more instructions in each cycle. But in +  // real world applications, an IPC of 2 is already very good for non-loop +  // code with small basic blocks. Higher IPC is usually found in programs with +  // small kernels. So an IPC of 2 is more reasonable for most applications. +  if ((BB1Chain + BB2Chain) * 2 <= NewBBSize) +    return false; + +  // We only care about the part of the dependence chain in BB1 that can be +  // executed in parallel with BB2, so adjust the latency. +  BB1Chain -= +      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions); + +  // A correctly predicted branch can skip the dependence chain in BB1, but a +  // misprediction has a penalty, so a branch is better than a select only when +  // the dependence chain is longer than DependenceChainLatency. +  // Besides the misprediction penalty, the threshold value +  // DependenceChainLatency also depends on the branch misprediction rate, the +  // taken branch latency, and the cmov latency. +  if (BB1Chain >= DependenceChainLatency) +    return true; + +  return false; +} +  /// Extract ConstantInt from value, looking through IntToPtr  /// and PointerNullValue. Return NULL if value is not a constant int.  static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) { @@ -1654,14 +1824,11 @@ namespace {  } // end anonymous namespace -/// Given an unconditional branch that goes to BBEnd, -/// check whether BBEnd has only two predecessors and the other predecessor -/// ends with an unconditional branch. If it is true, sink any common code -/// in the two predecessors to BBEnd. -static bool SinkThenElseCodeToEnd(BranchInst *BI1) { -  assert(BI1->isUnconditional()); -  BasicBlock *BBEnd = BI1->getSuccessor(0); - +/// Check whether BB's predecessors end with unconditional branches. If so, +/// sink any common code from the predecessors to BB. +/// We also allow one predecessor to end with a conditional branch (but no +/// more than one). +static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {    // We support two situations:    //   (1) all incoming arcs are unconditional    //   (2) one incoming arc is conditional @@ -1705,7 +1872,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {    //    SmallVector<BasicBlock*,4> UnconditionalPreds;    Instruction *Cond = nullptr; -  for (auto *B : predecessors(BBEnd)) { +  for (auto *B : predecessors(BB)) {      auto *T = B->getTerminator();      if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())        UnconditionalPreds.push_back(B); @@ -1773,8 +1940,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {      DEBUG(dbgs() << "SINK: Splitting edge\n");      // We have a conditional edge and we're going to sink some instructions.      // Insert a new block postdominating all blocks we're going to sink from. -    if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds, -                                ".sink.split")) +    if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split"))        // Edges couldn't be split.        
return false;      Changed = true; @@ -2048,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,    if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))      return false; +  // Don't do if conversion for long dependence chain. +  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr, +                              CountBBCodeSize(ThenBB, TTI), TTI)) +    return false; +    // If we get here, we can hoist the instruction and if-convert.    DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); @@ -2355,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,        }    } +  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2, +                              AggressiveInsts.size(), TTI)) +    return false; +    DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "                 << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n"); @@ -5728,9 +5903,6 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,    BasicBlock *BB = BI->getParent();    BasicBlock *Succ = BI->getSuccessor(0); -  if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI)) -    return true; -    // If the Terminator is the only non-phi instruction, simplify the block.    // If LoopHeader is provided, check if the block or its successor is a loop    // header. (This is for early invocations before loop simplify and @@ -6008,6 +6180,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {    if (MergeBlockIntoPredecessor(BB))      return true; +  if (SinkCommon && Options.SinkCommonInsts) +    Changed |= SinkCommonCodeFromPredecessors(BB); +    IRBuilder<> Builder(BB);    // If there is a trivial two-entry PHI node in this basic block, and we can diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index fbcdc0df0f1c..52f32cda2609 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5049,13 +5049,13 @@ bool LoopVectorizationLegality::canVectorize() {    bool Result = true;    bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); -  if (DoExtraAnalysis)    // We must have a loop in canonical form. Loops with indirectbr in them cannot    // be canonicalized.    if (!TheLoop->getLoopPreheader()) { +    DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -  if (DoExtraAnalysis) +    if (DoExtraAnalysis)        Result = false;      else        return false; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 76ba62f5d596..a7ccd3faec44 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -646,23 +646,17 @@ private:    int getEntryCost(TreeEntry *E);    /// This is the recursive part of buildTree. -  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1, -                     int OpdNum = 0); +  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);    /// \returns True if the ExtractElement/ExtractValue instructions in VL can    /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).    
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const; -  /// Vectorize a single entry in the tree.\p OpdNum indicate the ordinality of -  /// operand corrsponding to this tree entry \p E for the user tree entry -  /// indicated by \p UserIndx. -  //  In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)". -  Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1); +  /// Vectorize a single entry in the tree. +  Value *vectorizeTree(TreeEntry *E); -  /// Vectorize a single entry in the tree, starting in \p VL.\p OpdNum indicate -  /// the ordinality of operand corrsponding to the \p VL of scalar values for the -  /// user indicated by \p UserIndx this \p VL feeds into. -  Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1); +  /// Vectorize a single entry in the tree, starting in \p VL. +  Value *vectorizeTree(ArrayRef<Value *> VL);    /// \returns the pointer to the vectorized value if \p VL is already    /// vectorized, or NULL. They may happen in cycles. @@ -708,16 +702,6 @@ private:        return std::equal(VL.begin(), VL.end(), Scalars.begin());      } -    /// \returns true if the scalars in VL are found in this tree entry. -    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL, -        ScalarEvolution &SE) const { -      assert(VL.size() == Scalars.size() && "Invalid size"); -      SmallVector<Value *, 8> List; -      if (!sortLoadAccesses(VL, DL, SE, List)) -        return false; -      return std::equal(List.begin(), List.end(), Scalars.begin()); -    } -      /// A vector of scalars.      ValueList Scalars; @@ -727,14 +711,6 @@ private:      /// Do we need to gather this sequence ?      bool NeedToGather = false; -    /// Records optional shuffle mask for the uses of jumbled memory accesses. -    /// For example, a non-empty ShuffleMask[1] represents the permutation of -    /// lanes that operand #1 of this vectorized instruction should undergo -    /// before feeding this vectorized instruction, whereas an empty -    /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized -    /// instruction need not be permuted at all. -    SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask; -      /// Points back to the VectorizableTree.      ///      /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has @@ -750,31 +726,12 @@ private:    /// Create a new VectorizableTree entry.    
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, -                          int &UserTreeIdx, const InstructionsState &S, -                          ArrayRef<unsigned> ShuffleMask = None, -                          int OpdNum = 0) { -    assert((!Vectorized || S.Opcode != 0) && -           "Vectorized TreeEntry without opcode"); +                          int &UserTreeIdx) {      VectorizableTree.emplace_back(VectorizableTree); -      int idx = VectorizableTree.size() - 1;      TreeEntry *Last = &VectorizableTree[idx];      Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());      Last->NeedToGather = !Vectorized; - -    TreeEntry *UserTreeEntry = nullptr; -    if (UserTreeIdx != -1) -      UserTreeEntry = &VectorizableTree[UserTreeIdx]; - -    if (UserTreeEntry && !ShuffleMask.empty()) { -      if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size()) -        UserTreeEntry->ShuffleMask.resize(OpdNum + 1); -      assert(UserTreeEntry->ShuffleMask[OpdNum].empty() && -             "Mask already present"); -      using mask = SmallVector<unsigned, 4>; -      mask tempMask(ShuffleMask.begin(), ShuffleMask.end()); -      UserTreeEntry->ShuffleMask[OpdNum] = tempMask; -    }      if (Vectorized) {        for (int i = 0, e = VL.size(); i != e; ++i) {          assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -1427,34 +1384,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,  }  void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, -                            int UserTreeIdx, int OpdNum) { +                            int UserTreeIdx) {    assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");    InstructionsState S = getSameOpcode(VL);    if (Depth == RecursionMaxDepth) {      DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); -    newTreeEntry(VL, false, UserTreeIdx, S); +    newTreeEntry(VL, false, UserTreeIdx);      return;    }    // Don't handle vectors.    if (S.OpValue->getType()->isVectorTy()) {      DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); -    newTreeEntry(VL, false, UserTreeIdx, S); +    newTreeEntry(VL, false, UserTreeIdx);      return;    }    if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))      if (SI->getValueOperand()->getType()->isVectorTy()) {        DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); -      newTreeEntry(VL, false, UserTreeIdx, S); +      newTreeEntry(VL, false, UserTreeIdx);        return;      }    // If all of the operands are identical or constant we have a simple solution.    if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {      DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); -    newTreeEntry(VL, false, UserTreeIdx, S); +    newTreeEntry(VL, false, UserTreeIdx);      return;    } @@ -1466,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      if (EphValues.count(VL[i])) {        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<              ") is ephemeral.\n"); -      newTreeEntry(VL, false, UserTreeIdx, S); +      newTreeEntry(VL, false, UserTreeIdx);        return;      }    } @@ -1477,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");        if (E->Scalars[i] != VL[i]) {          DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); -        newTreeEntry(VL, false, UserTreeIdx, S); +        newTreeEntry(VL, false, UserTreeIdx);          return;        }      } @@ -1496,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      if (getTreeEntry(I)) {        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<              ") is already in tree.\n"); -      newTreeEntry(VL, false, UserTreeIdx, S); +      newTreeEntry(VL, false, UserTreeIdx);        return;      }    } @@ -1506,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    for (unsigned i = 0, e = VL.size(); i != e; ++i) {      if (MustGather.count(VL[i])) {        DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); -      newTreeEntry(VL, false, UserTreeIdx, S); +      newTreeEntry(VL, false, UserTreeIdx);        return;      }    } @@ -1520,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      // Don't go into unreachable blocks. They may contain instructions with      // dependency cycles which confuse the final scheduling.      
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); -    newTreeEntry(VL, false, UserTreeIdx, S); +    newTreeEntry(VL, false, UserTreeIdx);      return;    } @@ -1529,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      for (unsigned j = i + 1; j < e; ++j)        if (VL[i] == VL[j]) {          DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); -        newTreeEntry(VL, false, UserTreeIdx, S); +        newTreeEntry(VL, false, UserTreeIdx);          return;        } @@ -1544,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      assert((!BS.getScheduleData(VL0) ||              !BS.getScheduleData(VL0)->isPartOfBundle()) &&             "tryScheduleBundle should cancelScheduling on failure"); -    newTreeEntry(VL, false, UserTreeIdx, S); +    newTreeEntry(VL, false, UserTreeIdx);      return;    }    DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1563,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            if (Term) {              DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx, S); +            newTreeEntry(VL, false, UserTreeIdx);              return;            }          } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1578,7 +1535,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(                PH->getIncomingBlock(i))); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      } @@ -1590,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        } else {          BS.cancelScheduling(VL, VL0);        } -      newTreeEntry(VL, Reuse, UserTreeIdx, S); +      newTreeEntry(VL, Reuse, UserTreeIdx);        return;      }      case Instruction::Load: { @@ -1605,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        if (DL->getTypeSizeInBits(ScalarTy) !=            DL->getTypeAllocSizeInBits(ScalarTy)) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx, S); +        newTreeEntry(VL, false, UserTreeIdx);          DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");          return;        } @@ -1616,13 +1573,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          LoadInst *L = cast<LoadInst>(VL[i]);          if (!L->isSimple()) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");            return;          }        }        // Check if the loads are consecutive, reversed, or neither. +      // TODO: What we really want is to sort the loads, but for now, check +      // the two likely directions.        
bool Consecutive = true;        bool ReverseConsecutive = true;        for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { @@ -1636,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        if (Consecutive) {          ++NumLoadsWantToKeepOrder; -        newTreeEntry(VL, true, UserTreeIdx, S); +        newTreeEntry(VL, true, UserTreeIdx);          DEBUG(dbgs() << "SLP: added a vector of loads.\n");          return;        } @@ -1650,41 +1609,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,              break;            } +      BS.cancelScheduling(VL, VL0); +      newTreeEntry(VL, false, UserTreeIdx); +        if (ReverseConsecutive) { -        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");          ++NumLoadsWantToChangeOrder; -        BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx, S); -        return; -      } - -      if (VL.size() > 2) { -        bool ShuffledLoads = true; -        SmallVector<Value *, 8> Sorted; -        SmallVector<unsigned, 4> Mask; -        if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) { -          auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end()); -          for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { -            if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { -              ShuffledLoads = false; -              break; -            } -          } -          // TODO: Tracking how many load wants to have arbitrary shuffled order -          // would be usefull. -          if (ShuffledLoads) { -            DEBUG(dbgs() << "SLP: added a vector of loads which needs " -                            "permutation of loaded lanes.\n"); -            newTreeEntry(NewVL, true, UserTreeIdx, S, -                         makeArrayRef(Mask.begin(), Mask.end()), OpdNum); -            return; -          } -        } +        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); +      } else { +        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");        } - -      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); -      BS.cancelScheduling(VL, VL0); -      newTreeEntry(VL, false, UserTreeIdx, S);        return;      }      case Instruction::ZExt: @@ -1704,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();          if (Ty != SrcTy || !isValidElementType(Ty)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of casts.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1718,7 +1651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      } @@ -1732,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          if (Cmp->getPredicate() != P0 ||              Cmp->getOperand(0)->getType() != ComparedTy) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, 
S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of compares.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1747,7 +1680,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      } @@ -1770,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      case Instruction::And:      case Instruction::Or:      case Instruction::Xor: -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of bin op.\n");        // Sort operands of the instructions so that each side is more likely to @@ -1779,7 +1712,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          ValueList Left, Right;          reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx); -        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); +        buildTree_rec(Right, Depth + 1, UserTreeIdx);          return;        } @@ -1789,7 +1722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return; @@ -1799,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          if (cast<Instruction>(VL[j])->getNumOperands() != 2) {            DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            return;          }        } @@ -1812,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          if (Ty0 != CurTy) {            DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            return;          }        } @@ -1824,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            DEBUG(                dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");        for (unsigned i = 0, e = 2; i < e; ++i) {          ValueList Operands; @@ -1837,7 +1770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      
} @@ -1846,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)          if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: Non-consecutive store.\n");            return;          } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a vector of stores.\n");        ValueList Operands; @@ -1869,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);        if (!isTriviallyVectorizable(ID)) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx, S); +        newTreeEntry(VL, false, UserTreeIdx);          DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");          return;        } @@ -1883,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,              getVectorIntrinsicIDForCall(CI2, TLI) != ID ||              !CI->hasIdenticalOperandBundleSchema(*CI2)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]                         << "\n");            return; @@ -1894,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            Value *A1J = CI2->getArgOperand(1);            if (A1I != A1J) {              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx, S); +            newTreeEntry(VL, false, UserTreeIdx);              DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI                           << " argument "<< A1I<<"!=" << A1J                           << "\n"); @@ -1907,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,                          CI->op_begin() + CI->getBundleOperandsEndIndex(),                          CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx, S); +          newTreeEntry(VL, false, UserTreeIdx);            DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="                         << *VL[i] << '\n');            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {          ValueList Operands;          // Prepare the operand vector. @@ -1922,7 +1855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            CallInst *CI2 = dyn_cast<CallInst>(j);            Operands.push_back(CI2->getArgOperand(i));          } -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      } @@ -1931,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        // then do not vectorize this instruction.        
if (!S.IsAltShuffle) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx, S); +        newTreeEntry(VL, false, UserTreeIdx);          DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");          return;        } -      newTreeEntry(VL, true, UserTreeIdx, S); +      newTreeEntry(VL, true, UserTreeIdx);        DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");        // Reorder operands if reordering would enable vectorization. @@ -1943,7 +1876,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          ValueList Left, Right;          reorderAltShuffleOperands(S.Opcode, VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx); -        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); +        buildTree_rec(Right, Depth + 1, UserTreeIdx);          return;        } @@ -1953,13 +1886,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);        }        return;      default:        BS.cancelScheduling(VL, VL0); -      newTreeEntry(VL, false, UserTreeIdx, S); +      newTreeEntry(VL, false, UserTreeIdx);        DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");        return;    } @@ -2797,20 +2730,12 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {    return nullptr;  } -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) { +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {    InstructionsState S = getSameOpcode(VL);    if (S.Opcode) {      if (TreeEntry *E = getTreeEntry(S.OpValue)) { -      TreeEntry *UserTreeEntry = nullptr; -      if (UserIndx != -1) -        UserTreeEntry = &VectorizableTree[UserIndx]; - -      if (E->isSame(VL) || -          (UserTreeEntry && -           (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && -           !UserTreeEntry->ShuffleMask[OpdNum].empty() && -           E->isFoundJumbled(VL, *DL, *SE))) -        return vectorizeTree(E, OpdNum, UserIndx); +      if (E->isSame(VL)) +        return vectorizeTree(E);      }    } @@ -2822,10 +2747,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {    return Gather(VL, VecTy);  } -Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E) {    IRBuilder<>::InsertPointGuard Guard(Builder); -  TreeEntry *UserTreeEntry = nullptr;    if (E->VectorizedValue) {      DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");      return E->VectorizedValue; @@ -2845,10 +2769,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {      return V;    } -  assert(ScalarToTreeEntry.count(E->Scalars[0]) && -         "Expected user tree entry, missing!"); -  int CurrIndx = ScalarToTreeEntry[E->Scalars[0]]; -    unsigned ShuffleOrOp = S.IsAltShuffle ?             
(unsigned) Instruction::ShuffleVector : S.Opcode;    switch (ShuffleOrOp) { @@ -2878,7 +2798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {          Builder.SetInsertPoint(IBB->getTerminator());          Builder.SetCurrentDebugLocation(PH->getDebugLoc()); -        Value *Vec = vectorizeTree(Operands, i, CurrIndx); +        Value *Vec = vectorizeTree(Operands);          NewPhi->addIncoming(Vec, IBB);        } @@ -2931,7 +2851,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *InVec = vectorizeTree(INVL, 0, CurrIndx); +      Value *InVec = vectorizeTree(INVL);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -2952,8 +2872,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *L = vectorizeTree(LHSV, 0, CurrIndx); -      Value *R = vectorizeTree(RHSV, 1, CurrIndx); +      Value *L = vectorizeTree(LHSV); +      Value *R = vectorizeTree(RHSV);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -2980,9 +2900,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *Cond = vectorizeTree(CondVec, 0, CurrIndx); -      Value *True = vectorizeTree(TrueVec, 1, CurrIndx); -      Value *False = vectorizeTree(FalseVec, 2, CurrIndx); +      Value *Cond = vectorizeTree(CondVec); +      Value *True = vectorizeTree(TrueVec); +      Value *False = vectorizeTree(FalseVec);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -3023,8 +2943,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); -      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); +      Value *LHS = vectorizeTree(LHSVL); +      Value *RHS = vectorizeTree(RHSVL);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -3045,20 +2965,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        // sink them all the way down past store instructions.        
setInsertPointAfterBundle(E->Scalars, VL0); -      if (UserIndx != -1) -        UserTreeEntry = &VectorizableTree[UserIndx]; - -      bool isJumbled = false; -      LoadInst *LI = NULL; -      if (UserTreeEntry && -          (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && -          !UserTreeEntry->ShuffleMask[OpdNum].empty()) { -        isJumbled = true; -        LI = cast<LoadInst>(E->Scalars[0]); -      } else { -        LI = cast<LoadInst>(VL0); -      } - +      LoadInst *LI = cast<LoadInst>(VL0);        Type *ScalarLoadTy = LI->getType();        unsigned AS = LI->getPointerAddressSpace(); @@ -3080,21 +2987,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        LI->setAlignment(Alignment);        E->VectorizedValue = LI;        ++NumVectorInstructions; -      propagateMetadata(LI, E->Scalars); - -      if (isJumbled) { -        SmallVector<Constant *, 8> Mask; -        for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum]) -          Mask.push_back(Builder.getInt32(LaneEntry)); -        // Generate shuffle for jumbled memory access -        Value *Undef = UndefValue::get(VecTy); -        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, -                                                  ConstantVector::get(Mask)); -        E->VectorizedValue = Shuf; -        ++NumVectorInstructions; -        return Shuf; -      } -      return LI; +      return propagateMetadata(LI, E->Scalars);      }      case Instruction::Store: {        StoreInst *SI = cast<StoreInst>(VL0); @@ -3107,7 +3000,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx); +      Value *VecValue = vectorizeTree(ScalarStoreValues);        Value *ScalarPtr = SI->getPointerOperand();        Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));        StoreInst *S = Builder.CreateStore(VecValue, VecPtr); @@ -3133,7 +3026,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        for (Value *V : E->Scalars)          Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); -      Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx); +      Value *Op0 = vectorizeTree(Op0VL);        std::vector<Value *> OpVecs;        for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; @@ -3142,7 +3035,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {          for (Value *V : E->Scalars)            OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); -        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); +        Value *OpVec = vectorizeTree(OpVL);          OpVecs.push_back(OpVec);        } @@ -3181,7 +3074,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {            OpVL.push_back(CEI->getArgOperand(j));          } -        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); +        Value *OpVec = vectorizeTree(OpVL);          DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");          OpVecs.push_back(OpVec);        } @@ -3212,8 +3105,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {        reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);        setInsertPointAfterBundle(E->Scalars, VL0); -      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); -      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); +      Value *LHS = vectorizeTree(LHSVL); +      Value *RHS = 
vectorizeTree(RHSVL);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -3313,14 +3206,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {        continue;      TreeEntry *E = getTreeEntry(Scalar);      assert(E && "Invalid scalar"); -    assert((!E->NeedToGather) && "Extracting from a gather list"); +    assert(!E->NeedToGather && "Extracting from a gather list"); -    Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue); -    if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) { -      Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0); -    } else { -      Vec = E->VectorizedValue; -    } +    Value *Vec = E->VectorizedValue;      assert(Vec && "Can't find vectorizable value");      Value *Lane = Builder.getInt32(ExternalUse.Lane); @@ -4017,6 +3905,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,    // seed additional demotion, we save the truncated value.    case Instruction::Trunc:      Roots.push_back(I->getOperand(0)); +    break;    case Instruction::ZExt:    case Instruction::SExt:      break;
