Diffstat (limited to 'lib/CodeGen')
154 files changed, 13182 insertions, 5237 deletions
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index bb908618b679..955524c2a676 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -163,9 +163,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {    // callee-saved register that is not saved in the prolog.    const MachineFrameInfo &MFI = MF.getFrameInfo();    BitVector Pristine = MFI.getPristineRegs(MF); -  for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { +  for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; +       ++I) {      unsigned Reg = *I; -    if (!IsReturnBlock && !Pristine.test(Reg)) continue; +    if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) +      continue;      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {        unsigned AliasReg = *AI;        State->UnionGroups(AliasReg, 0); diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 79ecc4308fe7..09a37a77e9fb 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,    bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;    ADS = true; -  AttrBuilder CallerAttrs(F->getAttributes(), -                          AttributeSet::ReturnIndex); +  AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);    AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(), -                          AttributeSet::ReturnIndex); +                          AttributeList::ReturnIndex);    // Noalias is completely benign as far as calling convention goes, it    // shouldn't affect whether the call is a tail call. @@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,    return true;  } -bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) { -  if (!GV->hasLinkOnceODRLinkage()) -    return false; - -  // We assume that anyone who sets global unnamed_addr on a non-constant knows -  // what they're doing. -  if (GV->hasGlobalUnnamedAddr()) -    return true; - -  // If it is a non constant variable, it needs to be uniqued across shared -  // objects. 
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) { -    if (!Var->isConstant()) -      return false; -  } - -  return GV->hasAtLeastLocalUnnamedAddr(); -} -  static void collectFuncletMembers(      DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,      const MachineBasicBlock *MBB) { diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 24fdbfc901fd..6c18d56b8272 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,48 +11,102 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/AsmPrinter.h" +#include "AsmPrinterHandler.h"  #include "CodeViewDebug.h"  #include "DwarfDebug.h"  #include "DwarfException.h"  #include "WinException.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h"  #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ObjectUtils.h"  #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h"  #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineConstantPool.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBundle.h"  #include "llvm/CodeGen/MachineJumpTableInfo.h"  #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h"  #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalIFunc.h" +#include "llvm/IR/GlobalIndirectSymbol.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/Operator.h" +#include "llvm/IR/Value.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h"  #include "llvm/MC/MCExpr.h"  #include "llvm/MC/MCInst.h"  #include "llvm/MC/MCSection.h"  #include "llvm/MC/MCSectionELF.h"  #include "llvm/MC/MCSectionMachO.h"  #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetOptions.h"  #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/Format.h"  #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Support/TargetRegistry.h"  
#include "llvm/Support/Timer.h"  #include "llvm/Target/TargetFrameLowering.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetLowering.h"  #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cinttypes> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <utility> +#include <vector> +  using namespace llvm;  #define DEBUG_TYPE "asm-printer" @@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription =  STATISTIC(EmittedInsts, "Number of machine instrs printed"); +static cl::opt<bool> +    PrintSchedule("print-schedule", cl::Hidden, cl::init(false), +                  cl::desc("Print 'sched: [latency:throughput]' in .s output")); +  char AsmPrinter::ID = 0;  typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type; @@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) {    return *(gcp_map_type*)P;  } -  /// getGVAlignmentLog2 - Return the alignment to use for the specified global  /// value in log2 form.  This rounds up to the preferred alignment if possible  /// and legal. @@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,  AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)      : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), -      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), -      isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { -  DD = nullptr; -  MMI = nullptr; -  LI = nullptr; -  MF = nullptr; -  CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr; -  CurrentFnBegin = nullptr; -  CurrentFnEnd = nullptr; -  GCMetadataPrinters = nullptr; +      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) {    VerboseAsm = OutStreamer->isVerboseAsm();  } @@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {    AU.setPreservesAll();    MachineFunctionPass::getAnalysisUsage(AU);    AU.addRequired<MachineModuleInfo>(); +  AU.addRequired<MachineOptimizationRemarkEmitterPass>();    AU.addRequired<GCModuleInfo>();    if (isVerbose())      AU.addRequired<MachineLoopInfo>(); @@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) {    // don't, this at least helps the user find where a global came from.    if (MAI->hasSingleParameterDotFile()) {      // .file "foo.c" -    OutStreamer->EmitFileDirective(M.getModuleIdentifier()); +    OutStreamer->EmitFileDirective(M.getSourceFileName());    }    GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); @@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {  ///  /// \p Value - The value to emit.  /// \p Size - The size of the integer (in bytes) to emit. -void AsmPrinter::EmitDebugValue(const MCExpr *Value, +void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,                                        unsigned Size) const {    OutStreamer->EmitValue(Value, Size);  } @@ -602,8 +651,23 @@ void AsmPrinter::EmitFunctionHeader() {    }    // Emit the prefix data. -  if (F->hasPrefixData()) -    EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); +  if (F->hasPrefixData()) { +    if (MAI->hasSubsectionsViaSymbols()) { +      // Preserving prefix data on platforms which use subsections-via-symbols +      // is a bit tricky. 
Here we introduce a symbol for the prefix data +      // and use the .alt_entry attribute to mark the function's real entry point +      // as an alternative entry point to the prefix-data symbol. +      MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol(); +      OutStreamer->EmitLabel(PrefixSym); + +      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + +      // Emit an .alt_entry directive for the actual function symbol. +      OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry); +    } else { +      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); +    } +  }    // Emit the CurrentFnSym.  This is a virtual function to allow targets to    // do their wild and crazy things as required. @@ -660,7 +724,8 @@ void AsmPrinter::EmitFunctionEntryLabel() {  }  /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS, +                         AsmPrinter *AP) {    const MachineFunction *MF = MI.getParent()->getParent();    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -668,6 +733,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {    int FI;    const MachineFrameInfo &MFI = MF->getFrameInfo(); +  bool Commented = false;    // We assume a single instruction only has a spill or reload, not    // both. @@ -675,24 +741,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {    if (TII->isLoadFromStackSlotPostFE(MI, FI)) {      if (MFI.isSpillSlotObjectIndex(FI)) {        MMO = *MI.memoperands_begin(); -      CommentOS << MMO->getSize() << "-byte Reload\n"; +      CommentOS << MMO->getSize() << "-byte Reload"; +      Commented = true;      }    } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) { -    if (MFI.isSpillSlotObjectIndex(FI)) -      CommentOS << MMO->getSize() << "-byte Folded Reload\n"; +    if (MFI.isSpillSlotObjectIndex(FI)) { +      CommentOS << MMO->getSize() << "-byte Folded Reload"; +      Commented = true; +    }    } else if (TII->isStoreToStackSlotPostFE(MI, FI)) {      if (MFI.isSpillSlotObjectIndex(FI)) {        MMO = *MI.memoperands_begin(); -      CommentOS << MMO->getSize() << "-byte Spill\n"; +      CommentOS << MMO->getSize() << "-byte Spill"; +      Commented = true;      }    } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) { -    if (MFI.isSpillSlotObjectIndex(FI)) -      CommentOS << MMO->getSize() << "-byte Folded Spill\n"; +    if (MFI.isSpillSlotObjectIndex(FI)) { +      CommentOS << MMO->getSize() << "-byte Folded Spill"; +      Commented = true; +    }    }    // Check for spill-induced copies -  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) -    CommentOS << " Reload Reuse\n"; +  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) { +    Commented = true; +    CommentOS << " Reload Reuse"; +  } + +  if (Commented && AP->EnablePrintSchedInfo) +    // If any comment was added above and we need sched info comment then +    // add this new comment just after the above comment w/o "\n" between them. +    CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n"; +  else if (Commented) +    CommentOS << "\n";  }  /// emitImplicitDef - This method emits the specified machine instruction @@ -883,6 +964,7 @@ void AsmPrinter::EmitFunctionBody() {    // Print out code for the function.    
bool HasAnyRealCode = false; +  int NumInstsInFunction = 0;    for (auto &MBB : *MF) {      // Print a label for the basic block.      EmitBasicBlockStart(MBB); @@ -892,7 +974,7 @@ void AsmPrinter::EmitFunctionBody() {        if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&            !MI.isDebugValue()) {          HasAnyRealCode = true; -        ++EmittedInsts; +        ++NumInstsInFunction;        }        if (ShouldPrintDebugScopes) { @@ -905,7 +987,7 @@ void AsmPrinter::EmitFunctionBody() {        }        if (isVerbose()) -        emitComments(MI, OutStreamer->GetCommentOS()); +        emitComments(MI, OutStreamer->GetCommentOS(), this);        switch (MI.getOpcode()) {        case TargetOpcode::CFI_INSTRUCTION: @@ -953,6 +1035,14 @@ void AsmPrinter::EmitFunctionBody() {      EmitBasicBlockEnd(MBB);    } +  EmittedInsts += NumInstsInFunction; +  MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount", +                                      MF->getFunction()->getSubprogram(), +                                      &MF->front()); +  R << ore::NV("NumInstructions", NumInstsInFunction) +    << " instructions in function"; +  ORE->emit(R); +    // If the function is empty and the object file uses .subsections_via_symbols,    // then we need to emit *something* to the function body to prevent the    // labels from collapsing together.  Just emit a noop. @@ -1238,7 +1328,7 @@ bool AsmPrinter::doFinalization(Module &M) {          break;        AliasStack.push_back(Cur);      } -    for (const GlobalAlias *AncestorAlias : reverse(AliasStack)) +    for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))        emitGlobalIndirectSymbol(M, *AncestorAlias);      AliasStack.clear();    } @@ -1311,19 +1401,28 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {        CurrentFnSymForSize = CurrentFnBegin;    } +  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();    if (isVerbose())      LI = &getAnalysis<MachineLoopInfo>(); + +  const TargetSubtargetInfo &STI = MF.getSubtarget(); +  EnablePrintSchedInfo = PrintSchedule.getNumOccurrences() +                             ? PrintSchedule +                             : STI.supportPrintSchedInfo();  }  namespace { +  // Keep track the alignment, constpool entries per Section.    struct SectionCPs {      MCSection *S;      unsigned Alignment;      SmallVector<unsigned, 4> CPEs; +      SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}    }; -} + +} // end anonymous namespace  /// EmitConstantPool - Print to the current output stream assembly  /// representations of the constants in the constant pool MCP. This is @@ -1547,7 +1646,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,    OutStreamer->EmitValue(Value, EntrySize);  } -  /// EmitSpecialLLVMGlobal - Check to see if the specified global is a  /// special global used by LLVM.  If so, emit it and return true, otherwise  /// do nothing and return false. @@ -1598,13 +1696,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {  }  namespace { +  struct Structor { -  Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {} -  int Priority; -  llvm::Constant *Func; -  llvm::GlobalValue *ComdatKey; +  int Priority = 0; +  Constant *Func = nullptr; +  GlobalValue *ComdatKey = nullptr; + +  Structor() = default;  }; -} // end namespace + +}  // end anonymous namespace  /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init  /// priority. 
@@ -1653,8 +1754,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,      const TargetLoweringObjectFile &Obj = getObjFileLowering();      const MCSymbol *KeySym = nullptr;      if (GlobalValue *GV = S.ComdatKey) { -      if (GV->hasAvailableExternallyLinkage()) -        // If the associated variable is available_externally, some other TU +      if (GV->isDeclarationForLinker()) +        // If the associated variable is not defined in this module +        // (it might be available_externally, or have been an +        // available_externally definition that was dropped by the +        // EliminateAvailableExternally pass), some other TU          // will provide its dynamic initializer.          continue; @@ -1931,7 +2035,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {    return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.  } -  /// isRepeatedByteSequence - Determine whether the given value is  /// composed of a repeated sequence of identical bytes and return the  /// byte value.  If it is not a repeated sequence, return -1. @@ -1972,7 +2075,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {  static void emitGlobalConstantDataSequential(const DataLayout &DL,                                               const ConstantDataSequential *CDS,                                               AsmPrinter &AP) { -    // See if we can aggregate this into a .fill, if so, emit it as such.    int Value = isRepeatedByteSequence(CDS, DL);    if (Value != -1) { @@ -2006,7 +2108,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,                          CDS->getNumElements();    if (unsigned Padding = Size - EmittedSize)      AP.OutStreamer->EmitZeros(Padding); -  }  static void emitGlobalConstantArray(const DataLayout &DL, @@ -2420,8 +2521,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {    return OutContext.getOrCreateSymbol(NameStr);  } - -  /// PrintParentLoopComment - Print comments about parent loops of this one.  static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,                                     unsigned FunctionNumber) { @@ -2486,7 +2585,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,    PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());  } -  /// EmitBasicBlockStart - This method prints the label for the specified  /// MachineBasicBlock, an alignment (if present) and a comment describing  /// it if appropriate. @@ -2607,8 +2705,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {    return true;  } - -  GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {    if (!S.usesMetadata())      return nullptr; @@ -2639,7 +2735,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {  }  /// Pin vtable to this file. 
-AsmPrinterHandler::~AsmPrinterHandler() {} +AsmPrinterHandler::~AsmPrinterHandler() = default;  void AsmPrinterHandler::markFunctionEnd() {} @@ -2702,8 +2798,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,    SledKind Kind) {    auto Fn = MI.getParent()->getParent()->getFunction();    auto Attr = Fn->getFnAttribute("function-instrument"); +  bool LogArgs = Fn->hasFnAttribute("xray-log-args");    bool AlwaysInstrument =      Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always"; +  if (Kind == SledKind::FUNCTION_ENTER && LogArgs) +    Kind = SledKind::LOG_ARGS_ENTER;    Sleds.emplace_back(      XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn });  } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 57864e4e4d4f..683e622e3d53 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -40,25 +40,24 @@ using namespace llvm;  #define DEBUG_TYPE "asm-printer" -namespace { -  struct SrcMgrDiagInfo { -    const MDNode *LocInfo; -    LLVMContext::InlineAsmDiagHandlerTy DiagHandler; -    void *DiagContext; -  }; -} -  /// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an  /// inline asm has an error in it.  diagInfo is a pointer to the SrcMgrDiagInfo  /// struct above.  static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) { -  SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo); +  AsmPrinter::SrcMgrDiagInfo *DiagInfo = +      static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);    assert(DiagInfo && "Diagnostic context not passed down?"); +  // Look up a LocInfo for the buffer this diagnostic is coming from. +  unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc()); +  const MDNode *LocInfo = nullptr; +  if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size()) +    LocInfo = DiagInfo->LocInfos[BufNum-1]; +    // If the inline asm had metadata associated with it, pull out a location    // cookie corresponding to which line the error occurred on.    unsigned LocCookie = 0; -  if (const MDNode *LocInfo = DiagInfo->LocInfo) { +  if (LocInfo) {      unsigned ErrorLine = Diag.getLineNo()-1;      if (ErrorLine >= LocInfo->getNumOperands())        ErrorLine = 0; @@ -99,35 +98,39 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,      return;    } -  SourceMgr SrcMgr; -  SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); +  if (!DiagInfo) { +    DiagInfo = make_unique<SrcMgrDiagInfo>(); -  SrcMgrDiagInfo DiagInfo; - -  // If the current LLVMContext has an inline asm handler, set it in SourceMgr. -  LLVMContext &LLVMCtx = MMI->getModule()->getContext(); -  bool HasDiagHandler = false; -  if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) { -    // If the source manager has an issue, we arrange for srcMgrDiagHandler -    // to be invoked, getting DiagInfo passed into it. 
-    DiagInfo.LocInfo = LocMDNode; -    DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler(); -    DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext(); -    SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo); -    HasDiagHandler = true; +    MCContext &Context = MMI->getContext(); +    Context.setInlineSourceManager(&DiagInfo->SrcMgr); + +    LLVMContext &LLVMCtx = MMI->getModule()->getContext(); +    if (LLVMCtx.getInlineAsmDiagnosticHandler()) { +      DiagInfo->DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler(); +      DiagInfo->DiagContext = LLVMCtx.getInlineAsmDiagnosticContext(); +      DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get()); +    }    } +  SourceMgr &SrcMgr = DiagInfo->SrcMgr; +  SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); +    std::unique_ptr<MemoryBuffer> Buffer; -  if (isNullTerminated) -    Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>"); -  else -    Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>"); +  // The inline asm source manager will outlive Str, so make a copy of the +  // string for SourceMgr to own. +  Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");    // Tell SrcMgr about this buffer, it takes ownership of the buffer. -  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); +  unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); + +  // Store LocMDNode in DiagInfo, using BufNum as an identifier. +  if (LocMDNode) { +    DiagInfo->LocInfos.resize(BufNum); +    DiagInfo->LocInfos[BufNum-1] = LocMDNode; +  }    std::unique_ptr<MCAsmParser> Parser( -      createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI)); +      createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));    // We create a new MCInstrInfo here since we might be at the module level    // and not have a MachineFunction to initialize the TargetInstrInfo from and @@ -151,7 +154,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,    int Res = Parser->Run(/*NoInitialTextSection*/ true,                          /*NoFinalize*/ true);    emitInlineAsmEnd(STI, &TAP->getSTI()); -  if (Res && !HasDiagHandler) + +  if (Res && !DiagInfo->DiagHandler)      report_fatal_error("Error parsing inline asm\n");  } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 83440513225c..383b8cddb1a0 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -23,13 +23,13 @@  #include "llvm/DebugInfo/CodeView/TypeIndex.h"  #include "llvm/DebugInfo/CodeView/TypeRecord.h"  #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/MSF/ByteStream.h" -#include "llvm/DebugInfo/MSF/StreamReader.h"  #include "llvm/IR/Constants.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCExpr.h"  #include "llvm/MC/MCSectionCOFF.h"  #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/BinaryByteStream.h" +#include "llvm/Support/BinaryStreamReader.h"  #include "llvm/Support/COFF.h"  #include "llvm/Support/ScopedPrinter.h"  #include "llvm/Target/TargetFrameLowering.h" @@ -38,7 +38,6 @@  using namespace llvm;  using namespace llvm::codeview; -using namespace llvm::msf;  CodeViewDebug::CodeViewDebug(AsmPrinter *AP)      : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(), @@ -495,9 +494,9 @@ void CodeViewDebug::emitTypeInformation() {        // comments. 
The MSVC linker doesn't do much type record validation,        // so the first link of an invalid type record can succeed while        // subsequent links will fail with LNK1285. -      ByteStream Stream(Record); +      BinaryByteStream Stream(Record, llvm::support::little);        CVTypeArray Types; -      StreamReader Reader(Stream); +      BinaryStreamReader Reader(Stream);        Error E = Reader.readArray(Types, Reader.getLength());        if (!E) {          TypeVisitorCallbacks C; @@ -948,10 +947,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {        // Handle fragments.        auto Fragment = DIExpr->getFragmentInfo(); -      if (DIExpr && Fragment) { +      if (Fragment) {          IsSubfield = true;          StructOffset = Fragment->OffsetInBits / 8; -      } else if (DIExpr && DIExpr->getNumElements() > 0) { +      } else if (DIExpr->getNumElements() > 0) {          continue; // Ignore unrecognized exprs.        } @@ -1014,14 +1013,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {    }  } -void CodeViewDebug::beginFunction(const MachineFunction *MF) { -  assert(!CurFn && "Can't process two functions at once!"); - -  if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) -    return; - -  DebugHandlerBase::beginFunction(MF); - +void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {    const Function *GV = MF->getFunction();    assert(FnDebugInfo.count(GV) == false);    CurFn = &FnDebugInfo[GV]; @@ -1150,27 +1142,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {    uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8; - -  // We want to assert that the element type multiplied by the array lengths -  // match the size of the overall array. However, if we don't have complete -  // type information for the base type, we can't make this assertion. This -  // happens if limited debug info is enabled in this case: -  //   struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); }; -  //   VTableOptzn array[3]; -  // The DICompositeType of VTableOptzn will have size zero, and the array will -  // have size 3 * sizeof(void*), and we should avoid asserting. -  // -  // There is a related bug in the front-end where an array of a structure, -  // which was declared as incomplete structure first, ends up not getting a -  // size assigned to it. (PR28303) -  // Example: -  //   struct A(*p)[3]; -  //   struct A { int f; } a[3]; -  bool PartiallyIncomplete = false; -  if (Ty->getSizeInBits() == 0 || ElementSize == 0) { -    PartiallyIncomplete = true; -  } -    // Add subranges to array type.    DINodeArray Elements = Ty->getElements();    for (int i = Elements.size() - 1; i >= 0; --i) { @@ -1185,16 +1156,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {      // Variable Length Array (VLA) has Count equal to '-1'.      // Replace with Count '1', assume it is the minimum VLA length.      // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU. -    if (Count == -1) { +    if (Count == -1)        Count = 1; -      PartiallyIncomplete = true; -    }      // Update the element size and element type index for subsequent subranges.      ElementSize *= Count;      // If this is the outermost array, use the size from the array. It will be -    // more accurate if PartiallyIncomplete is true. +    // more accurate if we had a VLA or an incomplete element type size.      uint64_t ArraySize =          (i == 0 && ElementSize == 0) ? 
Ty->getSizeInBits() / 8 : ElementSize; @@ -1203,9 +1172,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {      ElementTypeIndex = TypeTable.writeKnownType(AR);    } -  (void)PartiallyIncomplete; -  assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8)); -    return ElementTypeIndex;  } @@ -2115,18 +2081,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {    }  } -void CodeViewDebug::endFunction(const MachineFunction *MF) { -  if (!Asm || !CurFn)  // We haven't created any debug info for this function. -    return; - +void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {    const Function *GV = MF->getFunction();    assert(FnDebugInfo.count(GV));    assert(CurFn == &FnDebugInfo[GV]);    collectVariableInfo(GV->getSubprogram()); -  DebugHandlerBase::endFunction(MF); -    // Don't emit anything if we don't have any line tables.    if (!CurFn->HaveLineInfo) {      FnDebugInfo.erase(GV); diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 3dd4315e4c2f..343384c51772 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -299,6 +299,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {    unsigned getPointerSizeInBytes(); +protected: +  /// \brief Gather pre-function debug information. +  void beginFunctionImpl(const MachineFunction *MF) override; + +  /// \brief Gather post-function debug information. +  void endFunctionImpl(const MachineFunction *) override; +  public:    CodeViewDebug(AsmPrinter *Asm); @@ -307,12 +314,6 @@ public:    /// \brief Emit the COFF section that holds the line table information.    void endModule() override; -  /// \brief Gather pre-function debug information. -  void beginFunction(const MachineFunction *MF) override; - -  /// \brief Gather post-function debug information. -  void endFunction(const MachineFunction *) override; -    /// \brief Process beginning of an instruction.    void beginInstruction(const MachineInstr *MI) override;  }; diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 879918995472..b510e0ef36ac 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -42,6 +42,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {    // overloads.  Otherwise MSVC 2010 thinks this call is ambiguous.    
ID.AddInteger(unsigned(Attribute));    ID.AddInteger(unsigned(Form)); +  if (Form == dwarf::DW_FORM_implicit_const) +    ID.AddInteger(Value);  }  //===----------------------------------------------------------------------===// @@ -107,13 +109,20 @@ void DIEAbbrev::print(raw_ostream &O) {      O << "  "        << dwarf::AttributeString(Data[i].getAttribute())        << "  " -      << dwarf::FormEncodingString(Data[i].getForm()) -      << '\n'; +      << dwarf::FormEncodingString(Data[i].getForm()); + +    if (Data[i].getForm() == dwarf::DW_FORM_implicit_const) +      O << " " << Data[i].getValue(); + +    O << '\n';    }  } -LLVM_DUMP_METHOD -void DIEAbbrev::dump() { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEAbbrev::dump() { +  print(dbgs()); +} +#endif  //===----------------------------------------------------------------------===//  // DIEAbbrevSet Implementation @@ -249,10 +258,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const {    O << "\n";  } -LLVM_DUMP_METHOD -void DIE::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIE::dump() {    print(dbgs());  } +#endif  unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,                                         DIEAbbrevSet &AbbrevSet, @@ -340,10 +350,11 @@ void DIEValue::print(raw_ostream &O) const {    }  } -LLVM_DUMP_METHOD -void DIEValue::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEValue::dump() const {    print(dbgs());  } +#endif  //===----------------------------------------------------------------------===//  // DIEInteger Implementation @@ -354,57 +365,42 @@ void DIEValue::dump() const {  void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {    switch (Form) {    case dwarf::DW_FORM_implicit_const: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_flag_present:      // Emit something to keep the lines and comments in sync.      // FIXME: Is there a better way to do this?      
Asm->OutStreamer->AddBlankLine();      return;    case dwarf::DW_FORM_flag: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref1: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_data1: -    LLVM_FALLTHROUGH; +  case dwarf::DW_FORM_strx1: +  case dwarf::DW_FORM_addrx1:    case dwarf::DW_FORM_ref2: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_data2: -    LLVM_FALLTHROUGH; +  case dwarf::DW_FORM_strx2: +  case dwarf::DW_FORM_addrx2:    case dwarf::DW_FORM_strp: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref4: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_data4: -    LLVM_FALLTHROUGH; +  case dwarf::DW_FORM_ref_sup4: +  case dwarf::DW_FORM_strx4: +  case dwarf::DW_FORM_addrx4:    case dwarf::DW_FORM_ref8: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref_sig8: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_data8: -    LLVM_FALLTHROUGH; +  case dwarf::DW_FORM_ref_sup8:    case dwarf::DW_FORM_GNU_ref_alt: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_GNU_strp_alt: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_line_strp: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_sec_offset: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_strp_sup: -    LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_ref_sup: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_addr: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref_addr:      Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));      return;    case dwarf::DW_FORM_GNU_str_index: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_GNU_addr_index: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref_udata: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_udata:      Asm->EmitULEB128(Integer);      return; @@ -419,35 +415,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {  ///  unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {    switch (Form) { -  case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_flag_present: return 0; -  case dwarf::DW_FORM_flag:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_ref1:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_data1: return sizeof(int8_t); -  case dwarf::DW_FORM_ref2:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_data2: return sizeof(int16_t); -  case dwarf::DW_FORM_ref4:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_data4: return sizeof(int32_t); -  case dwarf::DW_FORM_ref8:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_ref_sig8:  LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_data8: return sizeof(int64_t); +  case dwarf::DW_FORM_implicit_const: +  case dwarf::DW_FORM_flag_present: +    return 0; +  case dwarf::DW_FORM_flag: +  case dwarf::DW_FORM_ref1: +  case dwarf::DW_FORM_data1: +  case dwarf::DW_FORM_strx1: +  case dwarf::DW_FORM_addrx1: +    return sizeof(int8_t); +  case dwarf::DW_FORM_ref2: +  case dwarf::DW_FORM_data2: +  case dwarf::DW_FORM_strx2: +  case dwarf::DW_FORM_addrx2: +    return sizeof(int16_t); +  case dwarf::DW_FORM_ref4: +  case dwarf::DW_FORM_data4: +  case dwarf::DW_FORM_ref_sup4: +  case dwarf::DW_FORM_strx4: +  case dwarf::DW_FORM_addrx4: +    return sizeof(int32_t); +  case dwarf::DW_FORM_ref8: +  case dwarf::DW_FORM_ref_sig8: +  case dwarf::DW_FORM_data8: +  case dwarf::DW_FORM_ref_sup8: +    return sizeof(int64_t);    case dwarf::DW_FORM_ref_addr:      if (AP->getDwarfVersion() == 2)        return AP->getPointerSize();      LLVM_FALLTHROUGH;    case dwarf::DW_FORM_strp: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_GNU_ref_alt: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_GNU_strp_alt: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_line_strp: -    
LLVM_FALLTHROUGH;    case dwarf::DW_FORM_sec_offset: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_strp_sup: -    LLVM_FALLTHROUGH; -  case dwarf::DW_FORM_ref_sup:      switch (AP->OutStreamer->getContext().getDwarfFormat()) {      case dwarf::DWARF32:        return 4; @@ -456,11 +458,8 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {      }      llvm_unreachable("Invalid DWARF format");    case dwarf::DW_FORM_GNU_str_index: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_GNU_addr_index: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_ref_udata: -    LLVM_FALLTHROUGH;    case dwarf::DW_FORM_udata:      return getULEB128Size(Integer);    case dwarf::DW_FORM_sdata: @@ -484,7 +483,7 @@ void DIEInteger::print(raw_ostream &O) const {  /// EmitValue - Emit expression value.  ///  void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { -  AP->EmitDebugValue(Expr, SizeOf(AP, Form)); +  AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form));  }  /// SizeOf - Determine size of expression value in bytes. diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index d8ecc7ccfb9b..8e3b88d0af0e 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -490,9 +490,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) {    Hash.final(Result);    // ... take the least significant 8 bytes and return those. Our MD5 -  // implementation always returns its results in little endian, swap bytes -  // appropriately. -  return support::endian::read64le(Result + 8); +  // implementation always returns its results in little endian, so we actually +  // need the "high" word. +  return Result.high();  }  /// This is based on the type signature computation given in section 7.27 of the @@ -514,7 +514,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) {    Hash.final(Result);    // ... take the least significant 8 bytes and return those. Our MD5 -  // implementation always returns its results in little endian, swap bytes -  // appropriately. -  return support::endian::read64le(Result + 8); +  // implementation always returns its results in little endian, so we actually +  // need the "high" word. +  return Result.high();  } diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 94190981e88e..1d63e33a4d33 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {    return getBaseTypeSize(BaseType);  } +bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) { +  if (!MMI->hasDebugInfo()) +    return false; +  auto *SP = MF->getFunction()->getSubprogram(); +  if (!SP) +    return false; +  assert(SP->getUnit()); +  auto EK = SP->getUnit()->getEmissionKind(); +  if (EK == DICompileUnit::NoDebug) +    return false; +  return true; +} +  void DebugHandlerBase::beginFunction(const MachineFunction *MF) { +  assert(Asm); +  PrevInstBB = nullptr; + +  if (!hasDebugInfo(MMI, MF)) { +    skippedNonDebugFunction(); +    return; +  } +    // Grab the lexical scopes for the function, if we don't have any of those    // then we're not going to be able to do anything.    LScopes.initialize(*MF); -  if (LScopes.empty()) +  if (LScopes.empty()) { +    beginFunctionImpl(MF);      return; +  }    // Make sure that each lexical scope will have a begin/end label.    
identifyScopeMarkers(); @@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {    PrevInstLoc = DebugLoc();    PrevLabel = Asm->getFunctionBegin(); +  beginFunctionImpl(MF);  }  void DebugHandlerBase::beginInstruction(const MachineInstr *MI) { @@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() {  }  void DebugHandlerBase::endFunction(const MachineFunction *MF) { +  if (hasDebugInfo(MMI, MF)) +    endFunctionImpl(MF);    DbgValues.clear();    LabelsBeforeInsn.clear();    LabelsAfterInsn.clear(); diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h index c00fa189d94a..659a921e1fc5 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h @@ -80,6 +80,10 @@ protected:      LabelsAfterInsn.insert(std::make_pair(MI, nullptr));    } +  virtual void beginFunctionImpl(const MachineFunction *MF) = 0; +  virtual void endFunctionImpl(const MachineFunction *MF) = 0; +  virtual void skippedNonDebugFunction() {} +    // AsmPrinterHandler overrides.  public:    void beginInstruction(const MachineInstr *MI) override; diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h index 36fb1507ddc6..a68e8cc6b4b3 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -76,7 +76,8 @@ public:      const DIExpression *getExpression() const { return Expression; }      friend bool operator==(const Value &, const Value &);      friend bool operator<(const Value &, const Value &); -    void dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +    LLVM_DUMP_METHOD void dump() const {        if (isLocation()) {          llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";          if (Loc.isIndirect()) @@ -90,6 +91,7 @@ public:        if (Expression)          Expression->dump();      } +#endif    };  private: diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d904372af589..a550ff2fb90f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1,3 +1,16 @@ +//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for constructing a dwarf compile unit. +// +//===----------------------------------------------------------------------===// +  #include "DwarfCompileUnit.h"  #include "DwarfExpression.h"  #include "llvm/CodeGen/MachineFunction.h" @@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(    bool addToAccelTable = false;    DIELoc *Loc = nullptr;    std::unique_ptr<DIEDwarfExpression> DwarfExpr; -  bool AllConstant = std::all_of( -      GlobalExprs.begin(), GlobalExprs.end(), -      [&](const GlobalExpr GE) { -        return GE.Expr && GE.Expr->isConstant(); -      }); -    for (const auto &GE : GlobalExprs) {      const GlobalVariable *Global = GE.Var;      const DIExpression *Expr = GE.Expr; +      // For compatibility with DWARF 3 and earlier,      // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes      // DW_AT_const_value(X).      
if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) { +      addToAccelTable = true;        addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1)); -      // We cannot describe the location of dllimport'd variables: the -      // computation of their address requires loads from the IAT. -    } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) { -      if (!Loc) { -        Loc = new (DIEValueAllocator) DIELoc; -        DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); -      } +      break; +    } + +    // We cannot describe the location of dllimport'd variables: the +    // computation of their address requires loads from the IAT. +    if (Global && Global->hasDLLImportStorageClass()) +      continue; + +    // Nothing to describe without address or constant. +    if (!Global && (!Expr || !Expr->isConstant())) +      continue; + +    if (!Loc) {        addToAccelTable = true; -      if (Global) { -        const MCSymbol *Sym = Asm->getSymbol(Global); -        if (Global->isThreadLocal()) { -          if (Asm->TM.Options.EmulatedTLS) { -            // TODO: add debug info for emulated thread local mode. -          } else { -            // FIXME: Make this work with -gsplit-dwarf. -            unsigned PointerSize = Asm->getDataLayout().getPointerSize(); -            assert((PointerSize == 4 || PointerSize == 8) && -                   "Add support for other sizes if necessary"); -            // Based on GCC's support for TLS: -            if (!DD->useSplitDwarf()) { -              // 1) Start with a constNu of the appropriate pointer size -              addUInt(*Loc, dwarf::DW_FORM_data1, -                      PointerSize == 4 ? dwarf::DW_OP_const4u -                                       : dwarf::DW_OP_const8u); -              // 2) containing the (relocated) offset of the TLS variable -              //    within the module's TLS block. -              addExpr(*Loc, dwarf::DW_FORM_udata, -                      Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); -            } else { -              addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); -              addUInt(*Loc, dwarf::DW_FORM_udata, -                      DD->getAddressPool().getIndex(Sym, /* TLS */ true)); -            } -            // 3) followed by an OP to make the debugger do a TLS lookup. +      Loc = new (DIEValueAllocator) DIELoc; +      DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); +    } + +    if (Global) { +      const MCSymbol *Sym = Asm->getSymbol(Global); +      if (Global->isThreadLocal()) { +        if (Asm->TM.Options.EmulatedTLS) { +          // TODO: add debug info for emulated thread local mode. +        } else { +          // FIXME: Make this work with -gsplit-dwarf. +          unsigned PointerSize = Asm->getDataLayout().getPointerSize(); +          assert((PointerSize == 4 || PointerSize == 8) && +                 "Add support for other sizes if necessary"); +          // Based on GCC's support for TLS: +          if (!DD->useSplitDwarf()) { +            // 1) Start with a constNu of the appropriate pointer size              addUInt(*Loc, dwarf::DW_FORM_data1, -                    DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address -                                          : dwarf::DW_OP_form_tls_address); +                    PointerSize == 4 ? 
dwarf::DW_OP_const4u +                                     : dwarf::DW_OP_const8u); +            // 2) containing the (relocated) offset of the TLS variable +            //    within the module's TLS block. +            addExpr(*Loc, dwarf::DW_FORM_udata, +                    Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); +          } else { +            addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); +            addUInt(*Loc, dwarf::DW_FORM_udata, +                    DD->getAddressPool().getIndex(Sym, /* TLS */ true));            } -        } else { -          DD->addArangeLabel(SymbolCU(this, Sym)); -          addOpAddress(*Loc, Sym); +          // 3) followed by an OP to make the debugger do a TLS lookup. +          addUInt(*Loc, dwarf::DW_FORM_data1, +                  DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address +                                        : dwarf::DW_OP_form_tls_address);          } +      } else { +        DD->addArangeLabel(SymbolCU(this, Sym)); +        addOpAddress(*Loc, Sym);        } -      if (Expr) { -        DwarfExpr->addFragmentOffset(Expr); -        DwarfExpr->AddExpression(Expr); -      } +    } +    if (Expr) { +      DwarfExpr->addFragmentOffset(Expr); +      DwarfExpr->addExpression(Expr);      }    }    if (Loc) @@ -507,8 +525,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,          DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);          // If there is an expression, emit raw unsigned bytes.          DwarfExpr.addFragmentOffset(Expr); -        DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm()); -        DwarfExpr.AddExpression(Expr); +        DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm()); +        DwarfExpr.addExpression(Expr);          addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());        } else          addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); @@ -532,9 +550,15 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,      const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();      int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);      DwarfExpr.addFragmentOffset(Fragment.Expr); -    DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), -                                    FrameReg, Offset); -    DwarfExpr.AddExpression(Fragment.Expr); +    SmallVector<uint64_t, 8> Ops; +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(Offset); +    Ops.push_back(dwarf::DW_OP_deref); +    Ops.append(Fragment.Expr->elements_begin(), Fragment.Expr->elements_end()); +    DIExpressionCursor Expr(Ops); +    DwarfExpr.addMachineRegExpression( +        *Asm->MF->getSubtarget().getRegisterInfo(), Expr, FrameReg); +    DwarfExpr.addExpression(std::move(Expr));    }    addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); @@ -690,11 +714,14 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {      Asm->OutStreamer->EmitLabel(LabelBegin);    } -  DwarfUnit::emitHeader(UseOffsets); +  dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile +                                : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton +                                                      : dwarf::DW_UT_compile; +  DwarfUnit::emitCommonHeader(UseOffsets, UT);  }  /// addGlobalName - Add a new global name to the compile unit. 
-void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die, +void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die,                                       const DIScope *Context) {    if (includeMinimalInlineScopes())      return; @@ -702,6 +729,18 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,    GlobalNames[FullName] = &Die;  } +void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, +                                                const DIScope *Context) { +  if (includeMinimalInlineScopes()) +    return; +  std::string FullName = getParentContextString(Context) + Name.str(); +  // Insert, allowing the entry to remain as-is if it's already present +  // This way the CU-level type DIE is preferred over the "can't describe this +  // type as a unit offset because it's not really in the CU at all, it's only +  // in a type unit" +  GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} +  /// Add a new global type to the unit.  void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,                                       const DIScope *Context) { @@ -711,6 +750,18 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,    GlobalTypes[FullName] = &Die;  } +void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, +                                             const DIScope *Context) { +  if (includeMinimalInlineScopes()) +    return; +  std::string FullName = getParentContextString(Context) + Ty->getName().str(); +  // Insert, allowing the entry to remain as-is if it's already present +  // This way the CU-level type DIE is preferred over the "can't describe this +  // type as a unit offset because it's not really in the CU at all, it's only +  // in a type unit" +  GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} +  /// addVariableAddress - Add DW_AT_location attribute for a  /// DbgVariable based on provided MachineLocation.  void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, @@ -727,22 +778,22 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,  void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,                                    const MachineLocation &Location) {    DIELoc *Loc = new (DIEValueAllocator) DIELoc; -  DIEDwarfExpression Expr(*Asm, *this, *Loc); - -  bool validReg; -  if (Location.isReg()) -    validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), -                                  Location.getReg()); -  else -    validReg = -        Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), -                                   Location.getReg(), Location.getOffset()); +  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); -  if (!validReg) +  SmallVector<uint64_t, 8> Ops; +  if (Location.isIndirect()) { +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(Location.getOffset()); +    Ops.push_back(dwarf::DW_OP_deref); +  } +  DIExpressionCursor Cursor(Ops); +  const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); +  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))      return; +  DwarfExpr.addExpression(std::move(Cursor));    // Now attach the location information to the DIE. 
-  addBlock(Die, Attribute, Expr.finalize()); +  addBlock(Die, Attribute, DwarfExpr.finalize());  }  /// Start with the address based on the location provided, and generate the @@ -754,23 +805,24 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,                                           const MachineLocation &Location) {    DIELoc *Loc = new (DIEValueAllocator) DIELoc;    DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); -  const DIExpression *Expr = DV.getSingleExpression(); -  DIExpressionCursor ExprCursor(Expr); +  const DIExpression *DIExpr = DV.getSingleExpression(); +  DwarfExpr.addFragmentOffset(DIExpr); + +  SmallVector<uint64_t, 8> Ops; +  if (Location.isIndirect()) { +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(Location.getOffset()); +    Ops.push_back(dwarf::DW_OP_deref); +  } +  Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); +  DIExpressionCursor Cursor(Ops);    const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); -  auto Reg = Location.getReg(); -  DwarfExpr.addFragmentOffset(Expr); -  bool ValidReg = -      Location.getOffset() -          ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset()) -          : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg); - -  if (!ValidReg) +  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))      return; - -  DwarfExpr.AddExpression(std::move(ExprCursor)); +  DwarfExpr.addExpression(std::move(Cursor));    // Now attach the location information to the DIE. -  addBlock(Die, Attribute, Loc); +  addBlock(Die, Attribute, DwarfExpr.finalize());  }  /// Add a Dwarf loclistptr attribute data and value. diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index a8025f1d1521..9a64b4b76b06 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -210,12 +210,19 @@ public:    }    /// Add a new global name to the compile unit. -  void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override; +  void addGlobalName(StringRef Name, const DIE &Die, +                     const DIScope *Context) override; + +  /// Add a new global name present in a type unit to this compile unit. +  void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);    /// Add a new global type to the compile unit.    void addGlobalType(const DIType *Ty, const DIE &Die,                       const DIScope *Context) override; +  /// Add a new global type present in a type unit to this compile unit. 
+  void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context); +    const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }    const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 91a3d0989cc5..5ce111309208 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -39,7 +39,6 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/Dwarf.h" -#include "llvm/Support/Endian.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/FormattedStream.h"  #include "llvm/Support/LEB128.h" @@ -127,17 +126,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission";  static const char *const DbgTimerName = "writer";  static const char *const DbgTimerDescription = "DWARF Debug Writer"; -void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) { +void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {    BS.EmitInt8(        Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)                    : dwarf::OperationEncodingString(Op));  } -void DebugLocDwarfExpression::EmitSigned(int64_t Value) { +void DebugLocDwarfExpression::emitSigned(int64_t Value) {    BS.EmitSLEB128(Value, Twine(Value));  } -void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) { +void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {    BS.EmitULEB128(Value, Twine(Value));  } @@ -200,6 +199,12 @@ const DIType *DbgVariable::getType() const {  }  ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const { +  if (FrameIndexExprs.size() == 1) +    return FrameIndexExprs; + +  assert(all_of(FrameIndexExprs, +                [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) && +         "multiple FI expressions without DW_OP_LLVM_fragment");    std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),              [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {                return A.Expr->getFragmentInfo()->OffsetInBits < @@ -418,7 +423,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {      Asm->OutStreamer->getContext().setMCLineTableCompilationDir(          NewCU.getUniqueID(), CompilationDir); -  NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer()); +  StringRef Producer = DIUnit->getProducer(); +  StringRef Flags = DIUnit->getFlags(); +  if (!Flags.empty()) { +    std::string ProducerWithFlags = Producer.str() + " " + Flags.str(); +    NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags); +  } else +    NewCU.addString(Die, dwarf::DW_AT_producer, Producer); +    NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,                  DIUnit->getSourceLanguage());    NewCU.addString(Die, dwarf::DW_AT_name, FN); @@ -544,7 +556,6 @@ void DwarfDebug::beginModule() {        // The retained types array by design contains pointers to        // MDNodes rather than DIRefs. Unique them here.        if (DIType *RT = dyn_cast<DIType>(Ty)) -        if (!RT->isExternalTypeRef())            // There is no point in force-emitting a forward declaration.            
CU.getOrCreateTypeDIE(RT);      } @@ -740,6 +751,7 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {  void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,                                          LexicalScope *Scope) { +  assert(Scope && Scope->isAbstractScope());    auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);    InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());    AbstractVariables[Var] = std::move(AbsDbgVariable); @@ -1137,20 +1149,9 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {  // Gather pre-function debug information.  Assumes being called immediately  // after the function entry point has been emitted. -void DwarfDebug::beginFunction(const MachineFunction *MF) { +void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {    CurFn = MF; -  // If there's no debug info for the function we're not going to do anything. -  if (!MMI->hasDebugInfo()) -    return; - -  auto DI = MF->getFunction()->getSubprogram(); -  if (!DI) -    return; - -  // Grab the lexical scopes for the function, if we don't have any of those -  // then we're not going to be able to do anything. -  DebugHandlerBase::beginFunction(MF);    if (LScopes.empty())      return; @@ -1189,23 +1190,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {    }  } +void DwarfDebug::skippedNonDebugFunction() { +  // If we don't have a subprogram for this function then there will be a hole +  // in the range information. Keep note of this by setting the previously used +  // section to nullptr. +  PrevCU = nullptr; +  CurFn = nullptr; +} +  // Gather and emit post-function debug information. -void DwarfDebug::endFunction(const MachineFunction *MF) { +void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { +  const DISubprogram *SP = MF->getFunction()->getSubprogram(); +    assert(CurFn == MF &&        "endFunction should be called with the same function as beginFunction"); -  const DISubprogram *SP = MF->getFunction()->getSubprogram(); -  if (!MMI->hasDebugInfo() || !SP || -      SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) { -    // If we don't have a subprogram for this function then there will be a hole -    // in the range information. Keep note of this by setting the previously -    // used section to nullptr. -    PrevCU = nullptr; -    CurFn = nullptr; -    DebugHandlerBase::endFunction(MF); -    return; -  } -    // Set DwarfDwarfCompileUnitID in MCContext to default value.    Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); @@ -1220,17 +1219,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {    TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));    // Under -gmlt, skip building the subprogram if there are no inlined -  // subroutines inside it. -  if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && +  // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram +  // is still needed as we need its source location. +  if (!TheCU.getCUNode()->getDebugInfoForProfiling() && +      TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&        LScopes.getAbstractScopesList().empty() && !IsDarwin) {      assert(InfoHolder.getScopeVariables().empty()); -    assert(DbgValues.empty()); -    // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed -    // by a -gmlt CU. Add a test and remove this assertion. 
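The beginFunction/endFunction overrides become beginFunctionImpl/endFunctionImpl because the "does this function have debug info at all" checks move into the shared DebugHandlerBase entry points, which now call skippedNonDebugFunction() for the uninteresting case. A rough model of that dispatch, assuming a simplified interface rather than the actual DebugHandlerBase declaration:

// Simplified model of the base-class dispatch, not the real LLVM class.
class DebugHandlerModel {
public:
  virtual ~DebugHandlerModel() = default;

  // The shared entry point now owns the check each handler used to duplicate.
  void beginFunction(bool HasDebugInfo) {
    if (!HasDebugInfo) {
      skippedNonDebugFunction();
      return;
    }
    beginFunctionImpl();
  }

protected:
  virtual void beginFunctionImpl() = 0;
  virtual void skippedNonDebugFunction() {}
};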
-    assert(AbstractVariables.empty());      PrevLabel = nullptr;      CurFn = nullptr; -    DebugHandlerBase::endFunction(MF);      return;    } @@ -1266,7 +1262,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {    InfoHolder.getScopeVariables().clear();    PrevLabel = nullptr;    CurFn = nullptr; -  DebugHandlerBase::endFunction(MF);  }  // Register a source line with debug info. Returns the  unique label that was @@ -1361,6 +1356,18 @@ void DwarfDebug::emitAccelTypes() {  /// computeIndexValue - Compute the gdb index value for the DIE and CU.  static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,                                                          const DIE *Die) { +  // Entities that ended up only in a Type Unit reference the CU instead (since +  // the pub entry has offsets within the CU there's no real offset that can be +  // provided anyway). As it happens all such entities (namespaces and types, +  // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out +  // not to be true it would be necessary to persist this information from the +  // point at which the entry is added to the index data structure - since by +  // the time the index is built from that, the original type/namespace DIE in a +  // type unit has already been destroyed so it can't be queried for properties +  // like tag, etc. +  if (Die->getTag() == dwarf::DW_TAG_compile_unit) +    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, +                                          dwarf::GIEL_EXTERNAL);    dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;    // We could have a specification DIE that has our most of our knowledge, @@ -1498,27 +1505,37 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,                                ByteStreamer &Streamer,                                const DebugLocEntry::Value &Value,                                DwarfExpression &DwarfExpr) { -  DIExpressionCursor ExprCursor(Value.getExpression()); -  DwarfExpr.addFragmentOffset(Value.getExpression()); +  auto *DIExpr = Value.getExpression(); +  DIExpressionCursor ExprCursor(DIExpr); +  DwarfExpr.addFragmentOffset(DIExpr);    // Regular entry.    if (Value.isInt()) {      if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||                 BT->getEncoding() == dwarf::DW_ATE_signed_char)) -      DwarfExpr.AddSignedConstant(Value.getInt()); +      DwarfExpr.addSignedConstant(Value.getInt());      else -      DwarfExpr.AddUnsignedConstant(Value.getInt()); +      DwarfExpr.addUnsignedConstant(Value.getInt());    } else if (Value.isLocation()) { -    MachineLocation Loc = Value.getLoc(); +    MachineLocation Location = Value.getLoc(); + +    SmallVector<uint64_t, 8> Ops; +    // FIXME: Should this condition be Location.isIndirect() instead? 
+    if (Location.getOffset()) { +      Ops.push_back(dwarf::DW_OP_plus); +      Ops.push_back(Location.getOffset()); +      Ops.push_back(dwarf::DW_OP_deref); +    } +    Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); +    DIExpressionCursor Cursor(Ops);      const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); -    if (Loc.getOffset()) -      DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset()); -    else -      DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg()); +    if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) +      return; +    return DwarfExpr.addExpression(std::move(Cursor));    } else if (Value.isConstantFP()) {      APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt(); -    DwarfExpr.AddUnsignedConstant(RawBytes); +    DwarfExpr.addUnsignedConstant(RawBytes);    } -  DwarfExpr.AddExpression(std::move(ExprCursor)); +  DwarfExpr.addExpression(std::move(ExprCursor));  }  void DebugLocEntry::finalize(const AsmPrinter &AP, @@ -1940,11 +1957,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {    MD5 Hash;    Hash.update(Identifier);    // ... take the least significant 8 bytes and return those. Our MD5 -  // implementation always returns its results in little endian, swap bytes -  // appropriately. +  // implementation always returns its results in little endian, so we actually +  // need the "high" word.    MD5::MD5Result Result;    Hash.final(Result); -  return support::endian::read64le(Result + 8); +  return Result.high();  }  void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 253e3f06200e..8a96e7867b6e 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -89,7 +89,7 @@ public:      assert(!MInsn && "Already initialized?");      assert((!E || E->isValid()) && "Expected valid expression"); -    assert(~FI && "Expected valid index"); +    assert(FI != INT_MAX && "Expected valid index");      FrameIndexExprs.push_back({FI, E});    } @@ -448,6 +448,15 @@ class DwarfDebug : public DebugHandlerBase {    /// Collect variable information from the side table maintained by MF.    void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P); +protected: +  /// Gather pre-function debug information. +  void beginFunctionImpl(const MachineFunction *MF) override; + +  /// Gather and emit post-function debug information. +  void endFunctionImpl(const MachineFunction *MF) override; + +  void skippedNonDebugFunction() override; +  public:    //===--------------------------------------------------------------------===//    // Main entry points. @@ -463,12 +472,6 @@ public:    /// Emit all Dwarf sections that should come after the content.    void endModule() override; -  /// Gather pre-function debug information. -  void beginFunction(const MachineFunction *MF) override; - -  /// Gather and emit post-function debug information. -  void endFunction(const MachineFunction *MF) override; -    /// Process beginning of an instruction.    
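makeTypeSignature above now asks MD5Result for its high 64-bit word; since the digest is stored in little-endian order, that is the same value the removed read64le(Result + 8) produced, namely the last eight bytes of the hash. A standalone sketch of the computation on a raw 16-byte digest:

#include <cstdint>

// Return the DWARF type signature for a 16-byte MD5 digest: the last eight
// bytes, read as a little-endian 64-bit value.
uint64_t typeSignature(const uint8_t Digest[16]) {
  uint64_t Signature = 0;
  for (unsigned I = 0; I != 8; ++I)
    Signature |= uint64_t(Digest[8 + I]) << (8 * I);
  return Signature;
}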
void beginInstruction(const MachineInstr *MI) override; diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 61b2c7e65842..debe88f3b1ee 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -22,79 +22,76 @@  using namespace llvm; -void DwarfExpression::AddReg(int DwarfReg, const char *Comment) { +void DwarfExpression::addReg(int DwarfReg, const char *Comment) {    assert(DwarfReg >= 0 && "invalid negative dwarf register number");    if (DwarfReg < 32) { -    EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); +    emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);    } else { -    EmitOp(dwarf::DW_OP_regx, Comment); -    EmitUnsigned(DwarfReg); +    emitOp(dwarf::DW_OP_regx, Comment); +    emitUnsigned(DwarfReg);    }  } -void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) { +void DwarfExpression::addBReg(int DwarfReg, int Offset) {    assert(DwarfReg >= 0 && "invalid negative dwarf register number");    if (DwarfReg < 32) { -    EmitOp(dwarf::DW_OP_breg0 + DwarfReg); +    emitOp(dwarf::DW_OP_breg0 + DwarfReg);    } else { -    EmitOp(dwarf::DW_OP_bregx); -    EmitUnsigned(DwarfReg); +    emitOp(dwarf::DW_OP_bregx); +    emitUnsigned(DwarfReg);    } -  EmitSigned(Offset); -  if (Deref) -    EmitOp(dwarf::DW_OP_deref); +  emitSigned(Offset);  } -void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { +void DwarfExpression::addFBReg(int Offset) { +  emitOp(dwarf::DW_OP_fbreg); +  emitSigned(Offset); +} + +void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {    if (!SizeInBits)      return;    const unsigned SizeOfByte = 8;    if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { -    EmitOp(dwarf::DW_OP_bit_piece); -    EmitUnsigned(SizeInBits); -    EmitUnsigned(OffsetInBits); +    emitOp(dwarf::DW_OP_bit_piece); +    emitUnsigned(SizeInBits); +    emitUnsigned(OffsetInBits);    } else { -    EmitOp(dwarf::DW_OP_piece); +    emitOp(dwarf::DW_OP_piece);      unsigned ByteSize = SizeInBits / SizeOfByte; -    EmitUnsigned(ByteSize); +    emitUnsigned(ByteSize);    }    this->OffsetInBits += SizeInBits;  } -void DwarfExpression::AddShr(unsigned ShiftBy) { -  EmitOp(dwarf::DW_OP_constu); -  EmitUnsigned(ShiftBy); -  EmitOp(dwarf::DW_OP_shr); +void DwarfExpression::addShr(unsigned ShiftBy) { +  emitOp(dwarf::DW_OP_constu); +  emitUnsigned(ShiftBy); +  emitOp(dwarf::DW_OP_shr);  } -bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI, -                                            unsigned MachineReg, int Offset) { -  if (isFrameRegister(TRI, MachineReg)) { -    // If variable offset is based in frame register then use fbreg. 
-    EmitOp(dwarf::DW_OP_fbreg); -    EmitSigned(Offset); -    return true; -  } - -  int DwarfReg = TRI.getDwarfRegNum(MachineReg, false); -  if (DwarfReg < 0) -    return false; - -  AddRegIndirect(DwarfReg, Offset); -  return true; +void DwarfExpression::addAnd(unsigned Mask) { +  emitOp(dwarf::DW_OP_constu); +  emitUnsigned(Mask); +  emitOp(dwarf::DW_OP_and);  } -bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,                                      unsigned MachineReg, unsigned MaxSize) { -  if (!TRI.isPhysicalRegister(MachineReg)) +  if (!TRI.isPhysicalRegister(MachineReg)) { +    if (isFrameRegister(TRI, MachineReg)) { +      DwarfRegs.push_back({-1, 0, nullptr}); +      return true; +    }      return false; +  }    int Reg = TRI.getDwarfRegNum(MachineReg, false);    // If this is a valid register number, emit it.    if (Reg >= 0) { -    AddReg(Reg); +    DwarfRegs.push_back({Reg, 0, nullptr});      return true;    } @@ -106,7 +103,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,        unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);        unsigned Size = TRI.getSubRegIdxSize(Idx);        unsigned RegOffset = TRI.getSubRegIdxOffset(Idx); -      AddReg(Reg, "super-register"); +      DwarfRegs.push_back({Reg, 0, "super-register"});        // Use a DW_OP_bit_piece to describe the sub-register.        setSubRegisterPiece(Size, RegOffset);        return true; @@ -136,72 +133,101 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,      // If this sub-register has a DWARF number and we haven't covered      // its range, emit a DWARF piece for it.      if (Reg >= 0 && Intersection.any()) { -      AddReg(Reg, "sub-register"); +      // Emit a piece for any gap in the coverage. +      if (Offset > CurPos) +        DwarfRegs.push_back({-1, Offset - CurPos, nullptr}); +      DwarfRegs.push_back( +          {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});        if (Offset >= MaxSize)  	break; -      // Emit a piece for the any gap in the coverage. -      if (Offset > CurPos) -        AddOpPiece(Offset - CurPos); -      AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset)); -      CurPos = Offset + Size;        // Mark it as emitted.        Coverage.set(Offset, Offset + Size); +      CurPos = Offset + Size;      }    }    return CurPos;  } -void DwarfExpression::AddStackValue() { +void DwarfExpression::addStackValue() {    if (DwarfVersion >= 4) -    EmitOp(dwarf::DW_OP_stack_value); +    emitOp(dwarf::DW_OP_stack_value);  } -void DwarfExpression::AddSignedConstant(int64_t Value) { -  EmitOp(dwarf::DW_OP_consts); -  EmitSigned(Value); -  AddStackValue(); +void DwarfExpression::addSignedConstant(int64_t Value) { +  emitOp(dwarf::DW_OP_consts); +  emitSigned(Value); +  addStackValue();  } -void DwarfExpression::AddUnsignedConstant(uint64_t Value) { -  EmitOp(dwarf::DW_OP_constu); -  EmitUnsigned(Value); -  AddStackValue(); +void DwarfExpression::addUnsignedConstant(uint64_t Value) { +  emitOp(dwarf::DW_OP_constu); +  emitUnsigned(Value); +  addStackValue();  } -void DwarfExpression::AddUnsignedConstant(const APInt &Value) { +void DwarfExpression::addUnsignedConstant(const APInt &Value) {    unsigned Size = Value.getBitWidth();    const uint64_t *Data = Value.getRawData();    // Chop it up into 64-bit pieces, because that's the maximum that -  // AddUnsignedConstant takes. +  // addUnsignedConstant takes.    
unsigned Offset = 0;    while (Offset < Size) { -    AddUnsignedConstant(*Data++); +    addUnsignedConstant(*Data++);      if (Offset == 0 && Size <= 64)        break; -    AddOpPiece(std::min(Size-Offset, 64u), Offset); +    addOpPiece(std::min(Size-Offset, 64u), Offset);      Offset += 64;    }  } -bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,                                                DIExpressionCursor &ExprCursor,                                                unsigned MachineReg,                                                unsigned FragmentOffsetInBits) { -  if (!ExprCursor) -    return AddMachineReg(TRI, MachineReg); +  auto Fragment = ExprCursor.getFragmentInfo(); +  if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) +    return false; -  // Pattern-match combinations for which more efficient representations exist -  // first. -  bool ValidReg = false; +  bool HasComplexExpression = false;    auto Op = ExprCursor.peek(); +  if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment) +    HasComplexExpression = true; + +  // If the register can only be described by a complex expression (i.e., +  // multiple subregisters) it doesn't safely compose with another complex +  // expression. For example, it is not possible to apply a DW_OP_deref +  // operation to multiple DW_OP_pieces. +  if (HasComplexExpression && DwarfRegs.size() > 1) { +    DwarfRegs.clear(); +    return false; +  } + +  // Handle simple register locations. +  if (!HasComplexExpression) { +    for (auto &Reg : DwarfRegs) { +      if (Reg.DwarfRegNo >= 0) +        addReg(Reg.DwarfRegNo, Reg.Comment); +      addOpPiece(Reg.Size); +    } +    DwarfRegs.clear(); +    return true; +  } + +  assert(DwarfRegs.size() == 1); +  auto Reg = DwarfRegs[0]; +  bool FBReg = isFrameRegister(TRI, MachineReg);  +  assert(Reg.Size == 0 && "subregister has same size as superregister"); + +  // Pattern-match combinations for which more efficient representations exist.    switch (Op->getOp()) {    default: { -    auto Fragment = ExprCursor.getFragmentInfo(); -    ValidReg = AddMachineReg(TRI, MachineReg, -			     Fragment ? Fragment->SizeInBits : ~1U); +    if (FBReg) +      addFBReg(0); +    else +      addReg(Reg.DwarfRegNo, 0);      break;    }    case dwarf::DW_OP_plus: @@ -210,28 +236,42 @@ bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,      // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset].      auto N = ExprCursor.peekNext();      if (N && N->getOp() == dwarf::DW_OP_deref) { -      unsigned Offset = Op->getArg(0); -      ValidReg = AddMachineRegIndirect( -          TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); +      int Offset = Op->getArg(0); +      int SignedOffset = (Op->getOp() == dwarf::DW_OP_plus) ? Offset : -Offset; +      if (FBReg) +        addFBReg(SignedOffset); +      else +        addBReg(Reg.DwarfRegNo, SignedOffset); +        ExprCursor.consume(2); -    } else -      ValidReg = AddMachineReg(TRI, MachineReg); +      break; +    } +    addReg(Reg.DwarfRegNo, 0);      break;    }    case dwarf::DW_OP_deref:      // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg]. 
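The peephole above folds a leading {DW_OP_plus, Offset, DW_OP_deref} (or the DW_OP_minus form with a negated offset) applied to a register location into a single based-register operation. A minimal sketch of the folded output, assuming a DWARF register number below 32 and leaving LEB128 encoding to the emitters (the function is illustrative, not LLVM API):

#include <cstdint>
#include <vector>

enum : uint64_t { DW_OP_breg0 = 0x70, DW_OP_fbreg = 0x91 };

std::vector<uint64_t> foldToBaseReg(bool IsFrameReg, unsigned DwarfReg,
                                    int64_t SignedOffset) {
  std::vector<uint64_t> Out;
  if (IsFrameReg)
    Out.push_back(DW_OP_fbreg);               // frame-based variable
  else
    Out.push_back(DW_OP_breg0 + DwarfReg);    // assumes DwarfReg < 32
  Out.push_back(static_cast<uint64_t>(SignedOffset));
  return Out; // e.g. reg 5, offset 16 -> {DW_OP_breg5, 16}
}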
-    ValidReg = AddMachineRegIndirect(TRI, MachineReg); +    if (FBReg) +      addFBReg(0); +    else +      addBReg(Reg.DwarfRegNo, 0);      ExprCursor.take();      break;    } - -  return ValidReg; +  DwarfRegs.clear(); +  return true;  } -void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,                                      unsigned FragmentOffsetInBits) {    while (ExprCursor) {      auto Op = ExprCursor.take(); + +    // If we need to mask out a subregister, do it now, unless the next +    // operation would emit an OpPiece anyway. +    if (SubRegisterSizeInBits && Op->getOp() != dwarf::DW_OP_LLVM_fragment) +      maskSubRegister(); +      switch (Op->getOp()) {      case dwarf::DW_OP_LLVM_fragment: {        unsigned SizeInBits = Op->getArg(1); @@ -241,39 +281,45 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,        // location.        assert(OffsetInBits >= FragmentOffset && "fragment offset not added?"); -      // If \a AddMachineReg already emitted DW_OP_piece operations to represent +      // If \a addMachineReg already emitted DW_OP_piece operations to represent        // a super-register by splicing together sub-registers, subtract the size        // of the pieces that was already emitted.        SizeInBits -= OffsetInBits - FragmentOffset; -      // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a +      // If \a addMachineReg requested a DW_OP_bit_piece to stencil out a        // sub-register that is smaller than the current fragment's size, use it.        if (SubRegisterSizeInBits)          SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits); -      AddOpPiece(SizeInBits, SubRegisterOffsetInBits); +      addOpPiece(SizeInBits, SubRegisterOffsetInBits);        setSubRegisterPiece(0, 0);        break;      }      case dwarf::DW_OP_plus: -      EmitOp(dwarf::DW_OP_plus_uconst); -      EmitUnsigned(Op->getArg(0)); +      emitOp(dwarf::DW_OP_plus_uconst); +      emitUnsigned(Op->getArg(0));        break;      case dwarf::DW_OP_minus:        // There is no OP_minus_uconst. -      EmitOp(dwarf::DW_OP_constu); -      EmitUnsigned(Op->getArg(0)); -      EmitOp(dwarf::DW_OP_minus); +      emitOp(dwarf::DW_OP_constu); +      emitUnsigned(Op->getArg(0)); +      emitOp(dwarf::DW_OP_minus);        break;      case dwarf::DW_OP_deref: -      EmitOp(dwarf::DW_OP_deref); +      emitOp(dwarf::DW_OP_deref);        break;      case dwarf::DW_OP_constu: -      EmitOp(dwarf::DW_OP_constu); -      EmitUnsigned(Op->getArg(0)); +      emitOp(dwarf::DW_OP_constu); +      emitUnsigned(Op->getArg(0));        break;      case dwarf::DW_OP_stack_value: -      AddStackValue(); +      addStackValue(); +      break; +    case dwarf::DW_OP_swap: +      emitOp(dwarf::DW_OP_swap); +      break; +    case dwarf::DW_OP_xderef: +      emitOp(dwarf::DW_OP_xderef);        break;      default:        llvm_unreachable("unhandled opcode found in expression"); @@ -281,9 +327,25 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,    }  } +/// add masking operations to stencil out a subregister. 
+void DwarfExpression::maskSubRegister() { +  assert(SubRegisterSizeInBits && "no subregister was registered"); +  if (SubRegisterOffsetInBits > 0) +    addShr(SubRegisterOffsetInBits); +  uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL; +  addAnd(Mask); +} + +  void DwarfExpression::finalize() { -  if (SubRegisterSizeInBits) -    AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); +  assert(DwarfRegs.size() == 0 && "dwarf registers not emitted"); +  // Emit any outstanding DW_OP_piece operations to mask out subregisters. +  if (SubRegisterSizeInBits == 0) +    return; +  // Don't emit a DW_OP_piece for a subregister at offset 0. +  if (SubRegisterOffsetInBits == 0) +    return; +  addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);  }  void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { @@ -294,6 +356,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {    assert(FragmentOffset >= OffsetInBits &&           "overlapping or duplicate fragments");    if (FragmentOffset > OffsetInBits) -    AddOpPiece(FragmentOffset - OffsetInBits); +    addOpPiece(FragmentOffset - OffsetInBits);    OffsetInBits = FragmentOffset;  } diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h index fd90fa05bc32..e8dc211eb3c2 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -84,9 +84,19 @@ public:  /// entry.  class DwarfExpression {  protected: -  unsigned DwarfVersion; +  /// Holds information about all subregisters comprising a register location. +  struct Register { +    int DwarfRegNo; +    unsigned Size; +    const char *Comment; +  }; + +  /// The register location, if any. +  SmallVector<Register, 2> DwarfRegs; +    /// Current Fragment Offset in Bits.    uint64_t OffsetInBits = 0; +  unsigned DwarfVersion;    /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister.     unsigned SubRegisterSizeInBits = 0; @@ -99,35 +109,54 @@ protected:      SubRegisterOffsetInBits = OffsetInBits;    } -public: -  DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} -  virtual ~DwarfExpression() {}; - -  /// This needs to be called last to commit any pending changes. -  void finalize(); +  /// Add masking operations to stencil out a subregister. +  void maskSubRegister();    /// Output a dwarf operand and an optional assembler comment. -  virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0; +  virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0;    /// Emit a raw signed value. -  virtual void EmitSigned(int64_t Value) = 0; +  virtual void emitSigned(int64_t Value) = 0;    /// Emit a raw unsigned value. -  virtual void EmitUnsigned(uint64_t Value) = 0; +  virtual void emitUnsigned(uint64_t Value) = 0;    /// Return whether the given machine register is the frame register in the    /// current function.    virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0; -  /// Emit a dwarf register operation. -  void AddReg(int DwarfReg, const char *Comment = nullptr); -  /// Emit an (double-)indirect dwarf register operation. -  void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false); +  /// Emit a DW_OP_reg operation. +  void addReg(int DwarfReg, const char *Comment = nullptr); +  /// Emit a DW_OP_breg operation. +  void addBReg(int DwarfReg, int Offset); +  /// Emit DW_OP_fbreg <Offset>. +  void addFBReg(int Offset); + +  /// Emit a partial DWARF register operation. 
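maskSubRegister() above replaces the old trailing DW_OP_bit_piece with explicit shift-and-mask arithmetic on the DWARF stack. A standalone sketch of the emitted sequence for a subregister described by (offset, size) in bits, assuming size < 64 as in the patch:

#include <cstdint>
#include <vector>

enum : uint64_t { DW_OP_constu = 0x10, DW_OP_and = 0x1a, DW_OP_shr = 0x25 };

std::vector<uint64_t> maskSubRegisterOps(unsigned OffsetInBits,
                                         unsigned SizeInBits) {
  std::vector<uint64_t> Ops;
  if (OffsetInBits > 0) {
    Ops.push_back(DW_OP_constu);              // shift the subregister down first
    Ops.push_back(OffsetInBits);
    Ops.push_back(DW_OP_shr);
  }
  Ops.push_back(DW_OP_constu);                // then mask off the high bits
  Ops.push_back((1ULL << SizeInBits) - 1ULL); // e.g. 0xFFFF for a 16-bit subreg
  Ops.push_back(DW_OP_and);
  return Ops;
}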
+  /// +  /// \param MachineReg           The register number. +  /// \param MaxSize              If the register must be composed from +  ///                             sub-registers this is an upper bound +  ///                             for how many bits the emitted DW_OP_piece +  ///                             may cover. +  /// +  /// If size and offset is zero an operation for the entire register is +  /// emitted: Some targets do not provide a DWARF register number for every +  /// register.  If this is the case, this function will attempt to emit a DWARF +  /// register by emitting a fragment of a super-register or by piecing together +  /// multiple subregisters that alias the register. +  /// +  /// \return false if no DWARF register exists for MachineReg. +  bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, +                     unsigned MaxSize = ~1U); +    /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment.    /// \param OffsetInBits    This is an optional offset into the location that    /// is at the top of the DWARF stack. -  void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); +  void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); -  /// Emit a shift-right dwarf expression. -  void AddShr(unsigned ShiftBy); +  /// Emit a shift-right dwarf operation. +  void addShr(unsigned ShiftBy); +  /// Emit a bitwise and dwarf operation. +  void addAnd(unsigned Mask);    /// Emit a DW_OP_stack_value, if supported.    /// @@ -140,37 +169,21 @@ public:    /// constant value, so the producers and consumers started to rely on    /// heuristics to disambiguate the value vs. location status of the    /// expression.  See PR21176 for more details. -  void AddStackValue(); +  void addStackValue(); -  /// Emit an indirect dwarf register operation for the given machine register. -  /// \return false if no DWARF register exists for MachineReg. -  bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg, -                             int Offset = 0); +  ~DwarfExpression() = default; +public: +  DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} -  /// Emit a partial DWARF register operation. -  /// -  /// \param MachineReg           The register number. -  /// \param MaxSize              If the register must be composed from -  ///                             sub-registers this is an upper bound -  ///                             for how many bits the emitted DW_OP_piece -  ///                             may cover. -  /// -  /// If size and offset is zero an operation for the entire register is -  /// emitted: Some targets do not provide a DWARF register number for every -  /// register.  If this is the case, this function will attempt to emit a DWARF -  /// register by emitting a fragment of a super-register or by piecing together -  /// multiple subregisters that alias the register. -  /// -  /// \return false if no DWARF register exists for MachineReg. -  bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, -                     unsigned MaxSize = ~1U); +  /// This needs to be called last to commit any pending changes. +  void finalize();    /// Emit a signed constant. -  void AddSignedConstant(int64_t Value); +  void addSignedConstant(int64_t Value);    /// Emit an unsigned constant. -  void AddUnsignedConstant(uint64_t Value); +  void addUnsignedConstant(uint64_t Value);    /// Emit an unsigned constant. 
-  void AddUnsignedConstant(const APInt &Value); +  void addUnsignedConstant(const APInt &Value);    /// Emit a machine register location. As an optimization this may also consume    /// the prefix of a DwarfExpression if a more efficient representation for @@ -181,7 +194,7 @@ public:    ///                                 fragment inside the entire variable.    /// \return                         false if no DWARF register exists    ///                                 for MachineReg. -  bool AddMachineRegExpression(const TargetRegisterInfo &TRI, +  bool addMachineRegExpression(const TargetRegisterInfo &TRI,                                 DIExpressionCursor &Expr, unsigned MachineReg,                                 unsigned FragmentOffsetInBits = 0);    /// Emit all remaining operations in the DIExpressionCursor. @@ -189,7 +202,7 @@ public:    /// \param FragmentOffsetInBits     If this is one fragment out of multiple    ///                                 locations, this is the offset of the    ///                                 fragment inside the entire variable. -  void AddExpression(DIExpressionCursor &&Expr, +  void addExpression(DIExpressionCursor &&Expr,                       unsigned FragmentOffsetInBits = 0);    /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to @@ -198,33 +211,32 @@ public:  };  /// DwarfExpression implementation for .debug_loc entries. -class DebugLocDwarfExpression : public DwarfExpression { +class DebugLocDwarfExpression final : public DwarfExpression {    ByteStreamer &BS; +  void emitOp(uint8_t Op, const char *Comment = nullptr) override; +  void emitSigned(int64_t Value) override; +  void emitUnsigned(uint64_t Value) override; +  bool isFrameRegister(const TargetRegisterInfo &TRI, +                       unsigned MachineReg) override;  public:    DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)        : DwarfExpression(DwarfVersion), BS(BS) {} - -  void EmitOp(uint8_t Op, const char *Comment = nullptr) override; -  void EmitSigned(int64_t Value) override; -  void EmitUnsigned(uint64_t Value) override; -  bool isFrameRegister(const TargetRegisterInfo &TRI, -                       unsigned MachineReg) override;  };  /// DwarfExpression implementation for singular DW_AT_location. 
-class DIEDwarfExpression : public DwarfExpression { +class DIEDwarfExpression final : public DwarfExpression {  const AsmPrinter &AP;    DwarfUnit &DU;    DIELoc &DIE; -public: -  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); -  void EmitOp(uint8_t Op, const char *Comment = nullptr) override; -  void EmitSigned(int64_t Value) override; -  void EmitUnsigned(uint64_t Value) override; +  void emitOp(uint8_t Op, const char *Comment = nullptr) override; +  void emitSigned(int64_t Value) override; +  void emitUnsigned(uint64_t Value) override;    bool isFrameRegister(const TargetRegisterInfo &TRI,                         unsigned MachineReg) override; +public: +  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);    DIELoc *finalize() {      DwarfExpression::finalize();      return &DIE; diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2a866c071f59..bad5b09553cd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -54,15 +54,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,      : DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU),        DIE(DIE) {} -void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) { +void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {    DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);  } -void DIEDwarfExpression::EmitSigned(int64_t Value) { +void DIEDwarfExpression::emitSigned(int64_t Value) {    DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);  } -void DIEDwarfExpression::EmitUnsigned(uint64_t Value) { +void DIEDwarfExpression::emitUnsigned(uint64_t Value) {    DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);  } @@ -98,25 +98,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const {    default:      break; -  case dwarf::DW_LANG_C89: -  case dwarf::DW_LANG_C99: +  // The languages below have valid values in all DWARF versions.    case dwarf::DW_LANG_C: +  case dwarf::DW_LANG_C89:    case dwarf::DW_LANG_C_plus_plus: -  case dwarf::DW_LANG_ObjC: -  case dwarf::DW_LANG_ObjC_plus_plus:      return 0;    case dwarf::DW_LANG_Fortran77:    case dwarf::DW_LANG_Fortran90: -  case dwarf::DW_LANG_Fortran95:      return 1; -  // The languages below have valid values only if the DWARF version >= 4. +  // The languages below have valid values only if the DWARF version >= 3. +  case dwarf::DW_LANG_C99: +  case dwarf::DW_LANG_ObjC: +  case dwarf::DW_LANG_ObjC_plus_plus: +    if (DD->getDwarfVersion() >= 3) +      return 0; +    break; + +  case dwarf::DW_LANG_Fortran95: +    if (DD->getDwarfVersion() >= 3) +      return 1; +    break; + +  // Starting with DWARF v4, all defined languages have valid values. +  case dwarf::DW_LANG_D:    case dwarf::DW_LANG_Java:    case dwarf::DW_LANG_Python:    case dwarf::DW_LANG_UPC: -  case dwarf::DW_LANG_D: -    if (dwarf::DWARF_VERSION >= 4) +    if (DD->getDwarfVersion() >= 4)        return 0;      break; @@ -127,31 +137,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const {    case dwarf::DW_LANG_Modula2:    case dwarf::DW_LANG_Pascal83:    case dwarf::DW_LANG_PLI: -    if (dwarf::DWARF_VERSION >= 4) +    if (DD->getDwarfVersion() >= 4)        return 1;      break; -  // The languages below have valid values only if the DWARF version >= 5. -  case dwarf::DW_LANG_OpenCL: -  case dwarf::DW_LANG_Go: -  case dwarf::DW_LANG_Haskell: +  // The languages below are new in DWARF v5. 
+  case dwarf::DW_LANG_BLISS: +  case dwarf::DW_LANG_C11:    case dwarf::DW_LANG_C_plus_plus_03:    case dwarf::DW_LANG_C_plus_plus_11: +  case dwarf::DW_LANG_C_plus_plus_14: +  case dwarf::DW_LANG_Dylan: +  case dwarf::DW_LANG_Go: +  case dwarf::DW_LANG_Haskell:    case dwarf::DW_LANG_OCaml: +  case dwarf::DW_LANG_OpenCL: +  case dwarf::DW_LANG_RenderScript:    case dwarf::DW_LANG_Rust: -  case dwarf::DW_LANG_C11:    case dwarf::DW_LANG_Swift: -  case dwarf::DW_LANG_Dylan: -  case dwarf::DW_LANG_C_plus_plus_14: -    if (dwarf::DWARF_VERSION >= 5) +    if (DD->getDwarfVersion() >= 5)        return 0;      break; -  case dwarf::DW_LANG_Modula3: -  case dwarf::DW_LANG_Julia:    case dwarf::DW_LANG_Fortran03:    case dwarf::DW_LANG_Fortran08: -    if (dwarf::DWARF_VERSION >= 5) +  case dwarf::DW_LANG_Julia: +  case dwarf::DW_LANG_Modula3: +    if (DD->getDwarfVersion() >= 5)        return 1;      break;    } @@ -285,13 +297,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {                 dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));  } -void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, -                                    StringRef Identifier) { -  uint64_t Signature = DD->makeTypeSignature(Identifier); -  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, -               DIEInteger(Signature)); -} -  void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,                              DIEEntry Entry) {    const DIEUnit *CU = Die.getUnit(); @@ -465,50 +470,47 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,    // Decode the original location, and use that as the start of the byref    // variable's location.    DIELoc *Loc = new (DIEValueAllocator) DIELoc; -  SmallVector<uint64_t, 6> DIExpr; -  DIEDwarfExpression Expr(*Asm, *this, *Loc); - -  bool validReg; -  if (Location.isReg()) -    validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), -                                  Location.getReg()); -  else -    validReg = -        Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), -                                   Location.getReg(), Location.getOffset()); - -  if (!validReg) -    return; +  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); +  SmallVector<uint64_t, 9> Ops; +  if (Location.isIndirect()) { +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(Location.getOffset()); +    Ops.push_back(dwarf::DW_OP_deref); +  }    // If we started with a pointer to the __Block_byref... struct, then    // the first thing we need to do is dereference the pointer (DW_OP_deref).    if (isPointer) -    DIExpr.push_back(dwarf::DW_OP_deref); +    Ops.push_back(dwarf::DW_OP_deref);    // Next add the offset for the '__forwarding' field:    // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in    // adding the offset if it's 0.    if (forwardingFieldOffset > 0) { -    DIExpr.push_back(dwarf::DW_OP_plus); -    DIExpr.push_back(forwardingFieldOffset); +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(forwardingFieldOffset);    }    // Now dereference the __forwarding field to get to the real __Block_byref    // struct:  DW_OP_deref. -  DIExpr.push_back(dwarf::DW_OP_deref); +  Ops.push_back(dwarf::DW_OP_deref);    // Now that we've got the real __Block_byref... struct, add the offset    // for the variable's field to get to the location of the actual variable:    // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.    
if (varFieldOffset > 0) { -    DIExpr.push_back(dwarf::DW_OP_plus); -    DIExpr.push_back(varFieldOffset); +    Ops.push_back(dwarf::DW_OP_plus); +    Ops.push_back(varFieldOffset);    } -  Expr.AddExpression(makeArrayRef(DIExpr)); -  Expr.finalize(); + +  DIExpressionCursor Cursor(Ops); +  const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); +  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) +    return; +  DwarfExpr.addExpression(std::move(Cursor));    // Now attach the location information to the DIE. -  addBlock(Die, Attribute, Loc); +  addBlock(Die, Attribute, DwarfExpr.finalize());  }  /// Return true if type encoding is unsigned. @@ -672,7 +674,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {    return getDIE(Context);  } -DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { +DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) {    auto *Context = resolve(Ty->getScope());    DIE *ContextDIE = getOrCreateContextDIE(Context); @@ -684,8 +686,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {    constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); -  if (!Ty->isExternalTypeRef()) -    updateAcceleratorTables(Context, Ty, TyDIE); +  updateAcceleratorTables(Context, Ty, TyDIE);    return &TyDIE;  } @@ -841,6 +842,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {    // Add source line info if available and TyDesc is not a forward declaration.    if (!DTy->isForwardDecl())      addSourceLine(Buffer, DTy); + +  // If DWARF address space value is other than None, add it for pointer and +  // reference types as DW_AT_address_class. +  if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type || +                                      Tag == dwarf::DW_TAG_reference_type)) +    addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4, +            DTy->getDWARFAddressSpace().getValue());  }  void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { @@ -892,13 +900,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {  }  void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { -  if (CTy->isExternalTypeRef()) { -    StringRef Identifier = CTy->getIdentifier(); -    assert(!Identifier.empty() && "external type ref without identifier"); -    addFlag(Buffer, dwarf::DW_AT_declaration); -    return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); -  } -    // Add name if not anonymous or intermediate type.    StringRef Name = CTy->getName(); @@ -1180,8 +1181,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,  }  void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, -                                          bool Minimal) { -  if (!Minimal) +                                          bool SkipSPAttributes) { +  // If -fdebug-info-for-profiling is enabled, need to emit the subprogram +  // and its source location. 
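After the register prefix (built exactly as in addAddress), addBlockByrefAddress appends the walk through the __Block_byref struct: an optional dereference of the incoming pointer, a hop to the __forwarding field, a dereference, and finally the variable's field offset. A standalone sketch of that tail, with DW_OP_plus later lowered to DW_OP_plus_uconst by addExpression (names are the patch's locals, the function itself is illustrative):

#include <cstdint>
#include <vector>

enum : uint64_t { DW_OP_deref = 0x06, DW_OP_plus = 0x22 };

std::vector<uint64_t> blockByrefOps(bool IsPointer,
                                    uint64_t ForwardingFieldOffset,
                                    uint64_t VarFieldOffset) {
  std::vector<uint64_t> Ops;
  if (IsPointer)                      // start by dereferencing the pointer
    Ops.push_back(DW_OP_deref);
  if (ForwardingFieldOffset > 0) {    // step to the __forwarding field
    Ops.push_back(DW_OP_plus);
    Ops.push_back(ForwardingFieldOffset);
  }
  Ops.push_back(DW_OP_deref);         // follow __forwarding
  if (VarFieldOffset > 0) {           // finally, the variable's own field
    Ops.push_back(DW_OP_plus);
    Ops.push_back(VarFieldOffset);
  }
  return Ops;
}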
+  bool SkipSPSourceLocation = SkipSPAttributes && +                              !CUNode->getDebugInfoForProfiling(); +  if (!SkipSPSourceLocation)      if (applySubprogramDefinitionAttributes(SP, SPDie))        return; @@ -1189,12 +1194,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,    if (!SP->getName().empty())      addString(SPDie, dwarf::DW_AT_name, SP->getName()); +  if (!SkipSPSourceLocation) +    addSourceLine(SPDie, SP); +    // Skip the rest of the attributes under -gmlt to save space. -  if (Minimal) +  if (SkipSPAttributes)      return; -  addSourceLine(SPDie, SP); -    // Add the prototype if we have a prototype and we have a C like    // language.    uint16_t Language = getLanguage(); @@ -1526,18 +1532,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {    return &StaticMemberDIE;  } -void DwarfUnit::emitHeader(bool UseOffsets) { +void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {    // Emit size of content not including length itself    Asm->OutStreamer->AddComment("Length of Unit");    Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());    Asm->OutStreamer->AddComment("DWARF version number"); -  Asm->EmitInt16(DD->getDwarfVersion()); -  Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); +  unsigned Version = DD->getDwarfVersion(); +  Asm->EmitInt16(Version); + +  // DWARF v5 reorders the address size and adds a unit type. +  if (Version >= 5) { +    Asm->OutStreamer->AddComment("DWARF Unit Type"); +    Asm->EmitInt8(UT); +    Asm->OutStreamer->AddComment("Address Size (in bytes)"); +    Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); +  }    // We share one abbreviations table across all units so it's always at the    // start of the section. Use a relocatable offset where needed to ensure    // linking doesn't invalidate that offset. +  Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");    const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();    if (UseOffsets)      Asm->EmitInt32(0); @@ -1545,12 +1560,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) {      Asm->emitDwarfSymbolReference(          TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false); -  Asm->OutStreamer->AddComment("Address Size (in bytes)"); -  Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); +  if (Version <= 4) { +    Asm->OutStreamer->AddComment("Address Size (in bytes)"); +    Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); +  }  }  void DwarfTypeUnit::emitHeader(bool UseOffsets) { -  DwarfUnit::emitHeader(UseOffsets); +  DwarfUnit::emitCommonHeader(UseOffsets,  +                              DD->useSplitDwarf() ? dwarf::DW_UT_split_type +                                                  : dwarf::DW_UT_type);    Asm->OutStreamer->AddComment("Type Signature");    Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));    Asm->OutStreamer->AddComment("Type DIE Offset"); @@ -1564,3 +1583,13 @@ bool DwarfTypeUnit::isDwoUnit() const {    // when split DWARF is being used.    
return DD->useSplitDwarf();  } + +void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die, +                                  const DIScope *Context) { +  getCU().addGlobalNameForTypeUnit(Name, Context); +} + +void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die, +                                  const DIScope *Context) { +  getCU().addGlobalTypeUnitType(Ty, Context); +} diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8654d6f0caf4..d626ef920f95 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -124,12 +124,12 @@ public:    std::string getParentContextString(const DIScope *Context) const;    /// Add a new global name to the compile unit. -  virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) { -  } +  virtual void addGlobalName(StringRef Name, const DIE &Die, +                             const DIScope *Context) = 0;    /// Add a new global type to the compile unit.    virtual void addGlobalType(const DIType *Ty, const DIE &Die, -                             const DIScope *Context) {} +                             const DIScope *Context) = 0;    /// Returns the DIE map slot for the specified debug variable.    /// @@ -198,9 +198,6 @@ public:    /// Add a type's DW_AT_signature and set the  declaration flag.    void addDIETypeSignature(DIE &Die, uint64_t Signature); -  /// Add an attribute containing the type signature for a unique identifier. -  void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, -                           StringRef Identifier);    /// Add block data.    void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); @@ -256,15 +253,12 @@ public:    DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);    void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, -                                 bool Minimal = false); +                                 bool SkipSPAttributes = false);    /// Find existing DIE or create new DIE for the given type.    DIE *getOrCreateTypeDIE(const MDNode *N);    /// Get context owner's DIE. -  DIE *createTypeDIE(const DICompositeType *Ty); - -  /// Get context owner's DIE.    DIE *getOrCreateContextDIE(const DIScope *Context);    /// Construct DIEs for types that contain vtables. @@ -282,11 +276,13 @@ public:    virtual unsigned getHeaderSize() const {      return sizeof(int16_t) + // DWARF version number             sizeof(int32_t) + // Offset Into Abbrev. Section -           sizeof(int8_t);   // Pointer Size (in bytes) +           sizeof(int8_t) +  // Pointer Size (in bytes) +           (DD->getDwarfVersion() >= 5 ? sizeof(int8_t) +                                       : 0); // DWARF v5 unit type    }    /// Emit the header for this unit, not including the initial length field. -  virtual void emitHeader(bool UseOffsets); +  virtual void emitHeader(bool UseOffsets) = 0;    virtual DwarfCompileUnit &getCU() = 0; @@ -306,6 +302,14 @@ protected:      return Ref.resolve();    } +  /// If this is a named finished type then include it in the list of types for +  /// the accelerator tables. +  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, +                               const DIE &TyDIE); + +  /// Emit the common part of the header for this unit. 
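emitCommonHeader and the updated getHeaderSize agree on the layout change: DWARF v5 inserts a one-byte unit type and moves the address size ahead of the abbreviation offset, so the header (excluding the initial length field) grows from 7 to 8 bytes. A small self-checking sketch of that arithmetic:

#include <cassert>

unsigned unitHeaderSize(unsigned DwarfVersion) {
  unsigned Size = 2    // DWARF version number
                + 4    // offset into .debug_abbrev
                + 1;   // address size
  if (DwarfVersion >= 5)
    Size += 1;         // DW_UT_* unit type byte
  return Size;
}

int main() {
  assert(unitHeaderSize(4) == 7);
  assert(unitHeaderSize(5) == 8); // type units add their signature and offset on top
  return 0;
}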
+  void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT); +  private:    void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);    void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy); @@ -330,11 +334,6 @@ private:    /// Set D as anonymous type for index which can be reused later.    void setIndexTyDie(DIE *D) { IndexTyDie = D; } -  /// If this is a named finished type then include it in the list of types for -  /// the accelerator tables. -  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, -                               const DIE &TyDIE); -    virtual bool isDwoUnit() const = 0;  }; @@ -354,12 +353,19 @@ public:    void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }    void setType(const DIE *Ty) { this->Ty = Ty; } +  /// Get context owner's DIE. +  DIE *createTypeDIE(const DICompositeType *Ty); +    /// Emit the header for this unit, not including the initial length field.    void emitHeader(bool UseOffsets) override;    unsigned getHeaderSize() const override {      return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature             sizeof(uint32_t);                               // Type DIE Offset    } +  void addGlobalName(StringRef Name, const DIE &Die, +                     const DIScope *Context) override; +  void addGlobalType(const DIType *Ty, const DIE &Die, +                     const DIScope *Context) override;    DwarfCompileUnit &getCU() override { return CU; }  };  } // end llvm namespace diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 6a023b998b32..342efc3611c7 100644 --- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -1,4 +1,4 @@ -//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===// +//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,21 +14,19 @@  //===----------------------------------------------------------------------===//  #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h"  #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h"  #include "llvm/CodeGen/GCs.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Module.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCSectionELF.h"  #include "llvm/MC/MCStreamer.h"  #include "llvm/MC/MCSymbol.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetLoweringObjectFile.h"  +#include "llvm/Support/ELF.h"  using namespace llvm; @@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter {  public:    void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;  }; -} + +} // end anonymous namespace  static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>      X("erlang", "erlang-compatible garbage collector"); -void llvm::linkErlangGCPrinter() {} -  void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,                                       AsmPrinter &AP) {    MCStreamer &OS = *AP.OutStreamer; @@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,      }    }  } + +void llvm::linkErlangGCPrinter() {} diff --git 
a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 9d7c96a1b8ef..704f0ac2f191 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) {    const Function *F = MF->getFunction(); -  shouldEmitMoves = Asm->needsSEHMoves(); +  shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI();    const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();    unsigned PerEncoding = TLOF.getPersonalityEncoding(); @@ -94,7 +94,7 @@ void WinException::beginFunction(const MachineFunction *MF) {    // If we're not using CFI, we don't want the CFI or the personality, but we    // might want EH tables if we had EH pads. -  if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) { +  if (!Asm->MAI->usesWindowsCFI()) {      if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) {        // If this is 32-bit SEH and we don't have any funclets (really invokes),        // make sure we emit the parent offset label. Some unreferenced filter diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index bf5cf105a8f8..9c19a4fd3c3e 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -1532,7 +1532,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(    Type *ResultTy;    SmallVector<Value *, 6> Args; -  AttributeSet Attr; +  AttributeList Attr;    // 'size' argument.    if (!UseSizedLibcall) { @@ -1593,7 +1593,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(    // Now, the return type.    if (CASExpected) {      ResultTy = Type::getInt1Ty(Ctx); -    Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt); +    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);    } else if (HasResult && UseSizedLibcall)      ResultTy = SizedIntTy;    else diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp new file mode 100644 index 000000000000..efdf300df850 --- /dev/null +++ b/lib/CodeGen/BranchCoalescing.cpp @@ -0,0 +1,758 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "coal-branch" + +static cl::opt<cl::boolOrDefault> +    EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, +                           cl::desc("enable coalescing of duplicate branches")); + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +//===----------------------------------------------------------------------===// +//                               BranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. +/// +/// For example, consider the following LLVM IR: +/// +/// %test = icmp eq i32 %x 0 +/// %tmp1 = select i1 %test, double %a, double 2.000000e-03 +/// %tmp2 = select i1 %test, double %b, double 5.000000e-03 +/// +/// This IR expands to the following machine code on PowerPC: +/// +/// BB#0: derived from LLVM BB %entry +///    Live Ins: %F1 %F3 %X6 +///        <SNIP1> +///        %vreg0<def> = COPY %F1; F8RC:%vreg0 +///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +///                    mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +///        BCC 76, %vreg5, <BB#2>; CRRC:%vreg5 +///    Successors according to CFG: BB#1(?%) BB#2(?%) +/// +/// BB#1: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#0 +///    Successors according to CFG: BB#2(?%) +/// +/// BB#2: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#0 BB#1 +///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +///                    F8RC:%vreg9,%vreg8,%vreg0 +///        <SNIP2> +///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +///    Successors according to CFG: BB#3(?%) BB#4(?%) +/// +/// BB#3: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#2 +///    Successors according to CFG: BB#4(?%) +/// +/// BB#4: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#2 BB#3 +///        %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>; +///                     F8RC:%vreg13,%vreg12,%vreg2 +///        <SNIP3> +///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// When this pattern is detected, branch coalescing will try to collapse +/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3. 
+/// +/// If all conditions are met, IR should collapse to: +/// +/// BB#0: derived from LLVM BB %entry +///    Live Ins: %F1 %F3 %X6 +///        <SNIP1> +///        %vreg0<def> = COPY %F1; F8RC:%vreg0 +///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +///                     mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +///        <SNIP2> +///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +///    Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%) +///      BB#4(0x55555554 / 0x80000000 = 66.67%) +/// +/// BB#1: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#0 +///    Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%) +/// +/// BB#4: derived from LLVM BB %entry +///    Predecessors according to CFG: BB#0 BB#1 +///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +///                    F8RC:%vreg9,%vreg8,%vreg0 +///        %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>; +///                     F8RC:%vreg13,%vreg12,%vreg2 +///        <SNIP3> +///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// Branch Coalescing does not split blocks; it moves everything in the same +/// direction, ensuring it does not break use/definition semantics. +/// +/// PHI nodes and their corresponding use instructions are moved to the successor +/// block if there are no uses within the successor block's PHI nodes.  PHI +/// node ordering cannot be assumed. +/// +/// Non-PHI instructions can be moved up to the predecessor basic block or down to +/// the successor basic block following any PHI instructions. Whether they move +/// up or down depends on whether the register(s) defined in the instructions +/// are used in the current block or in any PHI instructions at the beginning of +/// the successor block. 
+ +namespace { + +class BranchCoalescing : public MachineFunctionPass { +  struct CoalescingCandidateInfo { +    MachineBasicBlock *BranchBlock;       // Block containing the branch +    MachineBasicBlock *BranchTargetBlock; // Block branched to +    MachineBasicBlock *FallThroughBlock;  // Fall-through if branch not taken +    SmallVector<MachineOperand, 4> Cond; +    bool MustMoveDown; +    bool MustMoveUp; + +    CoalescingCandidateInfo(); +    void clear(); +  }; + +  MachineDominatorTree *MDT; +  MachinePostDominatorTree *MPDT; +  const TargetInstrInfo *TII; +  MachineRegisterInfo *MRI; + +  void initialize(MachineFunction &F); +  bool canCoalesceBranch(CoalescingCandidateInfo &Cand); +  bool identicalOperands(ArrayRef<MachineOperand> OperandList1, +                         ArrayRef<MachineOperand> OperandList2) const; +  bool validateCandidates(CoalescingCandidateInfo &SourceRegion, +                          CoalescingCandidateInfo &TargetRegion) const; + +  static bool isBranchCoalescingEnabled() { +    return EnableBranchCoalescing == cl::BOU_TRUE; +  } + +public: +  static char ID; + +  BranchCoalescing() : MachineFunctionPass(ID) { +    initializeBranchCoalescingPass(*PassRegistry::getPassRegistry()); +  } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<MachineDominatorTree>(); +    AU.addRequired<MachinePostDominatorTree>(); +    MachineFunctionPass::getAnalysisUsage(AU); +  } + +  StringRef getPassName() const override { return "Branch Coalescing"; } + +  bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, +                       CoalescingCandidateInfo &TargetRegion); +  bool canMoveToBeginning(const MachineInstr &MI, +                          const MachineBasicBlock &MBB) const; +  bool canMoveToEnd(const MachineInstr &MI, +                    const MachineBasicBlock &MBB) const; +  bool canMerge(CoalescingCandidateInfo &SourceRegion, +                CoalescingCandidateInfo &TargetRegion) const; +  void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB, +                         MachineBasicBlock *TargetRegionMBB); +  bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char BranchCoalescing::ID = 0; +char &llvm::BranchCoalescingID = BranchCoalescing::ID; + +INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing", +                      "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing", +                    false, false) + +BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() +    : BranchBlock(nullptr), BranchTargetBlock(nullptr), +      FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void BranchCoalescing::CoalescingCandidateInfo::clear() { +  BranchBlock = nullptr; +  BranchTargetBlock = nullptr; +  FallThroughBlock = nullptr; +  Cond.clear(); +  MustMoveDown = false; +  MustMoveUp = false; +} + +void BranchCoalescing::initialize(MachineFunction &MF) { +  MDT = &getAnalysis<MachineDominatorTree>(); +  MPDT = &getAnalysis<MachinePostDominatorTree>(); +  TII = MF.getSubtarget().getInstrInfo(); +  MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement to determine if it can be coalesced. This +/// method analyses the branch statement for the given candidate to determine +/// if it can be coalesced. 
If the branch can be coalesced, then the +/// BranchTargetBlock and the FallThroughBlock are recorded in the specified +/// Candidate. +/// +///\param[in,out] Cand The coalescing candidate to analyze +///\return true if and only if the branch can be coalesced, false otherwise +/// +bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { +  DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() +               << " can be coalesced:"); +  MachineBasicBlock *FalseMBB = nullptr; + +  if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, +                         Cand.Cond)) { +    DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); +    return false; +  } + +  for (auto &I : Cand.BranchBlock->terminators()) { +    DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); +    if (!I.isBranch()) +      continue; + +    if (I.getNumOperands() != I.getNumExplicitOperands()) { +      DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I +                   << "\n"); +      return false; +    } +  } + +  if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { +    DEBUG(dbgs() << "EH Pad - skip\n"); +    return false; +  } + +  // For now only consider triangles (i.e, BranchTargetBlock is set, +  // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) +  if (!Cand.BranchTargetBlock || FalseMBB || +      !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { +    DEBUG(dbgs() << "Does not form a triangle - skip\n"); +    return false; +  } + +  // Ensure there are only two successors +  if (Cand.BranchBlock->succ_size() != 2) { +    DEBUG(dbgs() << "Does not have 2 successors - skip\n"); +    return false; +  } + +  // Sanity check - the block must be able to fall through +  assert(Cand.BranchBlock->canFallThrough() && +         "Expecting the block to fall through!"); + +  // We have already ensured there are exactly two successors to +  // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. +  // Ensure the single fall though block is empty. +  MachineBasicBlock *Succ = +    (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) +    ? 
*Cand.BranchBlock->succ_rbegin() +    : *Cand.BranchBlock->succ_begin(); + +  assert(Succ && "Expecting a valid fall-through block\n"); + +  if (!Succ->empty()) { +      DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); +      return false; +  } + +  if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { +      DEBUG(dbgs() +            << "Successor of fall through block is not branch taken block\n"); +      return false; +  } + +  Cand.FallThroughBlock = Succ; +  DEBUG(dbgs() << "Valid Candidate\n"); +  return true; +} + +/// +/// Determine if the two operand lists are identical +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operands lists are identical +/// +bool BranchCoalescing::identicalOperands( +    ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + +  if (OpList1.size() != OpList2.size()) { +    DEBUG(dbgs() << "Operand list is different size\n"); +    return false; +  } + +  for (unsigned i = 0; i < OpList1.size(); ++i) { +    const MachineOperand &Op1 = OpList1[i]; +    const MachineOperand &Op2 = OpList2[i]; + +    DEBUG(dbgs() << "Op1: " << Op1 << "\n" +                 << "Op2: " << Op2 << "\n"); + +    if (Op1.isIdenticalTo(Op2)) { +      DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); +      continue; +    } + +    // If the operands are not identical, but are registers, check to see if the +    // definition of the register produces the same value. If they produce the +    // same value, consider them to be identical. +    if (Op1.isReg() && Op2.isReg() && +        TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && +        TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { +      MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); +      MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); +      if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { +        DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def +                     << " produce the same value!\n"); +      } else { +        DEBUG(dbgs() << "Operands produce different values\n"); +        return false; +      } +    } else { +      DEBUG(dbgs() << "The operands are not provably identical.\n"); +      return false; +    } +  } +  return true; +} + +/// +/// Moves ALL PHI instructions in SourceMBB to beginning of TargetMBB +/// and update them to refer to the new block.  PHI node ordering +/// cannot be assumed so it does not matter where the PHI instructions +/// are moved to in TargetMBB. 
+/// +/// \param[in] SourceMBB block to move PHI instructions from +/// \param[in] TargetMBB block to move PHI instructions to +/// +void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB, +                                         MachineBasicBlock *TargetMBB) { + +  MachineBasicBlock::iterator MI = SourceMBB->begin(); +  MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI(); + +  if (MI == ME) { +    DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n"); +    return; +  } + +  // Update all PHI instructions in SourceMBB and move to top of TargetMBB +  for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) { +    MachineInstr &PHIInst = *Iter; +    for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) { +      MachineOperand &MO = PHIInst.getOperand(i); +      if (MO.getMBB() == SourceMBB) +        MO.setMBB(TargetMBB); +    } +  } +  TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME); +} + +/// +/// This function checks if MI can be moved to the beginning of the TargetMBB +/// following PHI instructions. An MI instruction can be moved to the beginning of +/// the TargetMBB if there are no uses of it within the TargetMBB PHI nodes. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to the beginning of TargetMBB, +///         false otherwise. +/// +bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI, +                                          const MachineBasicBlock &TargetMBB +                                          ) const { + +  DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of " +        << TargetMBB.getNumber() << "\n"); + +  for (auto &Def : MI.defs()) { // Looking at Def +    for (auto &Use : MRI->use_instructions(Def.getReg())) { +      if (Use.isPHI() && Use.getParent() == &TargetMBB) { +        DEBUG(dbgs() << "    *** used in a PHI -- cannot move ***\n"); +       return false; +      } +    } +  } + +  DEBUG(dbgs() << "  Safe to move to the beginning.\n"); +  return true; +} + +/// +/// This function checks if MI can be moved to the end of the TargetMBB, +/// immediately before the first terminator.  An MI instruction can be moved +/// to the end of the TargetMBB if no PHI node defines what MI uses within +/// its own MBB. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to the end of TargetMBB, +///         false otherwise. +/// +bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI, +                                    const MachineBasicBlock &TargetMBB +                                    ) const { + +  DEBUG(dbgs() << "Checking if " << MI << " can move to end of " +        << TargetMBB.getNumber() << "\n"); + +  for (auto &Use : MI.uses()) { +    if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { +      MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); +      if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { +        DEBUG(dbgs() << "    *** Cannot move this instruction ***\n"); +        return false; +      } else { +        DEBUG(dbgs() << "    *** def is in another block -- safe to move!\n"); +      } +    } +  } + +  DEBUG(dbgs() << "  Safe to move to the end.\n"); +  return true; +} + +/// +/// This method checks to ensure the two coalescing candidates follow the +/// expected pattern required for coalescing. 
+/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion; false otherwise. +/// +bool BranchCoalescing::validateCandidates( +    CoalescingCandidateInfo &SourceRegion, +    CoalescingCandidateInfo &TargetRegion) const { + +  if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock) +    llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion"); +  else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock)) +    llvm_unreachable("Expecting TargetRegion to dominate SourceRegion"); +  else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock)) +    llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion"); +  else if (!TargetRegion.FallThroughBlock->empty() || +           !SourceRegion.FallThroughBlock->empty()) +    llvm_unreachable("Expecting fall-through blocks to be empty"); + +  return true; +} + +/// +/// This method determines whether the two coalescing candidates can be merged. +/// In order to be merged, all instructions must be able to +///   1. Move to the beginning of the SourceRegion.BranchTargetBlock; +///   2. Move to the end of the TargetRegion.BranchBlock. +/// Merging involves moving the instructions in the +/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock). +/// +/// This function first tries to move instructions from the +/// TargetRegion.BranchTargetBlock down to the beginning of the +/// SourceRegion.BranchTargetBlock. This is not possible if any register defined +/// in TargetRegion.BranchTargetBlock is used in a PHI node in the +/// SourceRegion.BranchTargetBlock. In this case, check whether the statement +/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately +/// before the branch statement). If it cannot move, then these blocks cannot +/// be merged. +/// +/// Note that there is no analysis for moving instructions past the fall-through +/// blocks because they are confirmed to be empty. An assert is thrown if they +/// are not. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +///         into a block in TargetRegion, false otherwise. +/// +bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion, +                                CoalescingCandidateInfo &TargetRegion) const { +  if (!validateCandidates(SourceRegion, TargetRegion)) +    return false; + +  // Walk through PHI nodes first and see if they force the merge into the +  // SourceRegion.BranchTargetBlock. 
+  for (MachineBasicBlock::iterator +           I = SourceRegion.BranchBlock->instr_begin(), +           E = SourceRegion.BranchBlock->getFirstNonPHI(); +       I != E; ++I) { +    for (auto &Def : I->defs()) +      for (auto &Use : MRI->use_instructions(Def.getReg())) { +        if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) { +          DEBUG(dbgs() << "PHI " << *I << " defines register used in another " +                          "PHI within branch target block -- can't merge\n"); +          NumPHINotMoved++; +          return false; +        } +        if (Use.getParent() == SourceRegion.BranchBlock) { +          DEBUG(dbgs() << "PHI " << *I +                       << " defines register used in this " +                          "block -- all must move down\n"); +          SourceRegion.MustMoveDown = true; +        } +      } +  } + +  // Walk through the MIs to see if they should be merged into +  // TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down) +  for (MachineBasicBlock::iterator +           I = SourceRegion.BranchBlock->getFirstNonPHI(), +           E = SourceRegion.BranchBlock->end(); +       I != E; ++I) { +    if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) { +      DEBUG(dbgs() << "Instruction " << *I +                   << " cannot move down - must move up!\n"); +      SourceRegion.MustMoveUp = true; +    } +    if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) { +      DEBUG(dbgs() << "Instruction " << *I +                   << " cannot move up - must move down!\n"); +      SourceRegion.MustMoveDown = true; +    } +  } + +  return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true; +} + +/// Merge the instructions from SourceRegion.BranchBlock, +/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into +/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and +/// TargetRegion.FallThroughBlock respectively. +/// +/// The successors for blocks in TargetRegion will be updated to use the +/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion +/// will be removed from the function. +/// +/// A region consists of a BranchBlock, a FallThroughBlock, and a +/// BranchTargetBlock. Branch coalescing works on patterns where the +/// TargetRegion's BranchTargetBlock must also be the SourceRegion's +/// BranchBlock. 
+/// +///  Before mergeCandidates: +/// +///  +---------------------------+ +///  |  TargetRegion.BranchBlock | +///  +---------------------------+ +///     /        | +///    /   +--------------------------------+ +///   |    |  TargetRegion.FallThroughBlock | +///    \   +--------------------------------+ +///     \        | +///  +----------------------------------+ +///  |  TargetRegion.BranchTargetBlock  | +///  |  SourceRegion.BranchBlock        | +///  +----------------------------------+ +///     /        | +///    /   +--------------------------------+ +///   |    |  SourceRegion.FallThroughBlock | +///    \   +--------------------------------+ +///     \        | +///  +----------------------------------+ +///  |  SourceRegion.BranchTargetBlock  | +///  +----------------------------------+ +/// +///  After mergeCandidates: +/// +///  +-----------------------------+ +///  |  TargetRegion.BranchBlock   | +///  |  SourceRegion.BranchBlock   | +///  +-----------------------------+ +///     /        | +///    /   +---------------------------------+ +///   |    |  TargetRegion.FallThroughBlock  | +///   |    |  SourceRegion.FallThroughBlock  | +///    \   +---------------------------------+ +///     \        | +///  +----------------------------------+ +///  |  SourceRegion.BranchTargetBlock  | +///  +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, +                                       CoalescingCandidateInfo &TargetRegion) { + +  if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { +    llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); +    return false; +  } + +  if (!validateCandidates(SourceRegion, TargetRegion)) +    return false; + +  // Start the merging process by first handling the BranchBlock. +  // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block +  moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + +  // Move remaining instructions in SourceRegion.BranchBlock into +  // TargetRegion.BranchBlock +  MachineBasicBlock::iterator firstInstr = +      SourceRegion.BranchBlock->getFirstNonPHI(); +  MachineBasicBlock::iterator lastInstr = +      SourceRegion.BranchBlock->getFirstTerminator(); + +  MachineBasicBlock *Source = SourceRegion.MustMoveDown +                                  ? SourceRegion.BranchTargetBlock +                                  : TargetRegion.BranchBlock; + +  MachineBasicBlock::iterator Target = +      SourceRegion.MustMoveDown +          ? SourceRegion.BranchTargetBlock->getFirstNonPHI() +          : TargetRegion.BranchBlock->getFirstTerminator(); + +  Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + +  // Once PHI and instructions have been moved we need to clean up the +  // control flow. + +  // Remove SourceRegion.FallThroughBlock before transferring successors of +  // SourceRegion.BranchBlock to TargetRegion.BranchBlock. +  SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); +  TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( +      SourceRegion.BranchBlock); +  // Update branch in TargetRegion.BranchBlock to jump to +  // SourceRegion.BranchTargetBlock +  // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
+  TargetRegion.BranchBlock->ReplaceUsesOfBlockWith( +      SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); +  // Remove the branch statement(s) in SourceRegion.BranchBlock +  MachineBasicBlock::iterator I = +      SourceRegion.BranchBlock->terminators().begin(); +  while (I != SourceRegion.BranchBlock->terminators().end()) { +    MachineInstr &CurrInst = *I; +    ++I; +    if (CurrInst.isBranch()) +      CurrInst.eraseFromParent(); +  } + +  // Fall-through block should be empty since this is part of the condition +  // to coalesce the branches. +  assert(TargetRegion.FallThroughBlock->empty() && +         "FallThroughBlocks should be empty!"); + +  // Transfer successor information and move PHIs down to the +  // branch-taken block. +  TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs( +      SourceRegion.FallThroughBlock); +  TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock); + +  // Remove the blocks from the function. +  assert(SourceRegion.BranchBlock->empty() && +         "Expecting branch block to be empty!"); +  SourceRegion.BranchBlock->eraseFromParent(); + +  assert(SourceRegion.FallThroughBlock->empty() && +         "Expecting fall-through block to be empty!\n"); +  SourceRegion.FallThroughBlock->eraseFromParent(); + +  NumBlocksCoalesced++; +  return true; +} + +bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) { + +  if (skipFunction(*MF.getFunction()) || MF.empty() || +      !isBranchCoalescingEnabled()) +    return false; + +  bool didSomething = false; + +  DEBUG(dbgs() << "******** Branch Coalescing ********\n"); +  initialize(MF); + +  DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); + +  CoalescingCandidateInfo Cand1, Cand2; +  // Walk over blocks and find candidates to merge +  // Continue trying to merge with the first candidate found, as long as merging +  // is successful. 
+  for (MachineBasicBlock &MBB : MF) { +    bool MergedCandidates = false; +    do { +      MergedCandidates = false; +      Cand1.clear(); +      Cand2.clear(); + +      Cand1.BranchBlock = &MBB; + +      // If unable to coalesce the branch, then continue to next block +      if (!canCoalesceBranch(Cand1)) +        break; + +      Cand2.BranchBlock = Cand1.BranchTargetBlock; +      if (!canCoalesceBranch(Cand2)) +        break; + +      // Sanity check +      // The branch-taken block of the second candidate should post-dominate the +      // first candidate +      assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && +             "Branch-taken block should post-dominate first candidate"); + +      if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { +        DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " +                     << Cand2.BranchBlock->getNumber() +                     << " have different branches\n"); +        break; +      } +      if (!canMerge(Cand2, Cand1)) { +        DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() +                     << " and " << Cand2.BranchBlock->getNumber() << "\n"); +        NumBlocksNotCoalesced++; +        continue; +      } +      DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() +                   << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); +      MergedCandidates = mergeCandidates(Cand2, Cand1); +      if (MergedCandidates) +        didSomething = true; + +      DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); +    } while (MergedCandidates); +  } + +#ifndef NDEBUG +  // Verify MF is still valid after branch coalescing +  if (didSomething) +    MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + +  DEBUG(dbgs() << "Finished Branch Coalescing\n"); +  return didSomething; +} diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 6fba161033b0..2d01301402f0 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -32,6 +32,7 @@  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/Passes.h"  #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DebugInfoMetadata.h"  #include "llvm/IR/Function.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h" @@ -49,6 +50,7 @@ STATISTIC(NumDeadBlocks, "Number of dead blocks removed");  STATISTIC(NumBranchOpts, "Number of branches optimized");  STATISTIC(NumTailMerge , "Number of block tails merged");  STATISTIC(NumHoist     , "Number of times common instructions are hoisted"); +STATISTIC(NumTailCalls,  "Number of tail calls optimized");  static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",                                cl::init(cl::BOU_UNSET), cl::Hidden); @@ -123,8 +125,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,    }  } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG.  void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {    assert(MBB->pred_empty() && "MBB must be dead!");    DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); @@ -144,9 +144,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {      MLI->removeBlock(MBB);  } -/// OptimizeFunction - Perhaps branch folding, tail merging and other -/// CFG optimizations on the given function.  Block placement changes the layout -/// and may create new tail merging opportunities.  
bool BranchFolder::OptimizeFunction(MachineFunction &MF,                                      const TargetInstrInfo *tii,                                      const TargetRegisterInfo *tri, @@ -348,8 +345,6 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,    return TailLen;  } -/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest.  void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,                                             MachineBasicBlock *NewDest) {    TII->ReplaceTailWithBranchTo(OldInst, NewDest); @@ -362,9 +357,6 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,    ++NumTailMerge;  } -/// SplitMBBAt - Given a machine basic block and an iterator into it, split the -/// MBB so that the part before the iterator falls into the part starting at the -/// iterator.  This returns the new MBB.  MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,                                              MachineBasicBlock::iterator BBI1,                                              const BasicBlock *BB) { @@ -388,7 +380,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,    NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());    // NewMBB belongs to the same loop as CurMBB. -  if (MLI)  +  if (MLI)      if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))        ML->addBasicBlockToLoop(NewMBB, MLI->getBase()); @@ -436,7 +428,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,    MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;    SmallVector<MachineOperand, 4> Cond; -  DebugLoc dl;  // FIXME: this is nowhere +  DebugLoc dl = CurMBB->findBranchDebugLoc();    if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {      MachineBasicBlock *NextBB = &*I;      if (TBB == NextBB && !Cond.empty() && !FBB) { @@ -497,6 +489,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,    return MBFI.printBlockFreq(OS, Freq);  } +void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) { +  MBFI.view(Name, isSimple); +} + +uint64_t +BranchFolder::MBFIWrapper::getEntryFreq() const { +  return MBFI.getEntryFreq(); +} +  /// CountTerminators - Count the number of terminators in the given  /// block and set I to the position of the first non-terminator, if there  /// is one, or MBB->end() otherwise. @@ -516,6 +517,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB,    return NumTerms;  } +/// A no successor, non-return block probably ends in unreachable and is cold. +/// Also consider a block that ends in an indirect branch to be a return block, +/// since many targets use plain indirect branches to return. +static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) { +  if (!MBB->succ_empty()) +    return false; +  if (MBB->empty()) +    return true; +  return !(MBB->back().isReturn() || MBB->back().isIndirectBranch()); +} +  /// ProfitableToMerge - Check if two machine basic blocks have a common tail  /// and decide if it would be profitable to merge those tails.  Return the  /// length of the common tail and iterators to the first common instruction @@ -570,6 +582,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,        return true;    } +  // If these are identical non-return blocks with no successors, merge them. 
+  // Such blocks are typically cold calls to noreturn functions like abort, and +  // are unlikely to become a fallthrough target after machine block placement. +  // Tail merging these blocks is unlikely to create additional unconditional +  // branches, and will reduce the size of this cold code. +  if (I1 == MBB1->begin() && I2 == MBB2->begin() && +      blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2)) +    return true; +    // If one of the blocks can be completely merged and happens to be in    // a position where the other could fall through into it, merge any number    // of instructions, because it can be done without a branch. @@ -579,6 +600,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,    if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())      return true; +  // If both blocks are identical and end in a branch, merge them unless they +  // both have a fallthrough predecessor and successor. +  // We can only do this after block placement because it depends on whether +  // there are fallthroughs, and we don't know until after layout. +  if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) { +    auto BothFallThrough = [](MachineBasicBlock *MBB) { +      if (MBB->succ_size() != 0 && !MBB->canFallThrough()) +        return false; +      MachineFunction::iterator I(MBB); +      MachineFunction *MF = MBB->getParent(); +      return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough(); +    }; +    if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2)) +      return true; +  } +    // If both blocks have an unconditional branch temporarily stripped out,    // count that as an additional common instruction for the following    // heuristics. This heuristic is only accurate for single-succ blocks, so to @@ -604,16 +641,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,           (I1 == MBB1->begin() || I2 == MBB2->begin());  } -/// ComputeSameTails - Look through all the blocks in MergePotentials that have -/// hash CurHash (guaranteed to match the last element).  Build the vector -/// SameTails of all those that have the (same) largest number of instructions -/// in common of any pair of these blocks.  SameTails entries contain an -/// iterator into MergePotentials (from which the MachineBasicBlock can be -/// found) and a MachineBasicBlock::iterator into that MBB indicating the -/// instruction where the matching code sequence begins. -/// Order of elements in SameTails is the reverse of the order in which -/// those blocks appear in MergePotentials (where they are not necessarily -/// consecutive).  unsigned BranchFolder::ComputeSameTails(unsigned CurHash,                                          unsigned MinCommonTailLength,                                          MachineBasicBlock *SuccBB, @@ -650,8 +677,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash,    return maxCommonTailLength;  } -/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from -/// MergePotentials, restoring branches at ends of blocks as appropriate.  void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,                                          MachineBasicBlock *SuccBB,                                          MachineBasicBlock *PredBB) { @@ -671,8 +696,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,    MergePotentials.erase(CurMPIter, MergePotentials.end());  } -/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist -/// only of the common tail.  
Create a block that does by splitting one.  bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,                                               MachineBasicBlock *SuccBB,                                               unsigned maxCommonTailLength, @@ -723,6 +746,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,    return true;  } +void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) { +  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); + +  std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size()); +  for (unsigned int i = 0 ; i != SameTails.size() ; ++i) { +    if (i != commonTailIndex) +      NextCommonInsts[i] = SameTails[i].getTailStartPos(); +    else { +      assert(SameTails[i].getTailStartPos() == MBB->begin() && +          "MBB is not a common tail only block"); +    } +  } + +  for (auto &MI : *MBB) { +    if (MI.isDebugValue()) +      continue; +    DebugLoc DL = MI.getDebugLoc(); +    for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) { +      if (i == commonTailIndex) +        continue; + +      auto &Pos = NextCommonInsts[i]; +      assert(Pos != SameTails[i].getBlock()->end() && +          "Reached BB end within common tail"); +      while (Pos->isDebugValue()) { +        ++Pos; +        assert(Pos != SameTails[i].getBlock()->end() && +            "Reached BB end within common tail"); +      } +      assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!"); +      DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc()); +      NextCommonInsts[i] = ++Pos; +    } +    MI.setDebugLoc(DL); +  } +} +  static void  mergeOperations(MachineBasicBlock::iterator MBBIStartPos,                  MachineBasicBlock &MBBCommon) { @@ -875,10 +935,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,      // Recompute common tail MBB's edge weights and block frequency.      setCommonTailEdgeWeights(*MBB); -    // Remove the original debug location from the common tail. -    for (auto &MI : *MBB) -      if (!MI.isDebugValue()) -        MI.setDebugLoc(DebugLoc()); +    // Merge debug locations across identical instructions for common tail. +    MergeCommonTailDebugLocs(commonTailIndex);      // MBB is common tail.  Adjust all other BB's to jump to this one.      // Traversal must be forwards so erases work. @@ -1043,7 +1101,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {          // Remove the unconditional branch at the end, if any.          if (TBB && (Cond.empty() || FBB)) { -          DebugLoc dl;  // FIXME: this is nowhere +          DebugLoc dl = PBB->findBranchDebugLoc();            TII->removeBranch(*PBB);            if (!Cond.empty())              // reinsert conditional branch only, for now @@ -1193,8 +1251,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {    return DebugLoc();  } -/// OptimizeBlock - Analyze and optimize control flow related to the specified -/// block.  This is never called on the entry block.  bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {    bool MadeChange = false;    MachineFunction &MF = *MBB->getParent(); @@ -1386,6 +1442,42 @@ ReoptimizeBlock:      }    } +  if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && +      MF.getFunction()->optForSize()) { +    // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch +    // direction, thereby defeating careful block placement and regressing +    // performance. Therefore, only consider this for optsize functions. 
+    MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); +    if (TII->isUnconditionalTailCall(TailCall)) { +      MachineBasicBlock *Pred = *MBB->pred_begin(); +      MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; +      SmallVector<MachineOperand, 4> PredCond; +      bool PredAnalyzable = +          !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + +      if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) { +        // The predecessor has a conditional branch to this block which consists +        // of only a tail call. Try to fold the tail call into the conditional +        // branch. +        if (TII->canMakeTailCallConditional(PredCond, TailCall)) { +          // TODO: It would be nice if analyzeBranch() could provide a pointer +          // to the branch instruction so replaceBranchWithTailCall() doesn't +          // have to search for it. +          TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); +          ++NumTailCalls; +          Pred->removeSuccessor(MBB); +          MadeChange = true; +          return MadeChange; +        } +      } +      // If the predecessor is falling through to this block, we could reverse +      // the branch condition and fold the tail call into that. However, after +      // that we might have to re-arrange the CFG to fall through to the other +      // block and there is a high risk of regressing code size rather than +      // improving it. +    } +  } + +  // Analyze the branch in the current block. +  MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr; +  SmallVector<MachineOperand, 4> CurCond; @@ -1599,8 +1691,6 @@ ReoptimizeBlock:  //  Hoist Common Code  //===----------------------------------------------------------------------===// -/// HoistCommonCode - Hoist common instruction sequences at the start of basic -/// blocks to their common predecessor.  bool BranchFolder::HoistCommonCode(MachineFunction &MF) {    bool MadeChange = false;    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { @@ -1734,9 +1824,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,    return PI;  } -/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction -/// sequence at the start of the function, move the instructions before MBB -/// terminator if it's legal.  bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;    SmallVector<MachineOperand, 4> Cond; diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index fc48e484292d..4852721eea10 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -37,6 +37,9 @@ namespace llvm {                            // flag. Ignored for optsize.                            unsigned MinCommonTailLength = 0); +    /// Perhaps branch folding, tail merging and other CFG optimizations on the +    /// given function.  Block placement changes the layout and may create new +    /// tail merging opportunities.      
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,                            const TargetRegisterInfo *tri, MachineModuleInfo *mmi,                            MachineLoopInfo *mli = nullptr, @@ -122,6 +125,8 @@ namespace llvm {                                    const MachineBasicBlock *MBB) const;        raw_ostream &printBlockFreq(raw_ostream &OS,                                    const BlockFrequency Freq) const; +      void view(const Twine &Name, bool isSimple = true); +      uint64_t getEntryFreq() const;      private:        const MachineBlockFrequencyInfo &MBFI; @@ -137,26 +142,64 @@ namespace llvm {                         MachineBasicBlock* PredBB,                         unsigned MinCommonTailLength);      void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB); + +    /// Delete the instruction OldInst and everything after it, replacing it +    /// with an unconditional branch to NewDest.      void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,                                   MachineBasicBlock *NewDest); + +    /// Given a machine basic block and an iterator into it, split the MBB so +    /// that the part before the iterator falls into the part starting at the +    /// iterator.  This returns the new MBB.      MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,                                    MachineBasicBlock::iterator BBI1,                                    const BasicBlock *BB); + +    /// Look through all the blocks in MergePotentials that have hash CurHash +    /// (guaranteed to match the last element).  Build the vector SameTails of +    /// all those that have the (same) largest number of instructions in common +    /// of any pair of these blocks.  SameTails entries contain an iterator into +    /// MergePotentials (from which the MachineBasicBlock can be found) and a +    /// MachineBasicBlock::iterator into that MBB indicating the instruction +    /// where the matching code sequence begins.  Order of elements in SameTails +    /// is the reverse of the order in which those blocks appear in +    /// MergePotentials (where they are not necessarily consecutive).      unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,                                MachineBasicBlock *SuccBB,                                MachineBasicBlock *PredBB); + +    /// Remove all blocks with hash CurHash from MergePotentials, restoring +    /// branches at ends of blocks as appropriate.      void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,                                                  MachineBasicBlock* PredBB); + +    /// None of the blocks to be tail-merged consist only of the common tail. +    /// Create a block that does by splitting one.      bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,                                     MachineBasicBlock *SuccBB,                                     unsigned maxCommonTailLength,                                     unsigned &commonTailIndex); +    /// Create merged DebugLocs of identical instructions across SameTails and +    /// assign it to the instruction in common tail. +    void MergeCommonTailDebugLocs(unsigned commonTailIndex); +      bool OptimizeBranches(MachineFunction &MF); + +    /// Analyze and optimize control flow related to the specified block. This +    /// is never called on the entry block.      
bool OptimizeBlock(MachineBasicBlock *MBB); + +    /// Remove the specified dead machine basic block from the function, +    /// updating the CFG.      void RemoveDeadBlock(MachineBasicBlock *MBB); +    /// Hoist common instruction sequences at the start of basic blocks to their +    /// common predecessor.      bool HoistCommonCode(MachineFunction &MF); + +    /// If the successors of MBB has common instruction sequence at the start of +    /// the function, move the instructions before MBB terminator if it's legal.      bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);    };  } diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp index 8b27570a17f4..7af136941661 100644 --- a/lib/CodeGen/BranchRelaxation.cpp +++ b/lib/CodeGen/BranchRelaxation.cpp @@ -126,14 +126,16 @@ void BranchRelaxation::verify() {  #endif  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  /// print block size and offset information - debugging -void BranchRelaxation::dumpBBs() { +LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() {    for (auto &MBB : *MF) {      const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];      dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)             << format("size=%#x\n", BBI.Size);    }  } +#endif  /// scanFunction - Do the initial scan of the function, building up  /// information about each block. diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp index ff7c99de0420..e4eab8c513d9 100644 --- a/lib/CodeGen/BuiltinGCs.cpp +++ b/lib/CodeGen/BuiltinGCs.cpp @@ -1,4 +1,4 @@ -//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===// +//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,6 +14,8 @@  #include "llvm/CodeGen/GCs.h"  #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Casting.h"  using namespace llvm; @@ -77,6 +79,7 @@ public:      UsesMetadata = false;      CustomRoots = false;    } +    Optional<bool> isGCManagedPointer(const Type *Ty) const override {      // Method is only valid on pointer typed values.      const PointerType *PT = cast<PointerType>(Ty); @@ -110,6 +113,7 @@ public:      UsesMetadata = false;      CustomRoots = false;    } +    Optional<bool> isGCManagedPointer(const Type *Ty) const override {      // Method is only valid on pointer typed values.      const PointerType *PT = cast<PointerType>(Ty); @@ -117,7 +121,8 @@ public:      return (1 == PT->getAddressSpace());    }  }; -} + +} // end anonymous namespace  // Register all the above so that they can be found at runtime.  
Note that  // these static initializers are important since the registration list is diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 398ea88363b6..0912d9f68aff 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(LLVMCodeGen    Analysis.cpp    AtomicExpandPass.cpp    BasicTargetTransformInfo.cpp +  BranchCoalescing.cpp    BranchFolding.cpp    BranchRelaxation.cpp    BuiltinGCs.cpp @@ -23,6 +24,7 @@ add_llvm_library(LLVMCodeGen    ExpandISelPseudos.cpp    ExpandPostRAPseudos.cpp    FaultMaps.cpp +  FEntryInserter.cpp    FuncletLayout.cpp    GCMetadata.cpp    GCMetadataPrinter.cpp @@ -36,6 +38,7 @@ add_llvm_library(LLVMCodeGen    InterleavedAccessPass.cpp    IntrinsicLowering.cpp    LatencyPriorityQueue.cpp +  LazyMachineBlockFrequencyInfo.cpp    LexicalScopes.cpp    LiveDebugValues.cpp    LiveDebugVariables.cpp @@ -46,6 +49,7 @@ add_llvm_library(LLVMCodeGen    LiveRangeCalc.cpp    LiveRangeEdit.cpp    LiveRegMatrix.cpp +  LiveRegUnits.cpp    LiveStackAnalysis.cpp    LiveVariables.cpp    LLVMTargetMachine.cpp @@ -70,6 +74,8 @@ add_llvm_library(LLVMCodeGen    MachineLoopInfo.cpp    MachineModuleInfo.cpp    MachineModuleInfoImpls.cpp +  MachineOptimizationRemarkEmitter.cpp +  MachineOutliner.cpp    MachinePassRegistry.cpp    MachinePipeliner.cpp    MachinePostDominators.cpp @@ -147,7 +153,7 @@ add_llvm_library(LLVMCodeGen    ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen    ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP -  LINK_LIBS ${PTHREAD_LIB} +  LINK_LIBS ${LLVM_PTHREAD_LIB}    DEPENDS    intrinsics_gen diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 2e33f14c7ee3..7cad4d031169 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -30,8 +30,7 @@ using namespace llvm;  CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,                   SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)      : CallingConv(CC), IsVarArg(isVarArg), MF(mf), -      TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C), -      CallOrPrologue(Unknown) { +      TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) {    // No stack is used.    StackOffset = 0;    MaxStackArgAlign = 1; diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 4cf9b138f10d..3fc12ccc3b60 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -21,6 +21,7 @@ using namespace llvm;  /// initializeCodeGen - Initialize all passes linked into the CodeGen library.  
void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeAtomicExpandPass(Registry); +  initializeBranchCoalescingPass(Registry);    initializeBranchFolderPassPass(Registry);    initializeBranchRelaxationPass(Registry);    initializeCodeGenPreparePass(Registry); @@ -31,12 +32,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeEarlyIfConverterPass(Registry);    initializeExpandISelPseudosPass(Registry);    initializeExpandPostRAPass(Registry); +  initializeFEntryInserterPass(Registry);    initializeFinalizeMachineBundlesPass(Registry);    initializeFuncletLayoutPass(Registry);    initializeGCMachineCodeAnalysisPass(Registry);    initializeGCModuleInfoPass(Registry);    initializeIfConverterPass(Registry); +  initializeImplicitNullChecksPass(Registry);    initializeInterleavedAccessPass(Registry); +  initializeLiveDebugValuesPass(Registry);    initializeLiveDebugVariablesPass(Registry);    initializeLiveIntervalsPass(Registry);    initializeLiveStacksPass(Registry); @@ -47,7 +51,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeMachineBlockPlacementPass(Registry);    initializeMachineBlockPlacementStatsPass(Registry);    initializeMachineCSEPass(Registry); -  initializeImplicitNullChecksPass(Registry);    initializeMachineCombinerPass(Registry);    initializeMachineCopyPropagationPass(Registry);    initializeMachineDominatorTreePass(Registry); @@ -55,16 +58,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeMachineLICMPass(Registry);    initializeMachineLoopInfoPass(Registry);    initializeMachineModuleInfoPass(Registry); +  initializeMachineOptimizationRemarkEmitterPassPass(Registry); +  initializeMachineOutlinerPass(Registry);    initializeMachinePipelinerPass(Registry);    initializeMachinePostDominatorTreePass(Registry); +  initializeMachineRegionInfoPassPass(Registry);    initializeMachineSchedulerPass(Registry);    initializeMachineSinkingPass(Registry);    initializeMachineVerifierPassPass(Registry); -  initializeXRayInstrumentationPass(Registry); -  initializePatchableFunctionPass(Registry);    initializeOptimizePHIsPass(Registry);    initializePEIPass(Registry);    initializePHIEliminationPass(Registry); +  initializePatchableFunctionPass(Registry);    initializePeepholeOptimizerPass(Registry);    initializePostMachineSchedulerPass(Registry);    initializePostRAHazardRecognizerPass(Registry); @@ -74,12 +79,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeRAGreedyPass(Registry);    initializeRegisterCoalescerPass(Registry);    initializeRenameIndependentSubregsPass(Registry); +  initializeSafeStackPass(Registry);    initializeShrinkWrapPass(Registry);    initializeSlotIndexesPass(Registry);    initializeStackColoringPass(Registry);    initializeStackMapLivenessPass(Registry); -  initializeLiveDebugValuesPass(Registry); -  initializeSafeStackPass(Registry);    initializeStackProtectorPass(Registry);    initializeStackSlotColoringPass(Registry);    initializeTailDuplicatePassPass(Registry); @@ -91,6 +95,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {    initializeVirtRegMapPass(Registry);    initializeVirtRegRewriterPass(Registry);    initializeWinEHPreparePass(Registry); +  initializeXRayInstrumentationPass(Registry);  }  void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 934b470f13b5..2bdd189557b4 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -15,10 
+15,12 @@  #include "llvm/CodeGen/Passes.h"  #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/BlockFrequencyInfo.h"  #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -53,8 +55,10 @@  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/BuildLibCalls.h"  #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/Transforms/Utils/Cloning.h"  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/ValueMapper.h"  using namespace llvm;  using namespace llvm::PatternMatch; @@ -77,7 +81,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");  STATISTIC(NumRetsDup,    "Number of return instructions duplicated");  STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");  STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); -STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");  STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");  static cl::opt<bool> DisableBranchOpts( @@ -93,7 +96,7 @@ static cl::opt<bool> DisableSelectToBranch(    cl::desc("Disable select to branch conversion."));  static cl::opt<bool> AddrSinkUsingGEPs( -  "addr-sink-using-gep", cl::Hidden, cl::init(false), +  "addr-sink-using-gep", cl::Hidden, cl::init(true),    cl::desc("Address sinking in CGP using GEPs."));  static cl::opt<bool> EnableAndCmpSinking( @@ -135,15 +138,24 @@ static cl::opt<bool> ForceSplitStore(      "force-split-store", cl::Hidden, cl::init(false),      cl::desc("Force store splitting no matter what the target query says.")); +static cl::opt<bool> +EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, +    cl::desc("Enable merging of redundant sexts when one is dominating" +    " the other."), cl::init(true)); +  namespace {  typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;  typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;  typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +typedef SmallVector<Instruction *, 16> SExts; +typedef DenseMap<Value *, SExts> ValueToSExts;  class TypePromotionTransaction;    class CodeGenPrepare : public FunctionPass {      const TargetMachine *TM; +    const TargetSubtargetInfo *SubtargetInfo;      const TargetLowering *TLI; +    const TargetRegisterInfo *TRI;      const TargetTransformInfo *TTI;      const TargetLibraryInfo *TLInfo;      const LoopInfo *LI; @@ -165,6 +177,15 @@ class TypePromotionTransaction;      /// promotion for the current function.      InstrToOrigTy PromotedInsts; +    /// Keep track of instructions removed during promotion. +    SetOfInstrs RemovedInsts; + +    /// Keep track of sext chains based on their initial value. +    DenseMap<Value *, Instruction *> SeenChainsForSExt; + +    /// Keep track of SExt promoted. +    ValueToSExts ValToSExtendedUses; +      /// True if CFG is modified in any way.      
bool ModifiedDT; @@ -206,7 +227,7 @@ class TypePromotionTransaction;                              Type *AccessTy, unsigned AS);      bool optimizeInlineAsmInst(CallInst *CS);      bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); -    bool moveExtToFormExtLoad(Instruction *&I); +    bool optimizeExt(Instruction *&I);      bool optimizeExtUses(Instruction *I);      bool optimizeLoadExt(LoadInst *I);      bool optimizeSelectInst(SelectInst *SI); @@ -215,13 +236,21 @@ class TypePromotionTransaction;      bool optimizeExtractElementInst(Instruction *Inst);      bool dupRetToEnableTailCallOpts(BasicBlock *BB);      bool placeDbgValues(Function &F); -    bool sinkAndCmp(Function &F); -    bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, -                        Instruction *&Inst, -                        const SmallVectorImpl<Instruction *> &Exts, -                        unsigned CreatedInstCost); +    bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, +                      LoadInst *&LI, Instruction *&Inst, bool HasPromoted); +    bool tryToPromoteExts(TypePromotionTransaction &TPT, +                          const SmallVectorImpl<Instruction *> &Exts, +                          SmallVectorImpl<Instruction *> &ProfitablyMovedExts, +                          unsigned CreatedInstsCost = 0); +    bool mergeSExts(Function &F); +    bool performAddressTypePromotion( +        Instruction *&Inst, +        bool AllowPromotionWithoutCommonHeader, +        bool HasPromoted, TypePromotionTransaction &TPT, +        SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);      bool splitBranchCondition(Function &F);      bool simplifyOffsetableRelocate(Instruction &I); +    bool splitIndirectCriticalEdges(Function &F);    };  } @@ -250,8 +279,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    BPI.reset();    ModifiedDT = false; -  if (TM) -    TLI = TM->getSubtargetImpl(F)->getTargetLowering(); +  if (TM) { +    SubtargetInfo = TM->getSubtargetImpl(F); +    TLI = SubtargetInfo->getTargetLowering(); +    TRI = SubtargetInfo->getRegisterInfo(); +  }    TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -260,9 +292,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    if (ProfileGuidedSectionPrefix) {      ProfileSummaryInfo *PSI =          getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); -    if (PSI->isFunctionEntryHot(&F)) +    if (PSI->isFunctionHotInCallGraph(&F))        F.setSectionPrefix(".hot"); -    else if (PSI->isFunctionEntryCold(&F)) +    else if (PSI->isFunctionColdInCallGraph(&F))        F.setSectionPrefix(".cold");    } @@ -290,18 +322,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    // find a node corresponding to the value.    EverMadeChange |= placeDbgValues(F); -  // If there is a mask, compare against zero, and branch that can be combined -  // into a single target instruction, push the mask and compare into branch -  // users. Do this before OptimizeBlock -> OptimizeInst -> -  // OptimizeCmpExpression, which perturbs the pattern being searched for. -  if (!DisableBranchOpts) { -    EverMadeChange |= sinkAndCmp(F); +  if (!DisableBranchOpts)      EverMadeChange |= splitBranchCondition(F); -  } + +  // Split some critical edges where one of the sources is an indirect branch, +  // to help generate sane code for PHIs involving such edges. 
+  EverMadeChange |= splitIndirectCriticalEdges(F);    bool MadeChange = true;    while (MadeChange) {      MadeChange = false; +    SeenChainsForSExt.clear(); +    ValToSExtendedUses.clear(); +    RemovedInsts.clear();      for (Function::iterator I = F.begin(); I != F.end(); ) {        BasicBlock *BB = &*I++;        bool ModifiedDTOnIteration = false; @@ -311,6 +344,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {        if (ModifiedDTOnIteration)          break;      } +    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) +      MadeChange |= mergeSExts(F); + +    // Really free removed instructions during promotion. +    for (Instruction *I : RemovedInsts) +      delete I; +      EverMadeChange |= MadeChange;    } @@ -432,6 +472,154 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {    return DestBB;  } +// Return the unique indirectbr predecessor of a block. This may return null +// even if such a predecessor exists, if it's not useful for splitting. +// If a predecessor is found, OtherPreds will contain all other (non-indirectbr) +// predecessors of BB. +static BasicBlock * +findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { +  // If the block doesn't have any PHIs, we don't care about it, since there's +  // no point in splitting it. +  PHINode *PN = dyn_cast<PHINode>(BB->begin()); +  if (!PN) +    return nullptr; + +  // Verify we have exactly one IBR predecessor. +  // Conservatively bail out if one of the other predecessors is not a "regular" +  // terminator (that is, not a switch or a br). +  BasicBlock *IBB = nullptr; +  for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { +    BasicBlock *PredBB = PN->getIncomingBlock(Pred); +    TerminatorInst *PredTerm = PredBB->getTerminator(); +    switch (PredTerm->getOpcode()) { +    case Instruction::IndirectBr: +      if (IBB) +        return nullptr; +      IBB = PredBB; +      break; +    case Instruction::Br: +    case Instruction::Switch: +      OtherPreds.push_back(PredBB); +      continue; +    default: +      return nullptr; +    } +  } + +  return IBB; +} + +// Split critical edges where the source of the edge is an indirectbr +// instruction. This isn't always possible, but we can handle some easy cases. +// This is useful because MI is unable to split such critical edges, +// which means it will not be able to sink instructions along those edges. +// This is especially painful for indirect branches with many successors, where +// we end up having to prepare all outgoing values in the origin block. +// +// Our normal algorithm for splitting critical edges requires us to update +// the outgoing edges of the edge origin block, but for an indirectbr this +// is hard, since it would require finding and updating the block addresses +// the indirect branch uses. But if a block only has a single indirectbr +// predecessor, with the others being regular branches, we can do it in a +// different way. +// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr. +// We can split D into D0 and D1, where D0 contains only the PHIs from D, +// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and +// create the following structure: +// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1 +bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) { +  // Check whether the function has any indirectbrs, and collect which blocks +  // they may jump to. 
Since most functions don't have indirect branches, +  // this lowers the common case's overhead to O(Blocks) instead of O(Edges). +  SmallSetVector<BasicBlock *, 16> Targets; +  for (auto &BB : F) { +    auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator()); +    if (!IBI) +      continue; + +    for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ) +      Targets.insert(IBI->getSuccessor(Succ)); +  } + +  if (Targets.empty()) +    return false; + +  bool Changed = false; +  for (BasicBlock *Target : Targets) { +    SmallVector<BasicBlock *, 16> OtherPreds; +    BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); +    // If we did not find an indirectbr, or the indirectbr is the only +    // incoming edge, this isn't the kind of edge we're looking for. +    if (!IBRPred || OtherPreds.empty()) +      continue; + +    // Don't even think about ehpads/landingpads. +    Instruction *FirstNonPHI = Target->getFirstNonPHI(); +    if (FirstNonPHI->isEHPad() || Target->isLandingPad()) +      continue; + +    BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); +    // It's possible Target was its own successor through an indirectbr. +    // In this case, the indirectbr now comes from BodyBlock. +    if (IBRPred == Target) +      IBRPred = BodyBlock; + +    // At this point Target only has PHIs, and BodyBlock has the rest of the +    // block's body. Create a copy of Target that will be used by the "direct" +    // preds. +    ValueToValueMapTy VMap; +    BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F); + +    for (BasicBlock *Pred : OtherPreds) +      Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + +    // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that +    // they are clones, so the number of PHIs is the same. +    // (a) Remove the edge coming from IBRPred from the "Direct" PHI +    // (b) Leave that as the only edge in the "Indirect" PHI. +    // (c) Merge the two in the body block. +    BasicBlock::iterator Indirect = Target->begin(), +                         End = Target->getFirstNonPHI()->getIterator(); +    BasicBlock::iterator Direct = DirectSucc->begin(); +    BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt(); + +    assert(&*End == Target->getTerminator() && +           "Block was expected to only contain PHIs"); + +    while (Indirect != End) { +      PHINode *DirPHI = cast<PHINode>(Direct); +      PHINode *IndPHI = cast<PHINode>(Indirect); + +      // Now, clean up - the direct block shouldn't get the indirect value, +      // and vice versa. +      DirPHI->removeIncomingValue(IBRPred); +      Direct++; + +      // Advance the pointer here, to avoid invalidation issues when the old +      // PHI is erased. +      Indirect++; + +      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI); +      NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred), +                             IBRPred); + +      // Create a PHI in the body block, to merge the direct and indirect +      // predecessors.
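The block structure this builds can be sketched outside of LLVM as a plain graph rewrite. Below is a minimal standalone C++ illustration, using the hypothetical block names A, B, I, D0A, D0B and D1 from the comment above rather than real IR; it only models successor lists, not PHIs.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Toy CFG: block name -> successor names. A and B branch directly to D;
  // I reaches D through an indirectbr whose destinations cannot be rewritten.
  std::map<std::string, std::vector<std::string>> CFG = {
      {"A", {"D"}}, {"B", {"D"}}, {"I", {"D"}}, {"D", {}}};

  // Split D: D0B keeps the identity the indirect branch already targets and
  // holds only the PHIs; D0A is a clone of that PHI-only head for the direct
  // predecessors; D1 is the old body of D. Both heads fall through to D1.
  CFG["D0A"] = {"D1"};
  CFG["D0B"] = {"D1"};
  CFG["D1"] = {};
  CFG.erase("D");
  for (auto &S : CFG["A"])
    if (S == "D") S = "D0A";
  for (auto &S : CFG["B"])
    if (S == "D") S = "D0A";
  for (auto &S : CFG["I"]) // the indirectbr itself is never retargeted
    if (S == "D") S = "D0B";

  for (const auto &KV : CFG) {
    std::cout << KV.first << " ->";
    for (const auto &S : KV.second) std::cout << ' ' << S;
    std::cout << '\n';
  }
  return 0;
}

The essential constraint shows up in the last rewrite loop: only ordinary branch successors are edited, while the block the indirect branch already names merely changes shape.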
+      PHINode *MergePHI = +          PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert); +      MergePHI->addIncoming(NewIndPHI, Target); +      MergePHI->addIncoming(DirPHI, DirectSucc); + +      IndPHI->replaceAllUsesWith(MergePHI); +      IndPHI->eraseFromParent(); +    } + +    Changed = true; +  } + +  return Changed; +} +  /// Eliminate blocks that contain only PHI nodes, debug info directives, and an  /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split  /// edges in ways that are non-optimal for isel. Start by eliminating these @@ -1090,6 +1278,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {    return false;  } +/// Duplicate and sink the given 'and' instruction into user blocks where it is +/// used in a compare to allow isel to generate better code for targets where +/// this operation can be combined. +/// +/// Return true if any changes are made. +static bool sinkAndCmp0Expression(Instruction *AndI, +                                  const TargetLowering &TLI, +                                  SetOfInstrs &InsertedInsts) { +  // Double-check that we're not trying to optimize an instruction that was +  // already optimized by some other part of this pass. +  assert(!InsertedInsts.count(AndI) && +         "Attempting to optimize already optimized and instruction"); +  (void) InsertedInsts; + +  // Nothing to do for single use in same basic block. +  if (AndI->hasOneUse() && +      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) +    return false; + +  // Try to avoid cases where sinking/duplicating is likely to increase register +  // pressure. +  if (!isa<ConstantInt>(AndI->getOperand(0)) && +      !isa<ConstantInt>(AndI->getOperand(1)) && +      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) +    return false; + +  for (auto *U : AndI->users()) { +    Instruction *User = cast<Instruction>(U); + +    // Only sink for and mask feeding icmp with 0. +    if (!isa<ICmpInst>(User)) +      return false; + +    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); +    if (!CmpC || !CmpC->isZero()) +      return false; +  } + +  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) +    return false; + +  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); +  DEBUG(AndI->getParent()->dump()); + +  // Push the 'and' into the same block as the icmp 0.  There should only be +  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any +  // others, so we don't need to keep track of which BBs we insert into. +  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); +       UI != E; ) { +    Use &TheUse = UI.getUse(); +    Instruction *User = cast<Instruction>(*UI); + +    // Preincrement use iterator so we don't invalidate it. +    ++UI; + +    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); + +    // Keep the 'and' in the same place if the use is already in the same block. +    Instruction *InsertPt = +        User->getParent() == AndI->getParent() ? AndI : User; +    Instruction *InsertedAnd = +        BinaryOperator::Create(Instruction::And, AndI->getOperand(0), +                               AndI->getOperand(1), "", InsertPt); +    // Propagate the debug info. +    InsertedAnd->setDebugLoc(AndI->getDebugLoc()); + +    // Replace a use of the 'and' with a use of the new 'and'. +    TheUse = InsertedAnd; +    ++NumAndUses; +    DEBUG(User->getParent()->dump()); +  } + +  // We removed all uses, nuke the and. 
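The legality and profitability test used by sinkAndCmp0Expression can be restated compactly over a toy use model. This is a hedged sketch of the same conditions, not LLVM code; the struct and helper name are made up for illustration.

#include <vector>

// Toy use model: each user of the 'and' is either a compare against some
// constant, or something else entirely.
struct ToyUse {
  bool IsICmp;
  long long CmpConstant;
};

// Duplicate the 'and' next to each compare only when every user is a compare
// against zero, and at least one operand is a constant or already has other
// uses, so the duplication does not obviously lengthen live ranges.
static bool worthSinkingMaskCmp0(bool Op0IsConst, bool Op1IsConst,
                                 bool Op0HasOtherUses, bool Op1HasOtherUses,
                                 const std::vector<ToyUse> &Users) {
  if (!Op0IsConst && !Op1IsConst && !Op0HasOtherUses && !Op1HasOtherUses)
    return false;
  for (const ToyUse &U : Users)
    if (!U.IsICmp || U.CmpConstant != 0)
      return false;
  return true;
}

int main() {
  std::vector<ToyUse> Users = {{true, 0}, {true, 0}};
  return worthSinkingMaskCmp0(false, true, false, false, Users) ? 0 : 1;
}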
+  AndI->eraseFromParent(); +  return true; +} +  /// Check if the candidates could be combined with a shift instruction, which  /// includes:  /// 1. Truncate instruction @@ -2028,16 +2293,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {      }      if (TLI) { -      // Unknown address space. -      // TODO: Target hook to pick which address space the intrinsic cares -      // about? -      unsigned AddrSpace = ~0u;        SmallVector<Value*, 2> PtrOps;        Type *AccessTy; -      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) -        while (!PtrOps.empty()) -          if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) +      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) +        while (!PtrOps.empty()) { +          Value *PtrVal = PtrOps.pop_back_val(); +          unsigned AS = PtrVal->getType()->getPointerAddressSpace(); +          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))              return true; +        }      }    } @@ -2168,11 +2432,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {      // Conservatively require the attributes of the call to match those of the      // return. Ignore noalias because it doesn't affect the call sequence. -    AttributeSet CalleeAttrs = CS.getAttributes(); -    if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). -          removeAttribute(Attribute::NoAlias) != -        AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). -          removeAttribute(Attribute::NoAlias)) +    AttributeList CalleeAttrs = CS.getAttributes(); +    if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) +            .removeAttribute(Attribute::NoAlias) != +        AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) +            .removeAttribute(Attribute::NoAlias))        continue;      // Make sure the call instruction is followed by an unconditional branch to @@ -2561,25 +2825,30 @@ class TypePromotionTransaction {      OperandsHider Hider;      /// Keep track of the uses replaced, if any.      UsesReplacer *Replacer; +    /// Keep track of instructions removed. +    SetOfInstrs &RemovedInsts;    public:      /// \brief Remove all reference of \p Inst and optinally replace all its      /// uses with New. +    /// \p RemovedInsts Keep track of the instructions removed by this Action.      /// \pre If !Inst->use_empty(), then New != nullptr -    InstructionRemover(Instruction *Inst, Value *New = nullptr) +    InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, +                       Value *New = nullptr)          : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), -          Replacer(nullptr) { +          Replacer(nullptr), RemovedInsts(RemovedInsts) {        if (New)          Replacer = new UsesReplacer(Inst, New);        DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); +      RemovedInsts.insert(Inst); +      /// The instructions removed here will be freed after completing +      /// optimizeBlock() for all blocks as we need to keep track of the +      /// removed instructions during promotion.        Inst->removeFromParent();      }      ~InstructionRemover() override { delete Replacer; } -    /// \brief Really remove the instruction. -    void commit() override { delete Inst; } -      /// \brief Resurrect the instruction and reassign it to the proper uses if      /// new value was provided when build this action.      
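A standalone sketch of the ownership scheme that RemovedInsts introduces may help: a remove action detaches an object and records it, undo re-attaches it, and the memory is freed only once the whole round is finished, so bookkeeping built during the round (such as the sext-chain maps) can still inspect removed instructions. Toy types below, not the LLVM classes.

#include <set>
#include <string>

struct ToyInst {
  std::string Name;
  bool InBlock;
};

class RemoveAction {
  ToyInst *Inst;
  std::set<ToyInst *> &Removed;

public:
  RemoveAction(ToyInst *I, std::set<ToyInst *> &R) : Inst(I), Removed(R) {
    Inst->InBlock = false; // detach only; the memory stays valid
    Removed.insert(Inst);
  }
  void undo() {
    Inst->InBlock = true; // rollback: re-attach and stop tracking
    Removed.erase(Inst);
  }
};

int main() {
  std::set<ToyInst *> Removed;
  ToyInst *A = new ToyInst{"a", true};
  RemoveAction Remove(A, Removed); // the transaction "removes" A
  // The transaction is committed, so undo() is never called; bookkeeping that
  // was built while A was attached can still dereference it safely here.
  for (ToyInst *I : Removed) // freed only at the end of the whole round
    delete I;
  return 0;
}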
void undo() override { @@ -2588,6 +2857,7 @@ class TypePromotionTransaction {        if (Replacer)          Replacer->undo();        Hider.undo(); +      RemovedInsts.erase(Inst);      }    }; @@ -2596,6 +2866,10 @@ public:    /// The restoration point is a pointer to an action instead of an iterator    /// because the iterator may be invalidated but not the pointer.    typedef const TypePromotionAction *ConstRestorationPt; + +  TypePromotionTransaction(SetOfInstrs &RemovedInsts) +      : RemovedInsts(RemovedInsts) {} +    /// Advocate every changes made in that transaction.    void commit();    /// Undo all the changes made after the given point. @@ -2627,6 +2901,7 @@ private:    /// The ordered list of actions made so far.    SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;    typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt; +  SetOfInstrs &RemovedInsts;  };  void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, @@ -2638,7 +2913,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,  void TypePromotionTransaction::eraseInstruction(Instruction *Inst,                                                  Value *NewVal) {    Actions.push_back( -      make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal)); +      make_unique<TypePromotionTransaction::InstructionRemover>(Inst, +                                                         RemovedInsts, NewVal));  }  void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, @@ -2705,8 +2981,8 @@ void TypePromotionTransaction::rollback(  /// This encapsulates the logic for matching the target-legal addressing modes.  class AddressingModeMatcher {    SmallVectorImpl<Instruction*> &AddrModeInsts; -  const TargetMachine &TM;    const TargetLowering &TLI; +  const TargetRegisterInfo &TRI;    const DataLayout &DL;    /// AccessTy/MemoryInst - This is the type for the access (e.g. 
double) and @@ -2731,14 +3007,14 @@ class AddressingModeMatcher {    bool IgnoreProfitability;    AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, -                        const TargetMachine &TM, Type *AT, unsigned AS, +                        const TargetLowering &TLI, +                        const TargetRegisterInfo &TRI, +                        Type *AT, unsigned AS,                          Instruction *MI, ExtAddrMode &AM,                          const SetOfInstrs &InsertedInsts,                          InstrToOrigTy &PromotedInsts,                          TypePromotionTransaction &TPT) -      : AddrModeInsts(AMI), TM(TM), -        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) -                 ->getTargetLowering()), +      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),          DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),          MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),          PromotedInsts(PromotedInsts), TPT(TPT) { @@ -2756,13 +3032,15 @@ public:    static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,                             Instruction *MemoryInst,                             SmallVectorImpl<Instruction*> &AddrModeInsts, -                           const TargetMachine &TM, +                           const TargetLowering &TLI, +                           const TargetRegisterInfo &TRI,                             const SetOfInstrs &InsertedInsts,                             InstrToOrigTy &PromotedInsts,                             TypePromotionTransaction &TPT) {      ExtAddrMode Result; -    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, +    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, +                                         AccessTy, AS,                                           MemoryInst, Result, InsertedInsts,                                           PromotedInsts, TPT).matchAddr(V, 0);      (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -3583,18 +3861,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {  /// Check to see if all uses of OpVal by the specified inline asm call are due  /// to memory operands. If so, return true, otherwise return false.  static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, -                                    const TargetMachine &TM) { +                                    const TargetLowering &TLI, +                                    const TargetRegisterInfo &TRI) {    const Function *F = CI->getParent()->getParent(); -  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); -  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();    TargetLowering::AsmOperandInfoVector TargetConstraints = -      TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, +      TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,                              ImmutableCallSite(CI)); +    for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {      TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];      // Compute the constraint code and ConstraintType to use. -    TLI->ComputeConstraintToUse(OpInfo, SDValue()); +    TLI.ComputeConstraintToUse(OpInfo, SDValue());      // If this asm operand is our Value*, and if it isn't an indirect memory      // operand, we can't fold it! 
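For context, the matcher that now receives TLI and TRI directly builds an ExtAddrMode incrementally, roughly a decomposition of the address into BaseGV + BaseReg + Scale*ScaledReg + BaseOffs, undoing any fold that would leave the mode illegal for the target. Below is a toy sketch of that fold-or-restore step, with made-up legality rules standing in for the target hooks.

#include <cstdint>

// Toy addressing mode: Base + Scale*Index + Offset (not the LLVM struct).
struct ToyAddrMode {
  bool HasBase = false;
  int64_t Scale = 0;
  int64_t Offset = 0;
};

// Made-up legality rule standing in for the target hook: scales of 0/1/2/4/8
// and signed 12-bit offsets only.
static bool isLegal(const ToyAddrMode &AM) {
  bool ScaleOK = AM.Scale == 0 || AM.Scale == 1 || AM.Scale == 2 ||
                 AM.Scale == 4 || AM.Scale == 8;
  return ScaleOK && AM.Offset >= -2048 && AM.Offset < 2048;
}

// Fold "addr = addr + C" into the running mode if the result stays legal,
// otherwise restore the previous state, mirroring the matcher's use of a
// restoration point.
static bool foldConstantOffset(ToyAddrMode &AM, int64_t C) {
  ToyAddrMode Backup = AM;
  AM.Offset += C;
  if (isLegal(AM))
    return true;
  AM = Backup;
  return false;
}

int main() {
  ToyAddrMode AM;
  AM.HasBase = true;
  bool FoldedSmall = foldConstantOffset(AM, 16);     // stays legal
  bool FoldedHuge = foldConstantOffset(AM, 1 << 20); // too large, rolled back
  return (FoldedSmall && !FoldedHuge) ? 0 : 1;
}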
@@ -3613,7 +3891,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,  static bool FindAllMemoryUses(      Instruction *I,      SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, -    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { +    SmallPtrSetImpl<Instruction *> &ConsideredInsts, +    const TargetLowering &TLI, const TargetRegisterInfo &TRI) {    // If we already considered this instruction, we're done.    if (!ConsideredInsts.insert(I).second)      return false; @@ -3635,11 +3914,28 @@ static bool FindAllMemoryUses(      if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {        unsigned opNo = U.getOperandNo(); -      if (opNo == 0) return true; // Storing addr, not into addr. +      if (opNo != StoreInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr.        MemoryUses.push_back(std::make_pair(SI, opNo));        continue;      } +    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { +      unsigned opNo = U.getOperandNo(); +      if (opNo != AtomicRMWInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr. +      MemoryUses.push_back(std::make_pair(RMW, opNo)); +      continue; +    } + +    if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { +      unsigned opNo = U.getOperandNo(); +      if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr. +      MemoryUses.push_back(std::make_pair(CmpX, opNo)); +      continue; +    } +      if (CallInst *CI = dyn_cast<CallInst>(UserI)) {        // If this is a cold call, we can sink the addressing calculation into        // the cold path.  See optimizeCallInst @@ -3650,12 +3946,12 @@ static bool FindAllMemoryUses(        if (!IA) return true;        // If this is a memory operand, we're cool, otherwise bail out. -      if (!IsOperandAMemoryOperand(CI, IA, I, TM)) +      if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))          return true;        continue;      } -    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) +    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI))        return true;    } @@ -3743,7 +4039,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,    // the use is just a particularly nice way of sinking it.    SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;    SmallPtrSet<Instruction*, 16> ConsideredInsts; -  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) +  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))      return false;  // Has a non-memory, non-foldable use!    
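The memory-use walk above now recognizes atomicrmw and cmpxchg users and compares the use's operand number against the instruction's pointer-operand index instead of assuming index 0, so an address that is itself stored or exchanged as data is treated as escaping. A simplified restatement over a toy use model (assumed types, not LLVM IR):

#include <vector>

// A use of the address candidate is foldable only when the candidate occupies
// the user's pointer-operand slot; otherwise the address escapes.
enum class ToyOpcode { Store, AtomicRMW, CmpXchg, Other };

struct ToyUse {
  ToyOpcode Opc;
  unsigned OperandNo;    // which operand of the user holds the candidate
  unsigned PtrOperandNo; // which operand of the user is the address
};

// Mirrors FindAllMemoryUses returning true on the first escaping use.
static bool anyUseEscapes(const std::vector<ToyUse> &Uses) {
  for (const ToyUse &U : Uses) {
    switch (U.Opc) {
    case ToyOpcode::Store:
    case ToyOpcode::AtomicRMW:
    case ToyOpcode::CmpXchg:
      if (U.OperandNo != U.PtrOperandNo)
        return true; // the address is being stored, not stored to
      break;
    case ToyOpcode::Other:
      return true; // conservatively treat anything else as an escape
    }
  }
  return false;
}

int main() {
  std::vector<ToyUse> Uses = {
      {ToyOpcode::Store, 1, 1},   // candidate is the store's pointer operand
      {ToyOpcode::CmpXchg, 0, 0}, // candidate is the cmpxchg pointer operand
  };
  return anyUseEscapes(Uses) ? 1 : 0;
}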
// Now that we know that all uses of this instruction are part of a chain of @@ -3775,7 +4071,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,      ExtAddrMode Result;      TypePromotionTransaction::ConstRestorationPt LastKnownGood =          TPT.getRestorationPoint(); -    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, +    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, +                                  AddressAccessTy, AS,                                    MemoryInst, Result, InsertedInsts,                                    PromotedInsts, TPT);      Matcher.IgnoreProfitability = true; @@ -3844,7 +4141,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,    bool IsNumUsesConsensusValid = false;    SmallVector<Instruction*, 16> AddrModeInsts;    ExtAddrMode AddrMode; -  TypePromotionTransaction TPT; +  TypePromotionTransaction TPT(RemovedInsts);    TypePromotionTransaction::ConstRestorationPt LastKnownGood =        TPT.getRestorationPoint();    while (!worklist.empty()) { @@ -3869,7 +4166,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,      // addressing instructions might have.      SmallVector<Instruction*, 16> NewAddrModeInsts;      ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( -      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, +      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI,        InsertedInsts, PromotedInsts, TPT);      // This check is broken into two cases with very similar code to avoid using @@ -3935,11 +4232,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,      DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "                   << *MemoryInst << "\n");      if (SunkAddr->getType() != Addr->getType()) -      SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); +      SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());    } else if (AddrSinkUsingGEPs ||               (!AddrSinkUsingGEPs.getNumOccurrences() && TM && -              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) -                  ->useAA())) { +              SubtargetInfo->useAA())) {      // By default, we use the GEP-based method when AA is used later. This      // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4042,7 +4338,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,            // We need to add this separately from the scale above to help with            // SDAG consecutive load/store merging.            
if (ResultPtr->getType() != I8PtrTy) -            ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); +            ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);            ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");          } @@ -4053,12 +4349,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,          SunkAddr = ResultPtr;        } else {          if (ResultPtr->getType() != I8PtrTy) -          ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); +          ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);          SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");        }        if (SunkAddr->getType() != Addr->getType()) -        SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); +        SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());      }    } else {      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4185,14 +4481,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {    return MadeChange;  } -/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// \brief Check if all the uses of \p Val are equivalent (or free) zero or  /// sign extensions. -static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { -  assert(!Inst->use_empty() && "Input must have at least one use"); -  const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); +static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { +  assert(!Val->use_empty() && "Input must have at least one use"); +  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());    bool IsSExt = isa<SExtInst>(FirstUser);    Type *ExtTy = FirstUser->getType(); -  for (const User *U : Inst->users()) { +  for (const User *U : Val->users()) {      const Instruction *UI = cast<Instruction>(U);      if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))        return false; @@ -4202,11 +4498,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {        continue;      // If IsSExt is true, we are in this situation: -    // a = Inst +    // a = Val      // b = sext ty1 a to ty2      // c = sext ty1 a to ty3      // Assuming ty2 is shorter than ty3, this could be turned into: -    // a = Inst +    // a = Val      // b = sext ty1 a to ty2      // c = sext ty2 b to ty3      // However, the last sext is not free. @@ -4233,51 +4529,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {    return true;  } -/// \brief Try to form ExtLd by promoting \p Exts until they reach a -/// load instruction. -/// If an ext(load) can be formed, it is returned via \p LI for the load -/// and \p Inst for the extension. -/// Otherwise LI == nullptr and Inst == nullptr. -/// When some promotion happened, \p TPT contains the proper state to -/// revert them. +/// \brief Try to speculatively promote extensions in \p Exts and continue +/// promoting through newly promoted operands recursively as far as doing so is +/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. +/// When some promotion happened, \p TPT contains the proper state to revert +/// them.  /// -/// \return true when promoting was necessary to expose the ext(load) -/// opportunity, false otherwise. 
-/// -/// Example: -/// \code -/// %ld = load i32* %addr -/// %add = add nuw i32 %ld, 4 -/// %zext = zext i32 %add to i64 -/// \endcode -/// => -/// \code -/// %ld = load i32* %addr -/// %zext = zext i32 %ld to i64 -/// %add = add nuw i64 %zext, 4 -/// \encode -/// Thanks to the promotion, we can match zext(load i32*) to i64. -bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, -                                    LoadInst *&LI, Instruction *&Inst, -                                    const SmallVectorImpl<Instruction *> &Exts, -                                    unsigned CreatedInstsCost = 0) { -  // Iterate over all the extensions to see if one form an ext(load). +/// \return true if some promotion happened, false otherwise. +bool CodeGenPrepare::tryToPromoteExts( +    TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, +    SmallVectorImpl<Instruction *> &ProfitablyMovedExts, +    unsigned CreatedInstsCost) { +  bool Promoted = false; + +  // Iterate over all the extensions to try to promote them.    for (auto I : Exts) { -    // Check if we directly have ext(load). -    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { -      Inst = I; -      // No promotion happened here. -      return false; +    // Early check if we directly have ext(load). +    if (isa<LoadInst>(I->getOperand(0))) { +      ProfitablyMovedExts.push_back(I); +      continue;      } -    // Check whether or not we want to do any promotion. + +    // Check whether or not we want to do any promotion.  The reason we have +    // this check inside the for loop is to catch the case where an extension +    // is directly fed by a load because in such case the extension can be moved +    // up without any promotion on its operands.      if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) -      continue; +      return false; +      // Get the action to perform the promotion. -    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( -        I, InsertedInsts, *TLI, PromotedInsts); +    TypePromotionHelper::Action TPH = +        TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);      // Check if we can promote. -    if (!TPH) +    if (!TPH) { +      // Save the current extension as we cannot move up through its operand. +      ProfitablyMovedExts.push_back(I);        continue; +    } +      // Save the current state.      TypePromotionTransaction::ConstRestorationPt LastKnownGood =          TPT.getRestorationPoint(); @@ -4297,110 +4586,293 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,      // one extension but leave one. However, we optimistically keep going,      // because the new extension may be removed too.      long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; -    TotalCreatedInstsCost -= ExtCost; +    // FIXME: It would be possible to propagate a negative value instead of +    // conservatively ceiling it to 0. +    TotalCreatedInstsCost = +        std::max((long long)0, (TotalCreatedInstsCost - ExtCost));      if (!StressExtLdPromotion &&          (TotalCreatedInstsCost > 1 ||           !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { -      // The promotion is not profitable, rollback to the previous state. +      // This promotion is not profitable, rollback to the previous state, and +      // save the current extension in ProfitablyMovedExts as the latest +      // speculative promotion turned out to be unprofitable.        
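The cost bookkeeping above clamps the running count of newly created instructions to zero and keeps speculating only while at most one extra instruction would remain, rolling back to the last checkpoint otherwise. A hedged standalone sketch of that budget-and-rollback shape (not the LLVM routine; the Step model is invented for illustration):

#include <algorithm>
#include <cstddef>
#include <vector>

// Each speculative step reports how many instructions it creates and how much
// an elided ext would have cost.
struct Step {
  long long Created;
  long long ExtCost;
};

// Keep speculating while the clamped running cost stays within a budget of one
// extra instruction; otherwise undo this step's additions and report failure
// so the caller can fall back to its own checkpoint.
static bool promoteChain(const std::vector<Step> &Chain, std::size_t Idx,
                         long long RunningCost,
                         std::vector<std::size_t> &Promoted) {
  if (Idx == Chain.size())
    return true;
  long long Cost = RunningCost + Chain[Idx].Created - Chain[Idx].ExtCost;
  Cost = std::max(0LL, Cost); // never let savings go negative
  if (Cost > 1)
    return false;
  std::size_t Mark = Promoted.size(); // checkpoint for rollback
  Promoted.push_back(Idx);
  if (promoteChain(Chain, Idx + 1, Cost, Promoted))
    return true;
  Promoted.resize(Mark); // rollback this speculative step
  return false;
}

int main() {
  std::vector<Step> Chain = {{1, 1}, {2, 1}, {1, 1}};
  std::vector<std::size_t> Promoted;
  return promoteChain(Chain, 0, 0, Promoted) ? 0 : 1;
}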
TPT.rollback(LastKnownGood); +      ProfitablyMovedExts.push_back(I); +      continue; +    } +    // Continue promoting NewExts as far as doing so is profitable. +    SmallVector<Instruction *, 2> NewlyMovedExts; +    (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); +    bool NewPromoted = false; +    for (auto ExtInst : NewlyMovedExts) { +      Instruction *MovedExt = cast<Instruction>(ExtInst); +      Value *ExtOperand = MovedExt->getOperand(0); +      // If we have reached a load, we need this extra profitability check +      // as it could potentially be merged into an ext(load). +      if (isa<LoadInst>(ExtOperand) && +          !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || +            (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) +        continue; + +      ProfitablyMovedExts.push_back(MovedExt); +      NewPromoted = true; +    } + +    // If none of the speculative promotions for NewExts is profitable, rollback +    // and save the current extension (I) as the last profitable extension. +    if (!NewPromoted) { +      TPT.rollback(LastKnownGood); +      ProfitablyMovedExts.push_back(I); +      continue; +    }      // The promotion is profitable. -    // Check if it exposes an ext(load). -    (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); -    if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || -               // If we have created a new extension, i.e., now we have two -               // extensions. We must make sure one of them is merged with -               // the load, otherwise we may degrade the code quality. -               (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) -      // Promotion happened. -      return true; -    // If this does not help to expose an ext(load) then, rollback. -    TPT.rollback(LastKnownGood); +    Promoted = true;    } -  // None of the extension can form an ext(load). -  LI = nullptr; -  Inst = nullptr; -  return false; +  return Promoted;  } -/// Move a zext or sext fed by a load into the same basic block as the load, -/// unless conditions are unfavorable. This allows SelectionDAG to fold the -/// extend into the load. -/// \p I[in/out] the extension may be modified during the process if some -/// promotions apply. -/// -bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { -  // ExtLoad formation infrastructure requires TLI to be effective. -  if (!TLI) -    return false; +/// Merging redundant sexts when one dominates the other. +bool CodeGenPrepare::mergeSExts(Function &F) { +  DominatorTree DT(F); +  bool Changed = false; +  for (auto &Entry : ValToSExtendedUses) { +    SExts &Insts = Entry.second; +    SExts CurPts; +    for (Instruction *Inst : Insts) { +      if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || +          Inst->getOperand(0) != Entry.first) +        continue; +      bool inserted = false; +      for (auto &Pt : CurPts) { +        if (DT.dominates(Inst, Pt)) { +          Pt->replaceAllUsesWith(Inst); +          RemovedInsts.insert(Pt); +          Pt->removeFromParent(); +          Pt = Inst; +          inserted = true; +          Changed = true; +          break; +        } +        if (!DT.dominates(Pt, Inst)) +          // Give up if we need to merge in a common dominator as the +          // experiments show it is not profitable.
+          continue; +        Inst->replaceAllUsesWith(Pt); +        RemovedInsts.insert(Inst); +        Inst->removeFromParent(); +        inserted = true; +        Changed = true; +        break; +      } +      if (!inserted) +        CurPts.push_back(Inst); +    } +  } +  return Changed; +} -  // Try to promote a chain of computation if it allows to form -  // an extended load. -  TypePromotionTransaction TPT; -  TypePromotionTransaction::ConstRestorationPt LastKnownGood = -    TPT.getRestorationPoint(); -  SmallVector<Instruction *, 1> Exts; -  Exts.push_back(I); -  // Look for a load being extended. -  LoadInst *LI = nullptr; -  Instruction *OldExt = I; -  bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); -  if (!LI || !I) { -    assert(!HasPromoted && !LI && "If we did not match any load instruction " -                                  "the code must remain the same"); -    I = OldExt; -    return false; +/// Return true if an ext(load) can be formed from an extension in +/// \p MovedExts. +bool CodeGenPrepare::canFormExtLd( +    const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, +    Instruction *&Inst, bool HasPromoted) { +  for (auto *MovedExtInst : MovedExts) { +    if (isa<LoadInst>(MovedExtInst->getOperand(0))) { +      LI = cast<LoadInst>(MovedExtInst->getOperand(0)); +      Inst = MovedExtInst; +      break; +    } +  } +  if (!LI) +    return false;    // If they're already in the same block, there's nothing to do.    // Make the cheap checks first if we did not promote.    // If we promoted, we need to check if it is indeed profitable. -  if (!HasPromoted && LI->getParent() == I->getParent()) +  if (!HasPromoted && LI->getParent() == Inst->getParent())      return false; -  EVT VT = TLI->getValueType(*DL, I->getType()); +  EVT VT = TLI->getValueType(*DL, Inst->getType());    EVT LoadVT = TLI->getValueType(*DL, LI->getType());    // If the load has other users and the truncate is not free, this probably    // isn't worthwhile. -  if (!LI->hasOneUse() && -      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && -      !TLI->isTruncateFree(I->getType(), LI->getType())) { -    I = OldExt; -    TPT.rollback(LastKnownGood); +  if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && +      !TLI->isTruncateFree(Inst->getType(), LI->getType()))      return false; -  }    // Check whether the target supports casts folded into loads.    unsigned LType; -  if (isa<ZExtInst>(I)) +  if (isa<ZExtInst>(Inst))      LType = ISD::ZEXTLOAD;    else { -    assert(isa<SExtInst>(I) && "Unexpected ext type!"); +    assert(isa<SExtInst>(Inst) && "Unexpected ext type!");      LType = ISD::SEXTLOAD;    } -  if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) { -    I = OldExt; -    TPT.rollback(LastKnownGood); + +  return TLI->isLoadExtLegal(LType, VT, LoadVT); +} + +/// Move a zext or sext fed by a load into the same basic block as the load, +/// unless conditions are unfavorable. This allows SelectionDAG to fold the +/// extend into the load. +/// +/// E.g., +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +/// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \endcode +/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which +/// allows us to match zext(load i32*) to i64. +/// +/// Also, try to promote the computations used to obtain a sign extended +/// value used in memory accesses.
+/// E.g., +/// \code +/// a = add nsw i32 b, 3 +/// d = sext i32 a to i64 +/// e = getelementptr ..., i64 d +/// \endcode +/// => +/// \code +/// f = sext i32 b to i64 +/// a = add nsw i64 f, 3 +/// e = getelementptr ..., i64 a +/// \endcode +/// +/// \p Inst[in/out] the extension may be modified during the process if some +/// promotions apply. +bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { +  // ExtLoad formation and address type promotion infrastructure requires TLI to +  // be effective. +  if (!TLI)      return false; + +  bool AllowPromotionWithoutCommonHeader = false; +  /// See if it is an interesting sext operations for the address type +  /// promotion before trying to promote it, e.g., the ones with the right +  /// type and used in memory accesses. +  bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( +      *Inst, AllowPromotionWithoutCommonHeader); +  TypePromotionTransaction TPT(RemovedInsts); +  TypePromotionTransaction::ConstRestorationPt LastKnownGood = +      TPT.getRestorationPoint(); +  SmallVector<Instruction *, 1> Exts; +  SmallVector<Instruction *, 2> SpeculativelyMovedExts; +  Exts.push_back(Inst); + +  bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); + +  // Look for a load being extended. +  LoadInst *LI = nullptr; +  Instruction *ExtFedByLoad; + +  // Try to promote a chain of computation if it allows to form an extended +  // load. +  if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { +    assert(LI && ExtFedByLoad && "Expect a valid load and extension"); +    TPT.commit(); +    // Move the extend into the same block as the load +    ExtFedByLoad->removeFromParent(); +    ExtFedByLoad->insertAfter(LI); +    // CGP does not check if the zext would be speculatively executed when moved +    // to the same basic block as the load. Preserving its original location +    // would pessimize the debugging experience, as well as negatively impact +    // the quality of sample pgo. We don't want to use "line 0" as that has a +    // size cost in the line-table section and logically the zext can be seen as +    // part of the load. Therefore we conservatively reuse the same debug +    // location for the load and the zext. +    ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); +    ++NumExtsMoved; +    Inst = ExtFedByLoad; +    return true;    } -  // Move the extend into the same block as the load, so that SelectionDAG -  // can fold it. -  TPT.commit(); -  I->removeFromParent(); -  I->insertAfter(LI); -  // CGP does not check if the zext would be speculatively executed when moved -  // to the same basic block as the load. Preserving its original location would -  // pessimize the debugging experience, as well as negatively impact the  -  // quality of sample pgo. We don't want to use "line 0" as that has a -  // size cost in the line-table section and logically the zext can be seen as -  // part of the load. Therefore we conservatively reuse the same debug location -  // for the load and the zext. -  I->setDebugLoc(LI->getDebugLoc()); -  ++NumExtsMoved; -  return true; +  // Continue promoting SExts if known as considerable depending on targets. +  if (ATPConsiderable && +      performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, +                                  HasPromoted, TPT, SpeculativelyMovedExts)) +    return true; + +  TPT.rollback(LastKnownGood); +  return false; +} + +// Perform address type promotion if doing so is profitable. 
+// If AllowPromotionWithoutCommonHeader == false, we should find other sext +// instructions that sign extended the same initial value. However, if +// AllowPromotionWithoutCommonHeader == true, we expect that promoting the +// extension is profitable by itself. +bool CodeGenPrepare::performAddressTypePromotion( +    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, +    bool HasPromoted, TypePromotionTransaction &TPT, +    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { +  bool Promoted = false; +  SmallPtrSet<Instruction *, 1> UnhandledExts; +  bool AllSeenFirst = true; +  for (auto I : SpeculativelyMovedExts) { +    Value *HeadOfChain = I->getOperand(0); +    DenseMap<Value *, Instruction *>::iterator AlreadySeen = +        SeenChainsForSExt.find(HeadOfChain); +    // If there is an unhandled SExt which has the same header, try to promote +    // it as well. +    if (AlreadySeen != SeenChainsForSExt.end()) { +      if (AlreadySeen->second != nullptr) +        UnhandledExts.insert(AlreadySeen->second); +      AllSeenFirst = false; +    } +  } + +  if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && +                        SpeculativelyMovedExts.size() == 1)) { +    TPT.commit(); +    if (HasPromoted) +      Promoted = true; +    for (auto I : SpeculativelyMovedExts) { +      Value *HeadOfChain = I->getOperand(0); +      SeenChainsForSExt[HeadOfChain] = nullptr; +      ValToSExtendedUses[HeadOfChain].push_back(I); +    } +    // Update Inst as promotion happened. +    Inst = SpeculativelyMovedExts.pop_back_val(); +  } else { +    // This is the first chain visited from the header, keep the current chain +    // as unhandled. Defer promoting it until we encounter another SExt +    // chain derived from the same header. +    for (auto I : SpeculativelyMovedExts) { +      Value *HeadOfChain = I->getOperand(0); +      SeenChainsForSExt[HeadOfChain] = Inst; +    } +    return false; +  } + +  if (!AllSeenFirst && !UnhandledExts.empty()) +    for (auto VisitedSExt : UnhandledExts) { +      if (RemovedInsts.count(VisitedSExt)) +        continue; +      TypePromotionTransaction TPT(RemovedInsts); +      SmallVector<Instruction *, 1> Exts; +      SmallVector<Instruction *, 2> Chains; +      Exts.push_back(VisitedSExt); +      bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); +      TPT.commit(); +      if (HasPromoted) +        Promoted = true; +      for (auto I : Chains) { +        Value *HeadOfChain = I->getOperand(0); +        // Mark this as handled. +        SeenChainsForSExt[HeadOfChain] = nullptr; +        ValToSExtendedUses[HeadOfChain].push_back(I); +      } +    } +  return Promoted;  }  bool CodeGenPrepare::optimizeExtUses(Instruction *I) { @@ -4534,13 +5006,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {        !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))      return false; -  // Skip loads we've already transformed or have no reason to transform. -  if (Load->hasOneUse()) { -    User *LoadUser = *Load->user_begin(); -    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && -        !dyn_cast<PHINode>(LoadUser)) -      return false; -  } +  // Skip loads we've already transformed. +  if (Load->hasOneUse() && +      InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) +    return false;    // Look at all uses of Load, looking through phis, to determine how many bits    // of the loaded value are needed.
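The SeenChainsForSExt bookkeeping used by performAddressTypePromotion above follows a defer-then-act pattern: the first sext chain rooted at a given value is only recorded, promotion is triggered once a second chain with the same head shows up, and every promoted sext is remembered per head for the later dominance-based merge. A toy sketch of that pattern, with hypothetical string keys standing in for Value pointers:

#include <map>
#include <string>
#include <vector>

struct Promoter {
  std::map<std::string, std::string> SeenChainForHead; // "" means already handled
  std::map<std::string, std::vector<std::string>> PromotedPerHead;

  void visit(const std::string &Head, const std::string &Chain) {
    auto It = SeenChainForHead.find(Head);
    if (It == SeenChainForHead.end()) {
      SeenChainForHead[Head] = Chain; // defer: might never pay off
      return;
    }
    if (!It->second.empty()) {
      PromotedPerHead[Head].push_back(It->second); // promote the deferred one
      It->second.clear();
    }
    PromotedPerHead[Head].push_back(Chain); // and the current one
  }
};

int main() {
  Promoter P;
  P.visit("b", "sext-chain-1"); // only one chain rooted at %b so far: deferred
  P.visit("b", "sext-chain-2"); // second chain rooted at %b: both promoted
  return P.PromotedPerHead["b"].size() == 2 ? 0 : 1;
}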
@@ -4620,7 +5089,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {    //    // Also avoid hoisting if we didn't see any ands with the exact DemandBits    // mask, since these are the only ands that will be removed by isel. -  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || +  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||        WidestAndBits != DemandBits)      return false; @@ -4636,6 +5105,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {    IRBuilder<> Builder(Load->getNextNode());    auto *NewAnd = dyn_cast<Instruction>(        Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); +  // Mark this instruction as "inserted by CGP", so that other +  // optimizations don't touch it. +  InsertedInsts.insert(NewAnd);    // Replace all uses of load with new and (except for the use of load in the    // new and itself). @@ -4985,7 +5457,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {    auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);    ExtInst->insertBefore(SI);    SI->setCondition(ExtInst); -  for (SwitchInst::CaseIt Case : SI->cases()) { +  for (auto Case : SI->cases()) {      APInt NarrowConst = Case.getCaseValue()->getValue();      APInt WideConst = (ExtType == Instruction::ZExt) ?                        NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); @@ -5514,7 +5986,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {                TargetLowering::TypeExpandInteger) {          return SinkCast(CI);        } else { -        bool MadeChange = moveExtToFormExtLoad(I); +        bool MadeChange = optimizeExt(I);          return MadeChange | optimizeExtUses(I);        }      } @@ -5548,8 +6020,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {      return false;    } +  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { +      unsigned AS = RMW->getPointerAddressSpace(); +      return optimizeMemoryInst(I, RMW->getPointerOperand(), +                                RMW->getType(), AS); +  } + +  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { +      unsigned AS = CmpX->getPointerAddressSpace(); +      return optimizeMemoryInst(I, CmpX->getPointerOperand(), +                                CmpX->getCompareOperand()->getType(), AS); +  } +    BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); +  if (BinOp && (BinOp->getOpcode() == Instruction::And) && +      EnableAndCmpSinking && TLI) +    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); +    if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||                  BinOp->getOpcode() == Instruction::LShr)) {      ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); @@ -5679,68 +6167,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {    return MadeChange;  } -// If there is a sequence that branches based on comparing a single bit -// against zero that can be combined into a single instruction, and the -// target supports folding these into a single instruction, sink the -// mask and compare into the branch uses. Do this before OptimizeBlock -> -// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being -// searched for. -bool CodeGenPrepare::sinkAndCmp(Function &F) { -  if (!EnableAndCmpSinking) -    return false; -  if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) -    return false; -  bool MadeChange = false; -  for (BasicBlock &BB : F) { -    // Does this BB end with the following? 
-    //   %andVal = and %val, #single-bit-set -    //   %icmpVal = icmp %andResult, 0 -    //   br i1 %cmpVal label %dest1, label %dest2" -    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator()); -    if (!Brcc || !Brcc->isConditional()) -      continue; -    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0)); -    if (!Cmp || Cmp->getParent() != &BB) -      continue; -    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1)); -    if (!Zero || !Zero->isZero()) -      continue; -    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0)); -    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB) -      continue; -    ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1)); -    if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) -      continue; -    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump()); - -    // Push the "and; icmp" for any users that are conditional branches. -    // Since there can only be one branch use per BB, we don't need to keep -    // track of which BBs we insert into. -    for (Use &TheUse : Cmp->uses()) { -      // Find brcc use. -      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse); -      if (!BrccUser || !BrccUser->isConditional()) -        continue; -      BasicBlock *UserBB = BrccUser->getParent(); -      if (UserBB == &BB) continue; -      DEBUG(dbgs() << "found Brcc use\n"); - -      // Sink the "and; icmp" to use. -      MadeChange = true; -      BinaryOperator *NewAnd = -        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", -                                  BrccUser); -      CmpInst *NewCmp = -        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, -                        "", BrccUser); -      TheUse = NewCmp; -      ++NumAndCmpsMoved; -      DEBUG(BrccUser->getParent()->dump()); -    } -  } -  return MadeChange; -} -  /// \brief Scale down both weights to fit into uint32_t.  static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {    uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; diff --git a/lib/CodeGen/CountingFunctionInserter.cpp b/lib/CodeGen/CountingFunctionInserter.cpp index 1e46a7a99e7e..7f7350f5fb5c 100644 --- a/lib/CodeGen/CountingFunctionInserter.cpp +++ b/lib/CodeGen/CountingFunctionInserter.cpp @@ -41,7 +41,7 @@ namespace {        Type *VoidTy = Type::getVoidTy(F.getContext());        Constant *CountingFn =          F.getParent()->getOrInsertFunction(CountingFunctionName, -                                           VoidTy, nullptr); +                                           VoidTy);        CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt());        return true;      } diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index 5d60c3055456..e1eeddf0816c 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {    // callee-saved register that is not saved in the prolog.    
const MachineFrameInfo &MFI = MF.getFrameInfo();    BitVector Pristine = MFI.getPristineRegs(MF); -  for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { -    if (!IsReturnBlock && !Pristine.test(*I)) continue; +  for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; +       ++I) { +    unsigned Reg = *I; +    if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) +      continue;      for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {        unsigned Reg = *AI;        Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 17c229a216ae..7ac2e5445435 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {      // Start out assuming that reserved registers are live out of this block.      LivePhysRegs = MRI->getReservedRegs(); -    // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not +    // Add live-ins from successors to LivePhysRegs. Normally, physregs are not      // live across blocks, but some targets (x86) can have flags live out of a      // block.      for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp index a7ba694c144d..6f4ea1912cf4 100644 --- a/lib/CodeGen/DetectDeadLanes.cpp +++ b/lib/CodeGen/DetectDeadLanes.cpp @@ -441,7 +441,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {            const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);            CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);            if (CrossCopy) -            DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI); +            DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);          }          if (!CrossCopy) diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index 32c57e3e3705..e272d25047e6 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -6,21 +6,9 @@  // License. See LICENSE.TXT for details.  //  //===----------------------------------------------------------------------===// -// -// This file contains the execution dependency fix pass. -// -// Some X86 SSE instructions like mov, and, or, xor are available in different -// variants for different operand types. These variant instructions are -// equivalent, but on Nehalem and newer cpus there is extra latency -// transferring data between integer and floating point domains.  ARM cores -// have similar issues when they are configured with both VFP and NEON -// pipelines. -// -// This pass changes the variant instructions to minimize domain crossings. -// -//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ExecutionDepsFix.h" +  #include "llvm/ADT/PostOrderIterator.h"  #include "llvm/ADT/iterator_range.h"  #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,193 +23,18 @@  using namespace llvm; -#define DEBUG_TYPE "execution-fix" - -/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track -/// of execution domains. -/// -/// An open DomainValue represents a set of instructions that can still switch -/// execution domain. 
Multiple registers may refer to the same open -/// DomainValue - they will eventually be collapsed to the same execution -/// domain. -/// -/// A collapsed DomainValue represents a single register that has been forced -/// into one of more execution domains. There is a separate collapsed -/// DomainValue for each register, but it may contain multiple execution -/// domains. A register value is initially created in a single execution -/// domain, but if we were forced to pay the penalty of a domain crossing, we -/// keep track of the fact that the register is now available in multiple -/// domains. -namespace { -struct DomainValue { -  // Basic reference counting. -  unsigned Refs; - -  // Bitmask of available domains. For an open DomainValue, it is the still -  // possible domains for collapsing. For a collapsed DomainValue it is the -  // domains where the register is available for free. -  unsigned AvailableDomains; - -  // Pointer to the next DomainValue in a chain.  When two DomainValues are -  // merged, Victim.Next is set to point to Victor, so old DomainValue -  // references can be updated by following the chain. -  DomainValue *Next; - -  // Twiddleable instructions using or defining these registers. -  SmallVector<MachineInstr*, 8> Instrs; - -  // A collapsed DomainValue has no instructions to twiddle - it simply keeps -  // track of the domains where the registers are already available. -  bool isCollapsed() const { return Instrs.empty(); } - -  // Is domain available? -  bool hasDomain(unsigned domain) const { -    assert(domain < -               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && -           "undefined behavior"); -    return AvailableDomains & (1u << domain); -  } - -  // Mark domain as available. -  void addDomain(unsigned domain) { -    AvailableDomains |= 1u << domain; -  } - -  // Restrict to a single domain available. -  void setSingleDomain(unsigned domain) { -    AvailableDomains = 1u << domain; -  } - -  // Return bitmask of domains that are available and in mask. -  unsigned getCommonDomains(unsigned mask) const { -    return AvailableDomains & mask; -  } - -  // First domain available. -  unsigned getFirstDomain() const { -    return countTrailingZeros(AvailableDomains); -  } - -  DomainValue() : Refs(0) { clear(); } - -  // Clear this DomainValue and point to next which has all its data. -  void clear() { -    AvailableDomains = 0; -    Next = nullptr; -    Instrs.clear(); -  } -}; -} - -namespace { -/// Information about a live register. -struct LiveReg { -  /// Value currently in this register, or NULL when no value is being tracked. -  /// This counts as a DomainValue reference. -  DomainValue *Value; - -  /// Instruction that defined this register, relative to the beginning of the -  /// current basic block.  When a LiveReg is used to represent a live-out -  /// register, this value is relative to the end of the basic block, so it -  /// will be a negative number. 
-  int Def; -}; -} // anonymous namespace - -namespace { -class ExeDepsFix : public MachineFunctionPass { -  static char ID; -  SpecificBumpPtrAllocator<DomainValue> Allocator; -  SmallVector<DomainValue*,16> Avail; - -  const TargetRegisterClass *const RC; -  MachineFunction *MF; -  const TargetInstrInfo *TII; -  const TargetRegisterInfo *TRI; -  RegisterClassInfo RegClassInfo; -  std::vector<SmallVector<int, 1>> AliasMap; -  const unsigned NumRegs; -  LiveReg *LiveRegs; -  typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap; -  LiveOutMap LiveOuts; - -  /// List of undefined register reads in this block in forward order. -  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads; - -  /// Storage for register unit liveness. -  LivePhysRegs LiveRegSet; - -  /// Current instruction number. -  /// The first instruction in each basic block is 0. -  int CurInstr; - -  /// True when the current block has a predecessor that hasn't been visited -  /// yet. -  bool SeenUnknownBackEdge; - -public: -  ExeDepsFix(const TargetRegisterClass *rc) -    : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} - -  void getAnalysisUsage(AnalysisUsage &AU) const override { -    AU.setPreservesAll(); -    MachineFunctionPass::getAnalysisUsage(AU); -  } - -  bool runOnMachineFunction(MachineFunction &MF) override; - -  MachineFunctionProperties getRequiredProperties() const override { -    return MachineFunctionProperties().set( -        MachineFunctionProperties::Property::NoVRegs); -  } - -  StringRef getPassName() const override { return "Execution dependency fix"; } - -private: -  iterator_range<SmallVectorImpl<int>::const_iterator> -  regIndices(unsigned Reg) const; - -  // DomainValue allocation. -  DomainValue *alloc(int domain = -1); -  DomainValue *retain(DomainValue *DV) { -    if (DV) ++DV->Refs; -    return DV; -  } -  void release(DomainValue*); -  DomainValue *resolve(DomainValue*&); - -  // LiveRegs manipulations. -  void setLiveReg(int rx, DomainValue *DV); -  void kill(int rx); -  void force(int rx, unsigned domain); -  void collapse(DomainValue *dv, unsigned domain); -  bool merge(DomainValue *A, DomainValue *B); - -  void enterBasicBlock(MachineBasicBlock*); -  void leaveBasicBlock(MachineBasicBlock*); -  void visitInstr(MachineInstr*); -  void processDefs(MachineInstr*, bool Kill); -  void visitSoftInstr(MachineInstr*, unsigned mask); -  void visitHardInstr(MachineInstr*, unsigned domain); -  void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, -                                unsigned Pref); -  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); -  void processUndefReads(MachineBasicBlock*); -}; -} - -char ExeDepsFix::ID = 0; +#define DEBUG_TYPE "execution-deps-fix"  /// Translate TRI register number to a list of indices into our smaller tables  /// of interesting registers.  iterator_range<SmallVectorImpl<int>::const_iterator> -ExeDepsFix::regIndices(unsigned Reg) const { +ExecutionDepsFix::regIndices(unsigned Reg) const {    assert(Reg < AliasMap.size() && "Invalid register");    const auto &Entry = AliasMap[Reg];    return make_range(Entry.begin(), Entry.end());  } -DomainValue *ExeDepsFix::alloc(int domain) { +DomainValue *ExecutionDepsFix::alloc(int domain) {    DomainValue *dv = Avail.empty() ?                        new(Allocator.Allocate()) DomainValue :                        Avail.pop_back_val(); @@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) {  /// Release a reference to DV.  
When the last reference is released,  /// collapse if needed. -void ExeDepsFix::release(DomainValue *DV) { +void ExecutionDepsFix::release(DomainValue *DV) {    while (DV) {      assert(DV->Refs && "Bad DomainValue");      if (--DV->Refs) @@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) {  /// Follow the chain of dead DomainValues until a live DomainValue is reached.  /// Update the referenced pointer when necessary. -DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { +DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) {    DomainValue *DV = DVRef;    if (!DV || !DV->Next)      return DV; @@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {  }  /// Set LiveRegs[rx] = dv, updating reference counts. -void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { +void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) {    assert(unsigned(rx) < NumRegs && "Invalid index");    assert(LiveRegs && "Must enter basic block first."); @@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {  }  // Kill register rx, recycle or collapse any DomainValue. -void ExeDepsFix::kill(int rx) { +void ExecutionDepsFix::kill(int rx) {    assert(unsigned(rx) < NumRegs && "Invalid index");    assert(LiveRegs && "Must enter basic block first.");    if (!LiveRegs[rx].Value) @@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) {  }  /// Force register rx into domain. -void ExeDepsFix::force(int rx, unsigned domain) { +void ExecutionDepsFix::force(int rx, unsigned domain) {    assert(unsigned(rx) < NumRegs && "Invalid index");    assert(LiveRegs && "Must enter basic block first.");    if (DomainValue *dv = LiveRegs[rx].Value) { @@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) {  /// Collapse open DomainValue into given domain. If there are multiple  /// registers using dv, they each get a unique collapsed DomainValue. -void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { +void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) {    assert(dv->hasDomain(domain) && "Cannot collapse");    // Collapse all the instructions. @@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {  }  /// All instructions and registers in B are moved to A, and B is released. -bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { +bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) {    assert(!A->isCollapsed() && "Cannot merge into collapsed");    assert(!B->isCollapsed() && "Cannot merge from collapsed");    if (A == B) @@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {  }  /// Set up LiveRegs by merging predecessor live-out values. -void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { -  // Detect back-edges from predecessors we haven't processed yet. -  SeenUnknownBackEdge = false; - +void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {    // Reset instruction counter in each basic block.    CurInstr = 0; @@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {    // Try to coalesce live-out registers from predecessors.    
for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),         pe = MBB->pred_end(); pi != pe; ++pi) { -    LiveOutMap::const_iterator fi = LiveOuts.find(*pi); -    if (fi == LiveOuts.end()) { -      SeenUnknownBackEdge = true; +    auto fi = MBBInfos.find(*pi); +    assert(fi != MBBInfos.end() && +           "Should have pre-allocated MBBInfos for all MBBs"); +    LiveReg *Incoming = fi->second.OutRegs; +    // Incoming is null if this is a backedge from a BB +    // we haven't processed yet +    if (Incoming == nullptr) {        continue;      } -    assert(fi->second && "Can't have NULL entries");      for (unsigned rx = 0; rx != NumRegs; ++rx) {        // Use the most recent predecessor def for each register. -      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); +      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); -      DomainValue *pdv = resolve(fi->second[rx].Value); +      DomainValue *pdv = resolve(Incoming[rx].Value);        if (!pdv)          continue;        if (!LiveRegs[rx].Value) { @@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {          force(rx, pdv->getFirstDomain());      }    } -  DEBUG(dbgs() << "BB#" << MBB->getNumber() -        << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); +  DEBUG( +      dbgs() << "BB#" << MBB->getNumber() +             << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));  } -void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { +void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {    assert(LiveRegs && "Must enter basic block first."); -  // Save live registers at end of MBB - used by enterBasicBlock(). -  // Also use LiveOuts as a visited set to detect back-edges. -  bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; - -  if (First) { -    // LiveRegs was inserted in LiveOuts.  Adjust all defs to be relative to -    // the end of this block instead of the beginning. -    for (unsigned i = 0, e = NumRegs; i != e; ++i) -      LiveRegs[i].Def -= CurInstr; -  } else { -    // Insertion failed, this must be the second pass. +  LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; +  // Save register clearances at end of MBB - used by enterBasicBlock(). +  MBBInfos[MBB].OutRegs = LiveRegs; + +  // While processing the basic block, we kept `Def` relative to the start +  // of the basic block for convenience. However, future use of this information +  // only cares about the clearance from the end of the block, so adjust +  // everything to be relative to the end of the basic block. +  for (unsigned i = 0, e = NumRegs; i != e; ++i) +    LiveRegs[i].Def -= CurInstr; +  if (OldOutRegs) { +    // This must be the second pass.      // Release all the DomainValues instead of keeping them.      for (unsigned i = 0, e = NumRegs; i != e; ++i) -      release(LiveRegs[i].Value); -    delete[] LiveRegs; +      release(OldOutRegs[i].Value); +    delete[] OldOutRegs;    }    LiveRegs = nullptr;  } -void ExeDepsFix::visitInstr(MachineInstr *MI) { -  if (MI->isDebugValue()) -    return; - +bool ExecutionDepsFix::visitInstr(MachineInstr *MI) {    // Update instructions with explicit execution domains.    std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);    if (DomP.first) { @@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {        visitHardInstr(MI, DomP.first);    } -  // Process defs to track register ages, and kill values clobbered by generic -  // instructions. 
-  processDefs(MI, !DomP.first); +  return !DomP.first;  }  /// \brief Helps avoid false dependencies on undef registers by updating the  /// machine instructions' undef operand to use a register that the instruction  /// is truly dependent on, or use a register with clearance higher than Pref. -void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, -                                          unsigned Pref) { +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, +                                                unsigned OpIdx, unsigned Pref) {    MachineOperand &MO = MI->getOperand(OpIdx);    assert(MO.isUndef() && "Expected undef machine operand"); @@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,    // Update only undef operands that are mapped to one register.    if (AliasMap[OriginalReg].size() != 1) -    return; +    return false;    // Get the undef operand's register class    const TargetRegisterClass *OpRC = @@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,      // We found a true dependency - replace the undef register with the true      // dependency.      MO.setReg(CurrMO.getReg()); -    return; +    return true;    }    // Go over all registers in the register class and find the register with @@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,    // Update the operand if we found a register with better clearance.    if (MaxClearanceReg != OriginalReg)      MO.setReg(MaxClearanceReg); + +  return false;  }  /// \brief Return true to if it makes sense to break dependence on a partial def  /// or undef use. -bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, -                                       unsigned Pref) { +bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, +                                             unsigned Pref) {    unsigned reg = MI->getOperand(OpIdx).getReg();    for (int rx : regIndices(reg)) {      unsigned Clearance = CurInstr - LiveRegs[rx].Def; @@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,        DEBUG(dbgs() << ": Break dependency.\n");        continue;      } -    // The current clearance seems OK, but we may be ignoring a def from a -    // back-edge. -    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { -      DEBUG(dbgs() << ": OK .\n"); -      return false; -    } -    // A def from an unprocessed back-edge may make us break this dependency. -    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); +    DEBUG(dbgs() << ": OK .\n");      return false;    }    return true; @@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,  // If Kill is set, also kill off DomainValues clobbered by the defs.  //  // Also break dependencies on partial defs and undef uses. -void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { +void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, +                                   bool Kill) {    assert(!MI->isDebugValue() && "Won't process debug values");    // Break dependence on undef uses. Do this before updating LiveRegs below.    
unsigned OpNum; -  unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); -  if (Pref) { -    pickBestRegisterForUndef(MI, OpNum, Pref); -    if (shouldBreakDependence(MI, OpNum, Pref)) -      UndefReads.push_back(std::make_pair(MI, OpNum)); +  if (breakDependency) { +    unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); +    if (Pref) { +      bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); +      // We don't need to bother trying to break a dependency if this +      // instruction has a true dependency on that register through another +      // operand - we'll have to wait for it to be available regardless. +      if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) +        UndefReads.push_back(std::make_pair(MI, OpNum)); +    }    }    const MCInstrDesc &MCID = MI->getDesc();    for (unsigned i = 0, @@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {        DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr                     << '\t' << *MI); -      // Check clearance before partial register updates. -      // Call breakDependence before setting LiveRegs[rx].Def. -      unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); -      if (Pref && shouldBreakDependence(MI, i, Pref)) -        TII->breakPartialRegDependency(*MI, i, TRI); +      if (breakDependency) { +        // Check clearance before partial register updates. +        // Call breakDependence before setting LiveRegs[rx].Def. +        unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); +        if (Pref && shouldBreakDependence(MI, i, Pref)) +          TII->breakPartialRegDependency(*MI, i, TRI); +      }        // How many instructions since rx was last written?        LiveRegs[rx].Def = CurInstr; @@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {  /// only do it on demand. Note that the occurrence of undefined register reads  /// that should be broken is very rare, but when they occur we may have many in  /// a single block. -void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { +void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) {    if (UndefReads.empty())      return; @@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {  // A hard instruction only works in one domain. All input registers will be  // forced into that domain. -void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { +void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {    // Collapse all uses.    for (unsigned i = mi->getDesc().getNumDefs(),                  e = mi->getDesc().getNumOperands(); i != e; ++i) { @@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {  }  // A soft instruction can be changed to work in other domains given by mask. -void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { +void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {    // Bitmask of available domains for this instruction after taking collapsed    // operands into account.    
unsigned available = mask; @@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {    }  } -bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { +void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, +                                         bool PrimaryPass) { +  enterBasicBlock(MBB); +  // If this block is not done, it makes little sense to make any decisions +  // based on clearance information. We need to make a second pass anyway, +  // and by then we'll have better information, so we can avoid doing the work +  // to try and break dependencies now. +  bool breakDependency = isBlockDone(MBB); +  for (MachineInstr &MI : *MBB) { +    if (!MI.isDebugValue()) { +      bool Kill = false; +      if (PrimaryPass) +        Kill = visitInstr(&MI); +      processDefs(&MI, breakDependency, Kill); +    } +  } +  if (breakDependency) +    processUndefReads(MBB); +  leaveBasicBlock(MBB); +} + +bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { +  return MBBInfos[MBB].PrimaryCompleted && +         MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && +         MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {    if (skipFunction(*mf.getFunction()))      return false;    MF = &mf; @@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {          AliasMap[*AI].push_back(i);    } +  // Initialize the MBBInfos +  for (auto &MBB : mf) { +    MBBInfo InitialInfo; +    MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); +  } + +  /* +   *  We want to visit every instruction in every basic block in order to update +   *  its execution domain or break any false dependencies. However, for the +   *  dependency breaking, we need to know clearances from all predecessors +   *  (including any backedges). One way to do so would be to do two complete +   *  passes over all basic blocks/instructions, the first for recording +   *  clearances, the second to break the dependencies. However, for functions +   *  without backedges, or functions with a lot of straight-line code, and +   *  a small loop, that would be a lot of unnecessary work (since only the +   *  BBs that are part of the loop require two passes). As an example, +   *  consider the following loop. +   * +   * +   *     PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT +   *           ^                                  | +   *           +----------------------------------+ +   * +   *  The iteration order is as follows: +   *  Naive: PH A B C D A' B' C' D' +   *  Optimized: PH A B C A' B' C' D +   * +   *  Note that we avoid processing D twice, because we can entirely process +   *  the predecessors before getting to D. We call a block that is ready +   *  for its second round of processing `done` (isBlockDone). Once we finish +   *  processing some block, we update the counters in MBBInfos and re-process +   *  any successors that are now done. 
+   */ +    MachineBasicBlock *Entry = &*MF->begin();    ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); -  SmallVector<MachineBasicBlock*, 16> Loops; +  SmallVector<MachineBasicBlock *, 4> Workqueue;    for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator           MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {      MachineBasicBlock *MBB = *MBBI; -    enterBasicBlock(MBB); -    if (SeenUnknownBackEdge) -      Loops.push_back(MBB); -    for (MachineInstr &MI : *MBB) -      visitInstr(&MI); -    processUndefReads(MBB); -    leaveBasicBlock(MBB); +    // N.B: IncomingProcessed and IncomingCompleted were already updated while +    // processing this block's predecessors. +    MBBInfos[MBB].PrimaryCompleted = true; +    MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; +    bool Primary = true; +    Workqueue.push_back(MBB); +    while (!Workqueue.empty()) { +      MachineBasicBlock *ActiveMBB = &*Workqueue.back(); +      Workqueue.pop_back(); +      processBasicBlock(ActiveMBB, Primary); +      bool Done = isBlockDone(ActiveMBB); +      for (auto *Succ : ActiveMBB->successors()) { +        if (!isBlockDone(Succ)) { +          if (Primary) { +            MBBInfos[Succ].IncomingProcessed++; +          } +          if (Done) { +            MBBInfos[Succ].IncomingCompleted++; +          } +          if (isBlockDone(Succ)) { +            Workqueue.push_back(Succ); +          } +        } +      } +      Primary = false; +    }    } -  // Visit all the loop blocks again in order to merge DomainValues from -  // back-edges. -  for (MachineBasicBlock *MBB : Loops) { -    enterBasicBlock(MBB); -    for (MachineInstr &MI : *MBB) -      if (!MI.isDebugValue()) -        processDefs(&MI, false); -    processUndefReads(MBB); -    leaveBasicBlock(MBB); +  // We need to go through again and finalize any blocks that are not done yet. +  // This is possible if blocks have dead predecessors, so we didn't visit them +  // above. +  for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator +           MBBI = RPOT.begin(), +           MBBE = RPOT.end(); +       MBBI != MBBE; ++MBBI) { +    MachineBasicBlock *MBB = *MBBI; +    if (!isBlockDone(MBB)) { +      processBasicBlock(MBB, false); +      // Don't update successors here. We'll get to them anyway through this +      // loop. +    }    }    // Clear the LiveOuts vectors and collapse any remaining DomainValues.    
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator           MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { -    LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); -    if (FI == LiveOuts.end() || !FI->second) +    auto FI = MBBInfos.find(*MBBI); +    if (FI == MBBInfos.end() || !FI->second.OutRegs)        continue;      for (unsigned i = 0, e = NumRegs; i != e; ++i) -      if (FI->second[i].Value) -        release(FI->second[i].Value); -    delete[] FI->second; +      if (FI->second.OutRegs[i].Value) +        release(FI->second.OutRegs[i].Value); +    delete[] FI->second.OutRegs;    } -  LiveOuts.clear(); +  MBBInfos.clear();    UndefReads.clear();    Avail.clear();    Allocator.DestroyAll();    return false;  } - -FunctionPass * -llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) { -  return new ExeDepsFix(RC); -} diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp new file mode 100644 index 000000000000..0759bf6713e0 --- /dev/null +++ b/lib/CodeGen/FEntryInserter.cpp @@ -0,0 +1,55 @@ +//===-- FEntryInserter.cpp - Insert fentry calls ------------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file edits function bodies to insert fentry calls. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +namespace { +struct FEntryInserter : public MachineFunctionPass { +  static char ID; // Pass identification, replacement for typeid +  FEntryInserter() : MachineFunctionPass(ID) { +    initializeFEntryInserterPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) { +  const std::string FEntryName = +      MF.getFunction()->getFnAttribute("fentry-call").getValueAsString(); +  if (FEntryName != "true") +    return false; + +  auto &FirstMBB = *MF.begin(); +  auto &FirstMI = *FirstMBB.begin(); + +  auto *TII = MF.getSubtarget().getInstrInfo(); +  BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), +          TII->get(TargetOpcode::FENTRY_CALL)); +  return true; +} + +char FEntryInserter::ID = 0; +char &llvm::FEntryInserterID = FEntryInserter::ID; +INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false, +                false) diff --git a/lib/CodeGen/FaultMaps.cpp b/lib/CodeGen/FaultMaps.cpp index 2acafafdb9fc..43f364128978 100644 --- a/lib/CodeGen/FaultMaps.cpp +++ b/lib/CodeGen/FaultMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- FaultMaps.cpp ---------------------------===// +//===- FaultMaps.cpp ------------------------------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -7,14 +7,17 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/FaultMaps.h" - +#include "llvm/ADT/Twine.h"  #include 
"llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCExpr.h"  #include "llvm/MC/MCObjectFileInfo.h"  #include "llvm/MC/MCStreamer.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h"  using namespace llvm; @@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,    }  } -  const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {    switch (FT) {    default:      llvm_unreachable("unhandled fault type!"); -    case FaultMaps::FaultingLoad:      return "FaultingLoad"; +  case FaultMaps::FaultingLoadStore: +    return "FaultingLoadStore"; +  case FaultMaps::FaultingStore: +    return "FaultingStore";    }  } diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 31ab86fdf276..6be4c16c6301 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -1,4 +1,4 @@ -//===-- GCStrategy.cpp - Garbage Collector Description --------------------===// +//===- GCStrategy.cpp - Garbage Collector Description ---------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -18,7 +18,4 @@ using namespace llvm;  LLVM_INSTANTIATE_REGISTRY(GCRegistry) -GCStrategy::GCStrategy() -    : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false), -      CustomWriteBarriers(false), CustomRoots(false), InitRoots(true), -      UsesMetadata(false) {} +GCStrategy::GCStrategy() = default; diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt index 76ab5d36047e..03a8c4f5f909 100644 --- a/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -22,7 +22,6 @@ else()    set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})  endif() -  # In LLVMBuild.txt files, it is not possible to mark a dependency to a  # library as optional. So instead, generate an empty library if we did  # not ask for it.  diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp index 13212212fa01..035a2ac78ed9 100644 --- a/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -24,40 +24,42 @@  using namespace llvm;  bool CallLowering::lowerCall( -    MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg, +    MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg,      ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const { -  auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout(); +  auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();    // First step is to marshall all the function's parameters into the correct    // physregs and memory locations. Gather the sequence of argument types that    // we'll pass to the assigner function.    
SmallVector<ArgInfo, 8> OrigArgs;    unsigned i = 0; -  for (auto &Arg : CI.arg_operands()) { -    ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}}; -    setArgFlags(OrigArg, i + 1, DL, CI); +  unsigned NumFixedArgs = CS.getFunctionType()->getNumParams(); +  for (auto &Arg : CS.args()) { +    ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, +                    i < NumFixedArgs}; +    setArgFlags(OrigArg, i + 1, DL, CS);      OrigArgs.push_back(OrigArg);      ++i;    }    MachineOperand Callee = MachineOperand::CreateImm(0); -  if (Function *F = CI.getCalledFunction()) +  if (const Function *F = CS.getCalledFunction())      Callee = MachineOperand::CreateGA(F, 0);    else      Callee = MachineOperand::CreateReg(GetCalleeReg(), false); -  ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}}; +  ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}};    if (!OrigRet.Ty->isVoidTy()) -    setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI); +    setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS); -  return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs); +  return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs);  }  template <typename FuncInfoTy>  void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,                                 const DataLayout &DL,                                 const FuncInfoTy &FuncInfo) const { -  const AttributeSet &Attrs = FuncInfo.getAttributes(); +  const AttributeList &Attrs = FuncInfo.getAttributes();    if (Attrs.hasAttribute(OpIdx, Attribute::ZExt))      Arg.Flags.setZExt();    if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) @@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx,                                      const CallInst &FuncInfo) const;  bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, -                                     CCAssignFn *AssignFn,                                       ArrayRef<ArgInfo> Args,                                       ValueHandler &Handler) const {    MachineFunction &MF = MIRBuilder.getMF(); @@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,    unsigned NumArgs = Args.size();    for (unsigned i = 0; i != NumArgs; ++i) {      MVT CurVT = MVT::getVT(Args[i].Ty); -    if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo)) +    if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo))        return false;    } -  for (unsigned i = 0, e = Args.size(); i != e; ++i) { -    CCValAssign &VA = ArgLocs[i]; +  for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) { +    assert(j < ArgLocs.size() && "Skipped too many arg locs"); + +    CCValAssign &VA = ArgLocs[j]; +    assert(VA.getValNo() == i && "Location doesn't correspond to current arg"); + +    if (VA.needsCustom()) { +      j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j)); +      continue; +    }      if (VA.isRegLoc())        Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA); diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 89a042ffc477..766187378446 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -12,7 +12,10 @@  #include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/SmallVector.h" +#include 
"llvm/Analysis/OptimizationDiagnosticInfo.h"  #include "llvm/CodeGen/GlobalISel/CallLowering.h"  #include "llvm/CodeGen/Analysis.h"  #include "llvm/CodeGen/MachineFunction.h" @@ -21,11 +24,13 @@  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/TargetPassConfig.h"  #include "llvm/IR/Constant.h" +#include "llvm/IR/DebugInfo.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/GetElementPtrTypeIterator.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Type.h"  #include "llvm/IR/Value.h" +#include "llvm/Target/TargetFrameLowering.h"  #include "llvm/Target/TargetIntrinsicInfo.h"  #include "llvm/Target/TargetLowering.h" @@ -40,11 +45,21 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)  INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",                  false, false) -static void reportTranslationError(const Value &V, const Twine &Message) { -  std::string ErrStorage; -  raw_string_ostream Err(ErrStorage); -  Err << Message << ": " << V << '\n'; -  report_fatal_error(Err.str()); +static void reportTranslationError(MachineFunction &MF, +                                   const TargetPassConfig &TPC, +                                   OptimizationRemarkEmitter &ORE, +                                   OptimizationRemarkMissed &R) { +  MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + +  // Print the function name explicitly if we don't have a debug location (which +  // makes the diagnostic less useful) or if we're going to emit a raw error. +  if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) +    R << (" (in function: " + MF.getName() + ")").str(); + +  if (TPC.isGlobalISelAbortEnabled()) +    report_fatal_error(R.getMsg()); +  else +    ORE.emit(R);  }  IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { @@ -59,28 +74,31 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {  unsigned IRTranslator::getOrCreateVReg(const Value &Val) {    unsigned &ValReg = ValToVReg[&Val]; -  // Check if this is the first time we see Val. -  if (!ValReg) { -    // Fill ValRegsSequence with the sequence of registers -    // we need to concat together to produce the value. -    assert(Val.getType()->isSized() && -           "Don't know how to create an empty vreg"); -    unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL}); -    ValReg = VReg; - -    if (auto CV = dyn_cast<Constant>(&Val)) { -      bool Success = translate(*CV, VReg); -      if (!Success) { -        if (!TPC->isGlobalISelAbortEnabled()) { -          MF->getProperties().set( -              MachineFunctionProperties::Property::FailedISel); -          return VReg; -        } -        reportTranslationError(Val, "unable to translate constant"); -      } + +  if (ValReg) +    return ValReg; + +  // Fill ValRegsSequence with the sequence of registers +  // we need to concat together to produce the value. 
+  assert(Val.getType()->isSized() && +         "Don't know how to create an empty vreg"); +  unsigned VReg = +      MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL)); +  ValReg = VReg; + +  if (auto CV = dyn_cast<Constant>(&Val)) { +    bool Success = translate(*CV, VReg); +    if (!Success) { +      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", +                                 MF->getFunction()->getSubprogram(), +                                 &MF->getFunction()->getEntryBlock()); +      R << "unable to translate constant: " << ore::NV("Type", Val.getType()); +      reportTranslationError(*MF, *TPC, *ORE, R); +      return VReg;      }    } -  return ValReg; + +  return VReg;  }  int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { @@ -112,28 +130,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {    } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {      Alignment = LI->getAlignment();      ValTy = LI->getType(); -  } else if (!TPC->isGlobalISelAbortEnabled()) { -    MF->getProperties().set( -        MachineFunctionProperties::Property::FailedISel); +  } else { +    OptimizationRemarkMissed R("gisel-irtranslator", "", &I); +    R << "unable to translate memop: " << ore::NV("Opcode", &I); +    reportTranslationError(*MF, *TPC, *ORE, R);      return 1; -  } else -    llvm_unreachable("unhandled memory instruction"); +  }    return Alignment ? Alignment : DL->getABITypeAlignment(ValTy);  } -MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { +MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {    MachineBasicBlock *&MBB = BBToMBB[&BB]; -  if (!MBB) { -    MBB = MF->CreateMachineBasicBlock(&BB); -    MF->push_back(MBB); - -    if (BB.hasAddressTaken()) -      MBB->setHasAddressTaken(); -  } +  assert(MBB && "BasicBlock was not encountered before");    return *MBB;  } +void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { +  assert(NewPred && "new predecessor must be a real MachineBasicBlock"); +  MachinePreds[Edge].push_back(NewPred); +} +  bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,                                       MachineIRBuilder &MIRBuilder) {    // FIXME: handle signed/unsigned wrapping flags. @@ -149,6 +166,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,    return true;  } +bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { +  // -0.0 - X --> G_FNEG +  if (isa<Constant>(U.getOperand(0)) && +      U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) { +    MIRBuilder.buildInstr(TargetOpcode::G_FNEG) +        .addDef(getOrCreateVReg(U)) +        .addUse(getOrCreateVReg(*U.getOperand(1))); +    return true; +  } +  return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); +} +  bool IRTranslator::translateCompare(const User &U,                                      MachineIRBuilder &MIRBuilder) {    const CmpInst *CI = dyn_cast<CmpInst>(&U); @@ -158,9 +187,14 @@ bool IRTranslator::translateCompare(const User &U,    CmpInst::Predicate Pred =        CI ? 
CI->getPredicate() : static_cast<CmpInst::Predicate>(                                      cast<ConstantExpr>(U).getPredicate()); -    if (CmpInst::isIntPredicate(Pred))      MIRBuilder.buildICmp(Pred, Res, Op0, Op1); +  else if (Pred == CmpInst::FCMP_FALSE) +    MIRBuilder.buildCopy( +        Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); +  else if (Pred == CmpInst::FCMP_TRUE) +    MIRBuilder.buildCopy( +        Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType())));    else      MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); @@ -183,18 +217,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {      // We want a G_BRCOND to the true BB followed by an unconditional branch.      unsigned Tst = getOrCreateVReg(*BrInst.getCondition());      const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++)); -    MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt); +    MachineBasicBlock &TrueBB = getMBB(TrueTgt);      MIRBuilder.buildBrCond(Tst, TrueBB);    }    const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ)); -  MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt); -  MIRBuilder.buildBr(TgtBB); +  MachineBasicBlock &TgtBB = getMBB(BrTgt); +  MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + +  // If the unconditional target is the layout successor, fallthrough. +  if (!CurBB.isLayoutSuccessor(&TgtBB)) +    MIRBuilder.buildBr(TgtBB);    // Link successors. -  MachineBasicBlock &CurBB = MIRBuilder.getMBB();    for (const BasicBlock *Succ : BrInst.successors()) -    CurBB.addSuccessor(&getOrCreateBB(*Succ)); +    CurBB.addSuccessor(&getMBB(*Succ));    return true;  } @@ -209,30 +246,52 @@ bool IRTranslator::translateSwitch(const User &U,    const SwitchInst &SwInst = cast<SwitchInst>(U);    const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); +  const BasicBlock *OrigBB = SwInst.getParent(); -  LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); +  LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL);    for (auto &CaseIt : SwInst.cases()) {      const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue());      const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1);      MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); -    MachineBasicBlock &CurBB = MIRBuilder.getMBB(); -    MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); +    MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); +    const BasicBlock *TrueBB = CaseIt.getCaseSuccessor(); +    MachineBasicBlock &TrueMBB = getMBB(*TrueBB); -    MIRBuilder.buildBrCond(Tst, TrueBB); -    CurBB.addSuccessor(&TrueBB); +    MIRBuilder.buildBrCond(Tst, TrueMBB); +    CurMBB.addSuccessor(&TrueMBB); +    addMachineCFGPred({OrigBB, TrueBB}, &CurMBB); -    MachineBasicBlock *FalseBB = +    MachineBasicBlock *FalseMBB =          MF->CreateMachineBasicBlock(SwInst.getParent()); -    MF->push_back(FalseBB); -    MIRBuilder.buildBr(*FalseBB); -    CurBB.addSuccessor(FalseBB); +    // Insert the comparison blocks one after the other. 
+    MF->insert(std::next(CurMBB.getIterator()), FalseMBB); +    MIRBuilder.buildBr(*FalseMBB); +    CurMBB.addSuccessor(FalseMBB); -    MIRBuilder.setMBB(*FalseBB); +    MIRBuilder.setMBB(*FalseMBB);    }    // handle default case -  MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); -  MIRBuilder.buildBr(DefaultBB); -  MIRBuilder.getMBB().addSuccessor(&DefaultBB); +  const BasicBlock *DefaultBB = SwInst.getDefaultDest(); +  MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB); +  MIRBuilder.buildBr(DefaultMBB); +  MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); +  CurMBB.addSuccessor(&DefaultMBB); +  addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB); + +  return true; +} + +bool IRTranslator::translateIndirectBr(const User &U, +                                       MachineIRBuilder &MIRBuilder) { +  const IndirectBrInst &BrInst = cast<IndirectBrInst>(U); + +  const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress()); +  MIRBuilder.buildBrIndirect(Tgt); + +  // Link successors. +  MachineBasicBlock &CurBB = MIRBuilder.getMBB(); +  for (const BasicBlock *Succ : BrInst.successors()) +    CurBB.addSuccessor(&getMBB(*Succ));    return true;  } @@ -240,47 +299,38 @@ bool IRTranslator::translateSwitch(const User &U,  bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {    const LoadInst &LI = cast<LoadInst>(U); -  if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic()) -    return false; - -  assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment");    auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile                                 : MachineMemOperand::MONone;    Flags |= MachineMemOperand::MOLoad;    unsigned Res = getOrCreateVReg(LI);    unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); -  LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL}; +    MIRBuilder.buildLoad(        Res, Addr,        *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),                                  Flags, DL->getTypeStoreSize(LI.getType()), -                                getMemOpAlignment(LI))); +                                getMemOpAlignment(LI), AAMDNodes(), nullptr, +                                LI.getSynchScope(), LI.getOrdering()));    return true;  }  bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {    const StoreInst &SI = cast<StoreInst>(U); - -  if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic()) -    return false; - -  assert(!SI.isAtomic() && "only non-atomic stores supported at the moment");    auto Flags = SI.isVolatile() ? 
MachineMemOperand::MOVolatile                                 : MachineMemOperand::MONone;    Flags |= MachineMemOperand::MOStore;    unsigned Val = getOrCreateVReg(*SI.getValueOperand());    unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); -  LLT VTy{*SI.getValueOperand()->getType(), *DL}, -      PTy{*SI.getPointerOperand()->getType(), *DL};    MIRBuilder.buildStore(        Val, Addr,        *MF->getMachineMemOperand(            MachinePointerInfo(SI.getPointerOperand()), Flags,            DL->getTypeStoreSize(SI.getValueOperand()->getType()), -          getMemOpAlignment(SI))); +          getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSynchScope(), +          SI.getOrdering()));    return true;  } @@ -305,7 +355,7 @@ bool IRTranslator::translateExtractValue(const User &U,    uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);    unsigned Res = getOrCreateVReg(U); -  MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src)); +  MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset);    return true;  } @@ -348,12 +398,18 @@ bool IRTranslator::translateSelect(const User &U,  bool IRTranslator::translateBitCast(const User &U,                                      MachineIRBuilder &MIRBuilder) { -  if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) { +  // If we're bitcasting to the source type, we can reuse the source vreg. +  if (getLLTForType(*U.getOperand(0)->getType(), *DL) == +      getLLTForType(*U.getType(), *DL)) { +    // Get the source vreg now, to avoid invalidating ValToVReg. +    unsigned SrcReg = getOrCreateVReg(*U.getOperand(0));      unsigned &Reg = ValToVReg[&U]; +    // If we already assigned a vreg for this bitcast, we can't change that. +    // Emit a copy to satisfy the users we already emitted.      
if (Reg) -      MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0))); +      MIRBuilder.buildCopy(Reg, SrcReg);      else -      Reg = getOrCreateVReg(*U.getOperand(0)); +      Reg = SrcReg;      return true;    }    return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); @@ -375,9 +431,10 @@ bool IRTranslator::translateGetElementPtr(const User &U,    Value &Op0 = *U.getOperand(0);    unsigned BaseReg = getOrCreateVReg(Op0); -  LLT PtrTy{*Op0.getType(), *DL}; -  unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace()); -  LLT OffsetTy = LLT::scalar(PtrSize); +  Type *PtrIRTy = Op0.getType(); +  LLT PtrTy = getLLTForType(*PtrIRTy, *DL); +  Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); +  LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);    int64_t Offset = 0;    for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U); @@ -399,8 +456,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,        if (Offset != 0) {          unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); -        unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); -        MIRBuilder.buildConstant(OffsetReg, Offset); +        unsigned OffsetReg = +            getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));          MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg);          BaseReg = NewBaseReg; @@ -408,8 +465,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,        }        // N = N + Idx * ElementSize; -      unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy); -      MIRBuilder.buildConstant(ElementSizeReg, ElementSize); +      unsigned ElementSizeReg = +          getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));        unsigned IdxReg = getOrCreateVReg(*Idx);        if (MRI->getType(IdxReg) != OffsetTy) { @@ -428,8 +485,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,    }    if (Offset != 0) { -    unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); -    MIRBuilder.buildConstant(OffsetReg, Offset); +    unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));      MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg);      return true;    } @@ -438,13 +494,12 @@ bool IRTranslator::translateGetElementPtr(const User &U,    return true;  } -bool IRTranslator::translateMemcpy(const CallInst &CI, -                                   MachineIRBuilder &MIRBuilder) { -  LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL}; -  if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() != -          0 || -      cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() != -          0 || +bool IRTranslator::translateMemfunc(const CallInst &CI, +                                    MachineIRBuilder &MIRBuilder, +                                    unsigned ID) { +  LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); +  Type *DstTy = CI.getArgOperand(0)->getType(); +  if (cast<PointerType>(DstTy)->getAddressSpace() != 0 ||        SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0))      return false; @@ -454,14 +509,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI,      Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());    } -  MachineOperand Callee = MachineOperand::CreateES("memcpy"); +  const char *Callee; +  switch (ID) { +  case Intrinsic::memmove: +  case Intrinsic::memcpy: { +    Type *SrcTy = CI.getArgOperand(1)->getType(); +    if(cast<PointerType>(SrcTy)->getAddressSpace() != 0) +      return 
false; +    Callee = ID == Intrinsic::memcpy ? "memcpy" : "memmove"; +    break; +  } +  case Intrinsic::memset: +    Callee = "memset"; +    break; +  default: +    return false; +  } -  return CLI->lowerCall(MIRBuilder, Callee, +  return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), +                        MachineOperand::CreateES(Callee),                          CallLowering::ArgInfo(0, CI.getType()), Args);  }  void IRTranslator::getStackGuard(unsigned DstReg,                                   MachineIRBuilder &MIRBuilder) { +  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); +  MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF));    auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD);    MIB.addDef(DstReg); @@ -482,7 +555,7 @@ void IRTranslator::getStackGuard(unsigned DstReg,  bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,                                                MachineIRBuilder &MIRBuilder) { -  LLT Ty{*CI.getOperand(0)->getType(), *DL}; +  LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL);    LLT s1 = LLT::scalar(1);    unsigned Width = Ty.getSizeInBits();    unsigned Res = MRI->createGenericVirtualRegister(Ty); @@ -494,8 +567,8 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,                   .addUse(getOrCreateVReg(*CI.getOperand(1)));    if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) { -    unsigned Zero = MRI->createGenericVirtualRegister(s1); -    EntryBuilder.buildConstant(Zero, 0); +    unsigned Zero = getOrCreateVReg( +        *Constant::getNullValue(Type::getInt1Ty(CI.getContext())));      MIB.addUse(Zero);    } @@ -508,12 +581,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,    switch (ID) {    default:      break; -  case Intrinsic::dbg_declare: -  case Intrinsic::dbg_value: -    // FIXME: these obviously need to be supported properly. -    MF->getProperties().set( -          MachineFunctionProperties::Property::FailedISel); +  case Intrinsic::lifetime_start: +  case Intrinsic::lifetime_end: +    // Stack coloring is not enabled in O0 (which we care about now) so we can +    // drop these. Make sure someone notices when we start compiling at higher +    // opts though. +    if (MF->getTarget().getOptLevel() != CodeGenOpt::None) +      return false; +    return true; +  case Intrinsic::dbg_declare: { +    const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI); +    assert(DI.getVariable() && "Missing variable"); + +    const Value *Address = DI.getAddress(); +    if (!Address || isa<UndefValue>(Address)) { +      DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); +      return true; +    } + +    assert(DI.getVariable()->isValidLocationForIntrinsic( +               MIRBuilder.getDebugLoc()) && +           "Expected inlined-at fields to agree"); +    auto AI = dyn_cast<AllocaInst>(Address); +    if (AI && AI->isStaticAlloca()) { +      // Static allocas are tracked at the MF level, no need for DBG_VALUE +      // instructions (in fact, they get ignored if they *do* exist). +      MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), +                             getOrCreateFrameIndex(*AI), DI.getDebugLoc()); +    } else +      MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address), +                                     DI.getVariable(), DI.getExpression()); +    return true; +  } +  case Intrinsic::vaend: +    // No target I know of cares about va_end. 
Certainly no in-tree target +    // does. Simplest intrinsic ever!      return true; +  case Intrinsic::vastart: { +    auto &TLI = *MF->getSubtarget().getTargetLowering(); +    Value *Ptr = CI.getArgOperand(0); +    unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; + +    MIRBuilder.buildInstr(TargetOpcode::G_VASTART) +        .addUse(getOrCreateVReg(*Ptr)) +        .addMemOperand(MF->getMachineMemOperand( +            MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0)); +    return true; +  } +  case Intrinsic::dbg_value: { +    // This form of DBG_VALUE is target-independent. +    const DbgValueInst &DI = cast<DbgValueInst>(CI); +    const Value *V = DI.getValue(); +    assert(DI.getVariable()->isValidLocationForIntrinsic( +               MIRBuilder.getDebugLoc()) && +           "Expected inlined-at fields to agree"); +    if (!V) { +      // Currently the optimizer can produce this; insert an undef to +      // help debugging.  Probably the optimizer should not do this. +      MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(), +                                       DI.getExpression()); +    } else if (const auto *CI = dyn_cast<Constant>(V)) { +      MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(), +                                    DI.getExpression()); +    } else { +      unsigned Reg = getOrCreateVReg(*V); +      // FIXME: This does not handle register-indirect values at offset 0. The +      // direct/indirect thing shouldn't really be handled by something as +      // implicit as reg+noreg vs reg+imm in the first place, but it seems +      // pretty baked in right now. +      if (DI.getOffset() != 0) +        MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(), +                                         DI.getExpression()); +      else +        MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), +                                       DI.getExpression()); +    } +    return true; +  }    case Intrinsic::uadd_with_overflow:      return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder);    case Intrinsic::sadd_with_overflow: @@ -526,8 +670,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,      return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);    case Intrinsic::smul_with_overflow:      return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); +  case Intrinsic::pow: +    MIRBuilder.buildInstr(TargetOpcode::G_FPOW) +        .addDef(getOrCreateVReg(CI)) +        .addUse(getOrCreateVReg(*CI.getArgOperand(0))) +        .addUse(getOrCreateVReg(*CI.getArgOperand(1))); +    return true;    case Intrinsic::memcpy: -    return translateMemcpy(CI, MIRBuilder); +  case Intrinsic::memmove: +  case Intrinsic::memset: +    return translateMemfunc(CI, MIRBuilder, ID);    case Intrinsic::eh_typeid_for: {      GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));      unsigned Reg = getOrCreateVReg(CI); @@ -546,7 +698,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,      getStackGuard(getOrCreateVReg(CI), MIRBuilder);      return true;    case Intrinsic::stackprotector: { -    LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL}; +    LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);      unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);      getStackGuard(GuardVal, MIRBuilder); @@ -564,18 +716,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, 
Intrinsic::ID ID,    return false;  } +bool IRTranslator::translateInlineAsm(const CallInst &CI, +                                      MachineIRBuilder &MIRBuilder) { +  const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue()); +  if (!IA.getConstraintString().empty()) +    return false; + +  unsigned ExtraInfo = 0; +  if (IA.hasSideEffects()) +    ExtraInfo |= InlineAsm::Extra_HasSideEffects; +  if (IA.getDialect() == InlineAsm::AD_Intel) +    ExtraInfo |= InlineAsm::Extra_AsmDialect; + +  MIRBuilder.buildInstr(TargetOpcode::INLINEASM) +    .addExternalSymbol(IA.getAsmString().c_str()) +    .addImm(ExtraInfo); + +  return true; +} +  bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {    const CallInst &CI = cast<CallInst>(U);    auto TII = MF->getTarget().getIntrinsicInfo();    const Function *F = CI.getCalledFunction(); +  if (CI.isInlineAsm()) +    return translateInlineAsm(CI, MIRBuilder); +    if (!F || !F->isIntrinsic()) {      unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);      SmallVector<unsigned, 8> Args;      for (auto &Arg: CI.arg_operands())        Args.push_back(getOrCreateVReg(*Arg)); -    return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() { +    MF->getFrameInfo().setHasCalls(true); +    return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {        return getOrCreateVReg(*CI.getCalledValue());      });    } @@ -594,10 +769,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {        MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory());    for (auto &Arg : CI.arg_operands()) { -    if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) -      MIB.addImm(CI->getSExtValue()); -    else -      MIB.addUse(getOrCreateVReg(*Arg)); +    // Some intrinsics take metadata parameters. Reject them. +    if (isa<MetadataAsValue>(Arg)) +      return false; +    MIB.addUse(getOrCreateVReg(*Arg));    }    return true;  } @@ -610,7 +785,7 @@ bool IRTranslator::translateInvoke(const User &U,    const BasicBlock *ReturnBB = I.getSuccessor(0);    const BasicBlock *EHPadBB = I.getSuccessor(1); -  const Value *Callee(I.getCalledValue()); +  const Value *Callee = I.getCalledValue();    const Function *Fn = dyn_cast<Function>(Callee);    if (isa<InlineAsm>(Callee))      return false; @@ -634,23 +809,24 @@ bool IRTranslator::translateInvoke(const User &U,    MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);    unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I); -  SmallVector<CallLowering::ArgInfo, 8> Args; +  SmallVector<unsigned, 8> Args;    for (auto &Arg: I.arg_operands()) -    Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); +    Args.push_back(getOrCreateVReg(*Arg)); -  if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0), -                      CallLowering::ArgInfo(Res, I.getType()), Args)) +  if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, +                      [&]() { return getOrCreateVReg(*I.getCalledValue()); }))      return false;    MCSymbol *EndSymbol = Context.createTempSymbol();    MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);    // FIXME: track probabilities. 
-  MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB), -                    &ReturnMBB = getOrCreateBB(*ReturnBB); +  MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB), +                    &ReturnMBB = getMBB(*ReturnBB);    MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);    MIRBuilder.getMBB().addSuccessor(&ReturnMBB);    MIRBuilder.getMBB().addSuccessor(&EHPadMBB); +  MIRBuilder.buildBr(ReturnMBB);    return true;  } @@ -684,37 +860,158 @@ bool IRTranslator::translateLandingPad(const User &U,    MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)      .addSym(MF->addLandingPad(&MBB)); +  LLT Ty = getLLTForType(*LP.getType(), *DL); +  unsigned Undef = MRI->createGenericVirtualRegister(Ty); +  MIRBuilder.buildUndef(Undef); + +  SmallVector<LLT, 2> Tys; +  for (Type *Ty : cast<StructType>(LP.getType())->elements()) +    Tys.push_back(getLLTForType(*Ty, *DL)); +  assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); +    // Mark exception register as live in. -  SmallVector<unsigned, 2> Regs; -  SmallVector<uint64_t, 2> Offsets; -  LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits()); -  if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) { -    unsigned VReg = MRI->createGenericVirtualRegister(p0); -    MIRBuilder.buildCopy(VReg, Reg); -    Regs.push_back(VReg); -    Offsets.push_back(0); +  unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); +  if (!ExceptionReg) +    return false; + +  MBB.addLiveIn(ExceptionReg); +  unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]), +           Tmp = MRI->createGenericVirtualRegister(Ty); +  MIRBuilder.buildCopy(VReg, ExceptionReg); +  MIRBuilder.buildInsert(Tmp, Undef, VReg, 0); + +  unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); +  if (!SelectorReg) +    return false; + +  MBB.addLiveIn(SelectorReg); + +  // N.b. the exception selector register always has pointer type and may not +  // match the actual IR-level type in the landingpad so an extra cast is +  // needed. +  unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]); +  MIRBuilder.buildCopy(PtrVReg, SelectorReg); + +  VReg = MRI->createGenericVirtualRegister(Tys[1]); +  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg); +  MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg, +                         Tys[0].getSizeInBits()); +  return true; +} + +bool IRTranslator::translateAlloca(const User &U, +                                   MachineIRBuilder &MIRBuilder) { +  auto &AI = cast<AllocaInst>(U); + +  if (AI.isStaticAlloca()) { +    unsigned Res = getOrCreateVReg(AI); +    int FI = getOrCreateFrameIndex(AI); +    MIRBuilder.buildFrameIndex(Res, FI); +    return true; +  } + +  // Now we're in the harder dynamic case. 
+  Type *Ty = AI.getAllocatedType(); +  unsigned Align = +      std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment()); + +  unsigned NumElts = getOrCreateVReg(*AI.getArraySize()); + +  Type *IntPtrIRTy = DL->getIntPtrType(AI.getType()); +  LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL); +  if (MRI->getType(NumElts) != IntPtrTy) { +    unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy); +    MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts); +    NumElts = ExtElts;    } -  if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) { -    unsigned VReg = MRI->createGenericVirtualRegister(p0); -    MIRBuilder.buildCopy(VReg, Reg); -    Regs.push_back(VReg); -    Offsets.push_back(p0.getSizeInBits()); +  unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy); +  unsigned TySize = +      getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty))); +  MIRBuilder.buildMul(AllocSize, NumElts, TySize); + +  LLT PtrTy = getLLTForType(*AI.getType(), *DL); +  auto &TLI = *MF->getSubtarget().getTargetLowering(); +  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + +  unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy); +  MIRBuilder.buildCopy(SPTmp, SPReg); + +  unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy); +  MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); + +  // Handle alignment. We have to realign if the allocation granule was smaller +  // than stack alignment, or the specific alloca requires more than stack +  // alignment. +  unsigned StackAlign = +      MF->getSubtarget().getFrameLowering()->getStackAlignment(); +  Align = std::max(Align, StackAlign); +  if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) { +    // Round the size of the allocation up to the stack alignment size +    // by adding SA-1 to the size. This doesn't overflow because we're computing +    // an address inside an alloca. +    unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); +    MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); +    AllocTmp = AlignedAlloc;    } -  MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets); +  MIRBuilder.buildCopy(SPReg, AllocTmp); +  MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); + +  MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); +  assert(MF->getFrameInfo().hasVarSizedObjects());    return true;  } -bool IRTranslator::translateStaticAlloca(const AllocaInst &AI, -                                         MachineIRBuilder &MIRBuilder) { -  if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca()) -    return false; +bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) { +  // FIXME: We may need more info about the type. Because of how LLT works, +  // we're completely discarding the i64/double distinction here (amongst +  // others). Fortunately the ABIs I know of where that matters don't use va_arg +  // anyway but that's not guaranteed. 
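// Illustrative note (not part of the patch): the dynamic alloca path above
// computes the new stack pointer as SP + NumElts * (-TypeAllocSize), i.e. the
// stack is assumed to grow downwards, and then clears the low address bits
// with G_PTR_MASK when extra alignment is needed. A minimal standalone sketch
// of that arithmetic on plain integers (names invented for illustration):
#include <cstdint>

uint64_t dynAllocaNewSP(uint64_t SP, uint64_t NumElts, uint64_t EltSize,
                        uint64_t Align /* power of two */) {
  uint64_t NewSP = SP + NumElts * (0 - EltSize); // SP - NumElts * EltSize
  NewSP &= ~(Align - 1);                         // G_PTR_MASK: align downwards
  return NewSP; // copied back into SP and used as the alloca's address
}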
+  MIRBuilder.buildInstr(TargetOpcode::G_VAARG) +    .addDef(getOrCreateVReg(U)) +    .addUse(getOrCreateVReg(*U.getOperand(0))) +    .addImm(DL->getABITypeAlignment(U.getType())); +  return true; +} -  assert(AI.isStaticAlloca() && "only handle static allocas now"); -  unsigned Res = getOrCreateVReg(AI); -  int FI = getOrCreateFrameIndex(AI); -  MIRBuilder.buildFrameIndex(Res, FI); +bool IRTranslator::translateInsertElement(const User &U, +                                          MachineIRBuilder &MIRBuilder) { +  // If it is a <1 x Ty> vector, use the scalar as it is +  // not a legal vector type in LLT. +  if (U.getType()->getVectorNumElements() == 1) { +    unsigned Elt = getOrCreateVReg(*U.getOperand(1)); +    ValToVReg[&U] = Elt; +    return true; +  } +  MIRBuilder.buildInsertVectorElement( +      getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)), +      getOrCreateVReg(*U.getOperand(1)), getOrCreateVReg(*U.getOperand(2))); +  return true; +} + +bool IRTranslator::translateExtractElement(const User &U, +                                           MachineIRBuilder &MIRBuilder) { +  // If it is a <1 x Ty> vector, use the scalar as it is +  // not a legal vector type in LLT. +  if (U.getOperand(0)->getType()->getVectorNumElements() == 1) { +    unsigned Elt = getOrCreateVReg(*U.getOperand(0)); +    ValToVReg[&U] = Elt; +    return true; +  } +  MIRBuilder.buildExtractVectorElement(getOrCreateVReg(U), +                                       getOrCreateVReg(*U.getOperand(0)), +                                       getOrCreateVReg(*U.getOperand(1))); +  return true; +} + +bool IRTranslator::translateShuffleVector(const User &U, +                                          MachineIRBuilder &MIRBuilder) { +  MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR) +      .addDef(getOrCreateVReg(U)) +      .addUse(getOrCreateVReg(*U.getOperand(0))) +      .addUse(getOrCreateVReg(*U.getOperand(1))) +      .addUse(getOrCreateVReg(*U.getOperand(2)));    return true;  } @@ -736,11 +1033,21 @@ void IRTranslator::finishPendingPhis() {      // won't create extra control flow here, otherwise we need to find the      // dominating predecessor here (or perhaps force the weirder IRTranslators      // to provide a simple boundary). 
+    SmallSet<const BasicBlock *, 4> HandledPreds; +      for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { -      assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) && -             "I appear to have misunderstood Machine PHIs"); -      MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i))); -      MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]); +      auto IRPred = PI->getIncomingBlock(i); +      if (HandledPreds.count(IRPred)) +        continue; + +      HandledPreds.insert(IRPred); +      unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i)); +      for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) { +        assert(Pred->isSuccessor(MIB->getParent()) && +               "incorrect CFG at MachineBasicBlock level"); +        MIB.addUse(ValReg); +        MIB.addMBB(Pred); +      }      }    }  } @@ -752,9 +1059,7 @@ bool IRTranslator::translate(const Instruction &Inst) {      case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);  #include "llvm/IR/Instruction.def"    default: -    if (!TPC->isGlobalISelAbortEnabled()) -      return false; -    llvm_unreachable("unknown opcode"); +    return false;    }  } @@ -764,25 +1069,43 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {    else if (auto CF = dyn_cast<ConstantFP>(&C))      EntryBuilder.buildFConstant(Reg, *CF);    else if (isa<UndefValue>(C)) -    EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg); +    EntryBuilder.buildUndef(Reg);    else if (isa<ConstantPointerNull>(C))      EntryBuilder.buildConstant(Reg, 0);    else if (auto GV = dyn_cast<GlobalValue>(&C))      EntryBuilder.buildGlobalValue(Reg, GV); -  else if (auto CE = dyn_cast<ConstantExpr>(&C)) { +  else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) { +    if (!CAZ->getType()->isVectorTy()) +      return false; +    // Return the scalar if it is a <1 x Ty> vector. +    if (CAZ->getNumElements() == 1) +      return translate(*CAZ->getElementValue(0u), Reg); +    std::vector<unsigned> Ops; +    for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { +      Constant &Elt = *CAZ->getElementValue(i); +      Ops.push_back(getOrCreateVReg(Elt)); +    } +    EntryBuilder.buildMerge(Reg, Ops); +  } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) { +    // Return the scalar if it is a <1 x Ty> vector. 
+    if (CV->getNumElements() == 1) +      return translate(*CV->getElementAsConstant(0), Reg); +    std::vector<unsigned> Ops; +    for (unsigned i = 0; i < CV->getNumElements(); ++i) { +      Constant &Elt = *CV->getElementAsConstant(i); +      Ops.push_back(getOrCreateVReg(Elt)); +    } +    EntryBuilder.buildMerge(Reg, Ops); +  } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {      switch(CE->getOpcode()) {  #define HANDLE_INST(NUM, OPCODE, CLASS)                         \        case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder);  #include "llvm/IR/Instruction.def"      default: -      if (!TPC->isGlobalISelAbortEnabled()) -        return false; -      llvm_unreachable("unknown opcode"); +      return false;      } -  } else if (!TPC->isGlobalISelAbortEnabled()) +  } else      return false; -  else -    llvm_unreachable("unhandled constant kind");    return true;  } @@ -793,7 +1116,7 @@ void IRTranslator::finalizeFunction() {    PendingPHIs.clear();    ValToVReg.clear();    FrameIndices.clear(); -  Constants.clear(); +  MachinePreds.clear();  }  bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { @@ -807,85 +1130,101 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {    MRI = &MF->getRegInfo();    DL = &F.getParent()->getDataLayout();    TPC = &getAnalysis<TargetPassConfig>(); +  ORE = make_unique<OptimizationRemarkEmitter>(&F);    assert(PendingPHIs.empty() && "stale PHIs"); -  // Setup a separate basic-block for the arguments and constants, falling -  // through to the IR-level Function's entry block. +  // Release the per-function state when we return, whether we succeeded or not. +  auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); }); + +  // Setup a separate basic-block for the arguments and constants    MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();    MF->push_back(EntryBB); -  EntryBB->addSuccessor(&getOrCreateBB(F.front()));    EntryBuilder.setMBB(*EntryBB); +  // Create all blocks, in IR order, to preserve the layout. +  for (const BasicBlock &BB: F) { +    auto *&MBB = BBToMBB[&BB]; + +    MBB = MF->CreateMachineBasicBlock(&BB); +    MF->push_back(MBB); + +    if (BB.hasAddressTaken()) +      MBB->setHasAddressTaken(); +  } + +  // Make our arguments/constants entry block fallthrough to the IR entry block. +  EntryBB->addSuccessor(&getMBB(F.front())); +    // Lower the actual args into this basic block.    SmallVector<unsigned, 8> VRegArgs;    for (const Argument &Arg: F.args())      VRegArgs.push_back(getOrCreateVReg(Arg)); -  bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs); -  if (!Succeeded) { -    if (!TPC->isGlobalISelAbortEnabled()) { -      MF->getProperties().set( -          MachineFunctionProperties::Property::FailedISel); -      finalizeFunction(); -      return false; -    } -    report_fatal_error("Unable to lower arguments"); +  if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) { +    OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", +                               MF->getFunction()->getSubprogram(), +                               &MF->getFunction()->getEntryBlock()); +    R << "unable to lower arguments: " << ore::NV("Prototype", F.getType()); +    reportTranslationError(*MF, *TPC, *ORE, R); +    return false;    }    // And translate the function!    
for (const BasicBlock &BB: F) { -    MachineBasicBlock &MBB = getOrCreateBB(BB); +    MachineBasicBlock &MBB = getMBB(BB);      // Set the insertion point of all the following translations to      // the end of this basic block.      CurBuilder.setMBB(MBB);      for (const Instruction &Inst: BB) { -      Succeeded &= translate(Inst); -      if (!Succeeded) { -        if (TPC->isGlobalISelAbortEnabled()) -          reportTranslationError(Inst, "unable to translate instruction"); -        MF->getProperties().set( -            MachineFunctionProperties::Property::FailedISel); -        break; -      } -    } -  } - -  if (Succeeded) { -    finishPendingPhis(); - -    // Now that the MachineFrameInfo has been configured, no further changes to -    // the reserved registers are possible. -    MRI->freezeReservedRegs(*MF); - -    // Merge the argument lowering and constants block with its single -    // successor, the LLVM-IR entry block.  We want the basic block to -    // be maximal. -    assert(EntryBB->succ_size() == 1 && -           "Custom BB used for lowering should have only one successor"); -    // Get the successor of the current entry block. -    MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); -    assert(NewEntryBB.pred_size() == 1 && -           "LLVM-IR entry block has a predecessor!?"); -    // Move all the instruction from the current entry block to the -    // new entry block. -    NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), -                      EntryBB->end()); - -    // Update the live-in information for the new entry block. -    for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) -      NewEntryBB.addLiveIn(LiveIn); -    NewEntryBB.sortUniqueLiveIns(); +      if (translate(Inst)) +        continue; -    // Get rid of the now empty basic block. -    EntryBB->removeSuccessor(&NewEntryBB); -    MF->remove(EntryBB); +      std::string InstStrStorage; +      raw_string_ostream InstStr(InstStrStorage); +      InstStr << Inst; -    assert(&MF->front() == &NewEntryBB && -           "New entry wasn't next in the list of basic block!"); +      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", +                                 Inst.getDebugLoc(), &BB); +      R << "unable to translate instruction: " << ore::NV("Opcode", &Inst) +        << ": '" << InstStr.str() << "'"; +      reportTranslationError(*MF, *TPC, *ORE, R); +      return false; +    }    } -  finalizeFunction(); +  finishPendingPhis(); + +  // Now that the MachineFrameInfo has been configured, no further changes to +  // the reserved registers are possible. +  MRI->freezeReservedRegs(*MF); + +  // Merge the argument lowering and constants block with its single +  // successor, the LLVM-IR entry block.  We want the basic block to +  // be maximal. +  assert(EntryBB->succ_size() == 1 && +         "Custom BB used for lowering should have only one successor"); +  // Get the successor of the current entry block. +  MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); +  assert(NewEntryBB.pred_size() == 1 && +         "LLVM-IR entry block has a predecessor!?"); +  // Move all the instruction from the current entry block to the +  // new entry block. +  NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), +                    EntryBB->end()); + +  // Update the live-in information for the new entry block. 
+  for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) +    NewEntryBB.addLiveIn(LiveIn); +  NewEntryBB.sortUniqueLiveIns(); + +  // Get rid of the now empty basic block. +  EntryBB->removeSuccessor(&NewEntryBB); +  MF->remove(EntryBB); +  MF->DeleteMachineBasicBlock(EntryBB); + +  assert(&MF->front() == &NewEntryBB && +         "New entry wasn't next in the list of basic block!");    return false;  } diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 1d205cd6c9c8..26454c1ef00f 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -12,11 +12,15 @@  #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"  #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h"  #include "llvm/ADT/Twine.h"  #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"  #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h"  #include "llvm/IR/Function.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h" @@ -44,17 +48,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {    MachineFunctionPass::getAnalysisUsage(AU);  } -static void reportSelectionError(const MachineInstr *MI, const Twine &Message) { -  const MachineFunction &MF = *MI->getParent()->getParent(); -  std::string ErrStorage; -  raw_string_ostream Err(ErrStorage); -  Err << Message << ":\nIn function: " << MF.getName() << '\n'; -  if (MI) -    Err << *MI << '\n'; -  report_fatal_error(Err.str()); -} -  bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { +  const MachineRegisterInfo &MRI = MF.getRegInfo(); + +  // No matter what happens, whether we successfully select the function or not, +  // nothing is going to use the vreg types after us.  Make sure they disappear. +  auto ClearVRegTypesOnReturn = +      make_scope_exit([&]() { MRI.getVRegToType().clear(); }); +    // If the ISel pipeline failed, do not bother running that pass.    if (MF.getProperties().hasProperty(            MachineFunctionProperties::Property::FailedISel)) @@ -66,11 +67,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {    const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();    assert(ISel && "Cannot work without InstructionSelector"); +  // An optimization remark emitter. Used to report failures. +  MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); +    // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many    // other MF/MFI fields we need to initialize. -  const MachineRegisterInfo &MRI = MF.getRegInfo(); -  #ifndef NDEBUG    // Check that our input is fully legal: we require the function to have the    // Legalized property, so it should be. @@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {    // that it has the same layering problem, but we only use inline methods so    // end up not needing to link against the GlobalISel library.    
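// Illustrative note (not part of the patch): both IRTranslator and
// InstructionSelect now use llvm::make_scope_exit (llvm/ADT/ScopeExit.h, which
// this patch includes) so that per-function state is released on every return
// path, including the new early error returns. A minimal usage sketch with a
// hypothetical cache standing in for that state:
#include "llvm/ADT/ScopeExit.h"
#include <map>

static std::map<int, int> Cache; // hypothetical per-function state

bool processFunction(bool Fail) {
  auto Cleanup = llvm::make_scope_exit([] { Cache.clear(); });
  Cache[0] = 42;
  if (Fail)
    return false; // Cache is cleared here...
  return true;    // ...and here, without duplicating the cleanup code.
}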
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) -    for (const MachineBasicBlock &MBB : MF) -      for (const MachineInstr &MI : MBB) -        if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) -          reportSelectionError(&MI, "Instruction is not legal"); +    for (MachineBasicBlock &MBB : MF) +      for (MachineInstr &MI : MBB) +        if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { +          reportGISelFailure(MF, TPC, MORE, "gisel-select", +                             "instruction is not legal", MI); +          return false; +        }  #endif    // FIXME: We could introduce new blocks and will need to fix the outer loop.    // Until then, keep track of the number of blocks to assert that we don't.    const size_t NumBlocks = MF.size(); -  bool Failed = false;    for (MachineBasicBlock *MBB : post_order(&MF)) {      if (MBB->empty())        continue; @@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {        DEBUG(dbgs() << "Selecting: \n  " << MI); +      // We could have folded this instruction away already, making it dead. +      // If so, erase it. +      if (isTriviallyDead(MI, MRI)) { +        DEBUG(dbgs() << "Is dead; erasing.\n"); +        MI.eraseFromParentAndMarkDBGValuesForRemoval(); +        continue; +      } +        if (!ISel->select(MI)) { -        if (TPC.isGlobalISelAbortEnabled()) -          // FIXME: It would be nice to dump all inserted instructions.  It's -          // not -          // obvious how, esp. considering select() can insert after MI. -          reportSelectionError(&MI, "Cannot select"); -        Failed = true; -        break; +        // FIXME: It would be nice to dump all inserted instructions.  It's +        // not obvious how, esp. considering select() can insert after MI. +        reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI); +        return false;        }        // Dump the range of instructions that MI expanded into. @@ -142,33 +151,36 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {    for (auto &VRegToType : MRI.getVRegToType()) {      unsigned VReg = VRegToType.first;      auto *RC = MRI.getRegClassOrNull(VReg); -    auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end() -                   ? 
nullptr -                   : &*MRI.def_instr_begin(VReg); -    if (!RC) { -      if (TPC.isGlobalISelAbortEnabled()) -        reportSelectionError(MI, "VReg as no regclass after selection"); -      Failed = true; -      break; -    } +    MachineInstr *MI = nullptr; +    if (!MRI.def_empty(VReg)) +      MI = &*MRI.def_instr_begin(VReg); +    else if (!MRI.use_empty(VReg)) +      MI = &*MRI.use_instr_begin(VReg); + +    if (MI && !RC) { +      reportGISelFailure(MF, TPC, MORE, "gisel-select", +                         "VReg has no regclass after selection", *MI); +      return false; +    } else if (!RC) +      continue;      if (VRegToType.second.isValid() &&          VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) { -      if (TPC.isGlobalISelAbortEnabled()) -        reportSelectionError( -            MI, "VReg has explicit size different from class size"); -      Failed = true; -      break; +      reportGISelFailure(MF, TPC, MORE, "gisel-select", +                         "VReg has explicit size different from class size", +                         *MI); +      return false;      }    } -  MRI.getVRegToType().clear(); - -  if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) { -    MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); +  if (MF.size() != NumBlocks) { +    MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure", +                                      MF.getFunction()->getSubprogram(), +                                      /*MBB=*/nullptr); +    R << "inserting blocks is not supported yet"; +    reportGISelFailure(MF, TPC, MORE, R);      return false;    } -  assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet");    // FIXME: Should we accurately track changes?    return true; diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5c34da0dc557..fb9d01ef8542 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -14,6 +14,8 @@  #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"  #include "llvm/CodeGen/GlobalISel/Utils.h"  #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetRegisterInfo.h" @@ -55,6 +57,45 @@ bool InstructionSelector::constrainSelectedInstRegOperands(      // constrainOperandRegClass does that for us.      MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),                                         Reg, OpI)); + +    // Tie uses to defs as indicated in MCInstrDesc. 
+    if (MO.isUse()) { +      int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO); +      if (DefIdx != -1) +        I.tieOperands(DefIdx, OpI); +    }    }    return true;  } + +Optional<int64_t> +InstructionSelector::getConstantVRegVal(unsigned VReg, +                                        const MachineRegisterInfo &MRI) const { +  MachineInstr *MI = MRI.getVRegDef(VReg); +  if (MI->getOpcode() != TargetOpcode::G_CONSTANT) +    return None; + +  if (MI->getOperand(1).isImm()) +    return MI->getOperand(1).getImm(); + +  if (MI->getOperand(1).isCImm() && +      MI->getOperand(1).getCImm()->getBitWidth() <= 64) +    return MI->getOperand(1).getCImm()->getSExtValue(); + +  return None; +} + +bool InstructionSelector::isOperandImmEqual( +    const MachineOperand &MO, int64_t Value, +    const MachineRegisterInfo &MRI) const { + +  if (MO.getReg()) +    if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) +      return *VRegVal == Value; +  return false; +} + +bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const { +  return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() && +         MI.implicit_operands().begin() == MI.implicit_operands().end(); +} diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp index e86356880e99..657ddb307919 100644 --- a/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -16,6 +16,8 @@  #include "llvm/CodeGen/GlobalISel/Legalizer.h"  #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"  #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/TargetPassConfig.h"  #include "llvm/Support/Debug.h" @@ -92,10 +94,7 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,             "unexpected physical register in G_SEQUENCE");      // Finally we can replace the uses. -    for (auto &Use : MRI.use_operands(ExtractReg)) { -      Changed = true; -      Use.setReg(OrigReg); -    } +    MRI.replaceRegWith(ExtractReg, OrigReg);    }    if (AllDefsReplaced) { @@ -114,6 +113,36 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,    return Changed;  } +bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, +                              const TargetInstrInfo &TII) { +  if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) +    return false; + +  unsigned NumDefs = MI.getNumOperands() - 1; +  unsigned SrcReg = MI.getOperand(NumDefs).getReg(); +  MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg); +  if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES) +    return false; + +  if (MergeI.getNumOperands() - 1 != NumDefs) +    return false; + +  // FIXME: is a COPY appropriate if the types mismatch? We know both registers +  // are allocatable by now. +  if (MRI.getType(MI.getOperand(0).getReg()) != +      MRI.getType(MergeI.getOperand(1).getReg())) +    return false; + +  for (unsigned Idx = 0; Idx < NumDefs; ++Idx) +    MRI.replaceRegWith(MI.getOperand(Idx).getReg(), +                       MergeI.getOperand(Idx + 1).getReg()); + +  MI.eraseFromParent(); +  if (MRI.use_empty(MergeI.getOperand(0).getReg())) +    MergeI.eraseFromParent(); +  return true; +} +  bool Legalizer::runOnMachineFunction(MachineFunction &MF) {    // If the ISel pipeline failed, do not bother running that pass.    
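// Illustrative note (not part of the patch): combineMerges above cancels a
// G_UNMERGE_VALUES fed by a G_MERGE_VALUES of the same arity, because the
// i-th unmerge result is exactly the i-th merge input. A standalone sketch
// showing why the round trip is the identity (64-bit value, two 32-bit parts):
#include <array>
#include <cassert>
#include <cstdint>

uint64_t merge2(uint32_t Lo, uint32_t Hi) { return (uint64_t)Hi << 32 | Lo; }
std::array<uint32_t, 2> unmerge2(uint64_t V) {
  return {(uint32_t)V, (uint32_t)(V >> 32)};
}

int main() {
  auto Parts = unmerge2(merge2(0x1234u, 0xabcdu));
  assert(Parts[0] == 0x1234u && Parts[1] == 0xabcdu);
  // So the combine can simply forward the merge inputs to the unmerge users.
}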
if (MF.getProperties().hasProperty( @@ -122,7 +151,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {    DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');    init(MF);    const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); -  const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo(); +  MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);    LegalizerHelper Helper(MF);    // FIXME: an instruction may need more than one pass before it is legal. For @@ -142,27 +171,33 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {        // and are assumed to be legal.        if (!isPreISelGenericOpcode(MI->getOpcode()))          continue; - -      auto Res = Helper.legalizeInstr(*MI, LegalizerInfo); - -      // Error out if we couldn't legalize this instruction. We may want to fall -      // back to DAG ISel instead in the future. -      if (Res == LegalizerHelper::UnableToLegalize) { -        if (!TPC.isGlobalISelAbortEnabled()) { -          MF.getProperties().set( -              MachineFunctionProperties::Property::FailedISel); -          return false; +      SmallVector<MachineInstr *, 4> WorkList; +      Helper.MIRBuilder.recordInsertions( +          [&](MachineInstr *MI) { WorkList.push_back(MI); }); +      WorkList.push_back(&*MI); + +      LegalizerHelper::LegalizeResult Res; +      unsigned Idx = 0; +      do { +        Res = Helper.legalizeInstrStep(*WorkList[Idx]); +        // Error out if we couldn't legalize this instruction. We may want to +        // fall +        // back to DAG ISel instead in the future. +        if (Res == LegalizerHelper::UnableToLegalize) { +          Helper.MIRBuilder.stopRecordingInsertions(); +          if (Res == LegalizerHelper::UnableToLegalize) { +            reportGISelFailure(MF, TPC, MORE, "gisel-legalize", +                               "unable to legalize instruction", +                               *WorkList[Idx]); +            return false; +          }          } -        std::string Msg; -        raw_string_ostream OS(Msg); -        OS << "unable to legalize instruction: "; -        MI->print(OS); -        report_fatal_error(OS.str()); -      } - -      Changed |= Res == LegalizerHelper::Legalized; -    } +        Changed |= Res == LegalizerHelper::Legalized; +        ++Idx; +      } while (Idx < WorkList.size()); +      Helper.MIRBuilder.stopRecordingInsertions(); +    }    MachineRegisterInfo &MRI = MF.getRegInfo();    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -173,6 +208,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {        NextMI = std::next(MI);        Changed |= combineExtracts(*MI, MRI, TII); +      Changed |= combineMerges(*MI, MRI, TII);      }    } diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index eb25b6ca268f..20358f7ee6c2 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -29,14 +29,13 @@  using namespace llvm;  LegalizerHelper::LegalizerHelper(MachineFunction &MF) -  : MRI(MF.getRegInfo()) { +    : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {    MIRBuilder.setMF(MF);  }  LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstrStep(MachineInstr &MI, -                                   const LegalizerInfo &LegalizerInfo) { -  auto Action = LegalizerInfo.getAction(MI, MRI); +LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { +  auto Action = LI.getAction(MI, MRI);    switch 
(std::get<0>(Action)) {    case LegalizerInfo::Legal:      return AlreadyLegal; @@ -50,46 +49,32 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,      return lower(MI, std::get<1>(Action), std::get<2>(Action));    case LegalizerInfo::FewerElements:      return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action)); +  case LegalizerInfo::Custom: +    return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized +                                                  : UnableToLegalize;    default:      return UnableToLegalize;    }  } -LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstr(MachineInstr &MI, -                               const LegalizerInfo &LegalizerInfo) { -  SmallVector<MachineInstr *, 4> WorkList; -  MIRBuilder.recordInsertions( -      [&](MachineInstr *MI) { WorkList.push_back(MI); }); -  WorkList.push_back(&MI); - -  bool Changed = false; -  LegalizeResult Res; -  unsigned Idx = 0; -  do { -    Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo); -    if (Res == UnableToLegalize) { -      MIRBuilder.stopRecordingInsertions(); -      return UnableToLegalize; -    } -    Changed |= Res == Legalized; -    ++Idx; -  } while (Idx < WorkList.size()); - -  MIRBuilder.stopRecordingInsertions(); - -  return Changed ? Legalized : AlreadyLegal; -} -  void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts,                                     SmallVectorImpl<unsigned> &VRegs) { -  unsigned Size = Ty.getSizeInBits(); -  SmallVector<uint64_t, 4> Indexes; -  for (int i = 0; i < NumParts; ++i) { +  for (int i = 0; i < NumParts; ++i)      VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); -    Indexes.push_back(i * Size); +  MIRBuilder.buildUnmerge(VRegs, Reg); +} + +static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { +  switch (Opcode) { +  case TargetOpcode::G_FADD: +    assert((Size == 32 || Size == 64) && "Unsupported size"); +    return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; +  case TargetOpcode::G_FREM: +    return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32; +  case TargetOpcode::G_FPOW: +    return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;    } -  MIRBuilder.buildExtract(VRegs, Indexes, Reg); +  llvm_unreachable("Unknown libcall function");  }  LegalizerHelper::LegalizeResult @@ -101,17 +86,19 @@ LegalizerHelper::libcall(MachineInstr &MI) {    switch (MI.getOpcode()) {    default:      return UnableToLegalize; +  case TargetOpcode::G_FADD: +  case TargetOpcode::G_FPOW:    case TargetOpcode::G_FREM: {      auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();      Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);      auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();      auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); -    const char *Name = -        TLI.getLibcallName(Size == 64 ? 
RTLIB::REM_F64 : RTLIB::REM_F32); - +    auto Libcall = getRTLibDesc(MI.getOpcode(), Size); +    const char *Name = TLI.getLibcallName(Libcall); +    MIRBuilder.getMF().getFrameInfo().setHasCalls(true);      CLI.lowerCall( -        MIRBuilder, MachineOperand::CreateES(Name), -        {MI.getOperand(0).getReg(), Ty}, +        MIRBuilder, TLI.getLibcallCallingConv(Libcall), +        MachineOperand::CreateES(Name), {MI.getOperand(0).getReg(), Ty},          {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}});      MI.eraseFromParent();      return Legalized; @@ -125,19 +112,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,    // FIXME: Don't know how to handle secondary types yet.    if (TypeIdx != 0)      return UnableToLegalize; + +  MIRBuilder.setInstr(MI); +    switch (MI.getOpcode()) {    default:      return UnableToLegalize;    case TargetOpcode::G_ADD: {      // Expand in terms of carry-setting/consuming G_ADDE instructions. -    unsigned NarrowSize = NarrowTy.getSizeInBits();      int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /                     NarrowTy.getSizeInBits(); -    MIRBuilder.setInstr(MI); -      SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; -    SmallVector<uint64_t, 2> Indexes;      extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);      extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -152,11 +138,138 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,                              Src2Regs[i], CarryIn);        DstRegs.push_back(DstReg); -      Indexes.push_back(i * NarrowSize);        CarryIn = CarryOut;      }      unsigned DstReg = MI.getOperand(0).getReg(); -    MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); +    MIRBuilder.buildMerge(DstReg, DstRegs); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_INSERT: { +    if (TypeIdx != 0) +      return UnableToLegalize; + +    int64_t NarrowSize = NarrowTy.getSizeInBits(); +    int NumParts = +        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + +    SmallVector<unsigned, 2> SrcRegs, DstRegs; +    SmallVector<uint64_t, 2> Indexes; +    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + +    unsigned OpReg = MI.getOperand(2).getReg(); +    int64_t OpStart = MI.getOperand(3).getImm(); +    int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); +    for (int i = 0; i < NumParts; ++i) { +      unsigned DstStart = i * NarrowSize; + +      if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { +        // No part of the insert affects this subregister, forward the original. +        DstRegs.push_back(SrcRegs[i]); +        continue; +      } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { +        // The entire subregister is defined by this insert, forward the new +        // value. +        DstRegs.push_back(OpReg); +        continue; +      } + +      // OpSegStart is where this destination segment would start in OpReg if it +      // extended infinitely in both directions. 
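// Illustrative note (not part of the patch): the G_ADD narrowing above splits
// a wide add into per-part adds chained through a carry (G_UADDE) and then
// reassembles the result with G_MERGE_VALUES. The same computation on plain
// integers, 64 bits split into two 32-bit limbs:
#include <cassert>
#include <cstdint>

uint64_t addViaLimbs(uint64_t A, uint64_t B) {
  uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
  uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
  uint64_t LoSum = (uint64_t)ALo + BLo;
  uint32_t Carry = (uint32_t)(LoSum >> 32); // carry-out of the low limb
  uint32_t Lo = (uint32_t)LoSum;
  uint32_t Hi = AHi + BHi + Carry;          // carry-in of the high limb
  return (uint64_t)Hi << 32 | Lo;           // merge of the parts
}

int main() { assert(addViaLimbs(0xffffffffULL, 1) == 0x100000000ULL); }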
+      int64_t ExtractOffset, InsertOffset, SegSize; +      if (OpStart < DstStart) { +        InsertOffset = 0; +        ExtractOffset = DstStart - OpStart; +        SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); +      } else { +        InsertOffset = OpStart - DstStart; +        ExtractOffset = 0; +        SegSize = +            std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); +      } + +      unsigned SegReg = OpReg; +      if (ExtractOffset != 0 || SegSize != OpSize) { +        // A genuine extract is needed. +        SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); +        MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); +      } + +      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); +      MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset); +      DstRegs.push_back(DstReg); +    } + +    assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); +    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_LOAD: { +    unsigned NarrowSize = NarrowTy.getSizeInBits(); +    int NumParts = +        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; +    LLT NarrowPtrTy = LLT::pointer( +        MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + +    SmallVector<unsigned, 2> DstRegs; +    for (int i = 0; i < NumParts; ++i) { +      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); +      unsigned SrcReg = MRI.createGenericVirtualRegister(NarrowPtrTy); +      unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); + +      MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); +      MIRBuilder.buildGEP(SrcReg, MI.getOperand(1).getReg(), Offset); +      // TODO: This is conservatively correct, but we probably want to split the +      // memory operands in the future. +      MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin()); + +      DstRegs.push_back(DstReg); +    } +    unsigned DstReg = MI.getOperand(0).getReg(); +    MIRBuilder.buildMerge(DstReg, DstRegs); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_STORE: { +    unsigned NarrowSize = NarrowTy.getSizeInBits(); +    int NumParts = +        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; +    LLT NarrowPtrTy = LLT::pointer( +        MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + +    SmallVector<unsigned, 2> SrcRegs; +    extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs); + +    for (int i = 0; i < NumParts; ++i) { +      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowPtrTy); +      unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); +      MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); +      MIRBuilder.buildGEP(DstReg, MI.getOperand(1).getReg(), Offset); +      // TODO: This is conservatively correct, but we probably want to split the +      // memory operands in the future. 
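// Illustrative note (not part of the patch): the G_LOAD narrowing above emits
// one load per part at byte offset i * NarrowSize / 8 and merges the parts.
// A standalone sketch of the equivalent access pattern, assuming a
// little-endian layout so that part 0 is the low half:
#include <cstdint>
#include <cstring>

uint64_t narrowLoad64(const unsigned char *P) {
  uint32_t Parts[2];
  for (int I = 0; I < 2; ++I)
    std::memcpy(&Parts[I], P + I * 4, 4);       // load of the I-th narrow part
  return (uint64_t)Parts[1] << 32 | Parts[0];   // merge of the parts
}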
+      MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin()); +    } +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_CONSTANT: { +    unsigned NarrowSize = NarrowTy.getSizeInBits(); +    int NumParts = +        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; +    const APInt &Cst = MI.getOperand(1).getCImm()->getValue(); +    LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + +    SmallVector<unsigned, 2> DstRegs; +    for (int i = 0; i < NumParts; ++i) { +      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); +      ConstantInt *CI = +          ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize)); +      MIRBuilder.buildConstant(DstReg, *CI); +      DstRegs.push_back(DstReg); +    } +    unsigned DstReg = MI.getOperand(0).getReg(); +    MIRBuilder.buildMerge(DstReg, DstRegs);      MI.eraseFromParent();      return Legalized;    } @@ -175,7 +288,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {    case TargetOpcode::G_MUL:    case TargetOpcode::G_OR:    case TargetOpcode::G_XOR: -  case TargetOpcode::G_SUB: { +  case TargetOpcode::G_SUB: +  case TargetOpcode::G_SHL: {      // Perform operation at larger width (any extension is fine here, high bits      // don't affect the result) and then truncate the result back to the      // original type. @@ -195,10 +309,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {      return Legalized;    }    case TargetOpcode::G_SDIV: -  case TargetOpcode::G_UDIV: { -    unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV -                          ? TargetOpcode::G_SEXT -                          : TargetOpcode::G_ZEXT; +  case TargetOpcode::G_UDIV: +  case TargetOpcode::G_ASHR: +  case TargetOpcode::G_LSHR: { +    unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || +                             MI.getOpcode() == TargetOpcode::G_ASHR +                         ? TargetOpcode::G_SEXT +                         : TargetOpcode::G_ZEXT;      unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy);      MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse( @@ -218,6 +335,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {      MI.eraseFromParent();      return Legalized;    } +  case TargetOpcode::G_SELECT: { +    if (TypeIdx != 0) +      return UnableToLegalize; + +    // Perform operation at larger width (any extension is fine here, high bits +    // don't affect the result) and then truncate the result back to the +    // original type. 
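// Illustrative note (not part of the patch): the G_CONSTANT narrowing above
// produces each part with Cst.lshr(NarrowSize * i).trunc(NarrowSize). The
// same split of a plain 64-bit immediate into two 32-bit immediates:
#include <cassert>
#include <cstdint>

void splitConstant64(uint64_t Cst, uint32_t Parts[2]) {
  for (int I = 0; I < 2; ++I)
    Parts[I] = (uint32_t)(Cst >> (32 * I)); // shift right, then truncate
}

int main() {
  uint32_t P[2];
  splitConstant64(0x0123456789abcdefULL, P);
  assert(P[0] == 0x89abcdefu && P[1] == 0x01234567u);
}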
+    unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); +    unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); +    MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg()); +    MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg()); + +    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); +    MIRBuilder.buildInstr(TargetOpcode::G_SELECT) +        .addDef(DstExt) +        .addReg(MI.getOperand(1).getReg()) +        .addUse(Src1Ext) +        .addUse(Src2Ext); + +    MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_FPTOSI: +  case TargetOpcode::G_FPTOUI: { +    if (TypeIdx != 0) +      return UnableToLegalize; + +    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); +    MIRBuilder.buildInstr(MI.getOpcode()) +        .addDef(DstExt) +        .addUse(MI.getOperand(1).getReg()); + +    MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_SITOFP: +  case TargetOpcode::G_UITOFP: { +    if (TypeIdx != 1) +      return UnableToLegalize; + +    unsigned Src = MI.getOperand(1).getReg(); +    unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + +    if (MI.getOpcode() == TargetOpcode::G_SITOFP) { +      MIRBuilder.buildSExt(SrcExt, Src); +    } else { +      assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op"); +      MIRBuilder.buildZExt(SrcExt, Src); +    } + +    MIRBuilder.buildInstr(MI.getOpcode()) +        .addDef(MI.getOperand(0).getReg()) +        .addUse(SrcExt); + +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_INSERT: { +    if (TypeIdx != 0) +      return UnableToLegalize; + +    unsigned Src = MI.getOperand(1).getReg(); +    unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); +    MIRBuilder.buildAnyExt(SrcExt, Src); + +    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); +    auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(), +                                      MI.getOperand(3).getImm()); +    for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) { +      MIB.addReg(MI.getOperand(OpNum).getReg()); +      MIB.addImm(MI.getOperand(OpNum + 1).getImm()); +    } + +    MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); +    MI.eraseFromParent(); +    return Legalized; +  }    case TargetOpcode::G_LOAD: {      assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==                 WideTy.getSizeInBits() && @@ -231,12 +427,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {      return Legalized;    }    case TargetOpcode::G_STORE: { -    assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == -               WideTy.getSizeInBits() && -           "illegal to increase number of bytes modified by a store"); +    if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) || +        WideTy != LLT::scalar(8)) +      return UnableToLegalize; + +    auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); +    auto Content = TLI.getBooleanContents(false, false); + +    unsigned ExtOp = TargetOpcode::G_ANYEXT; +    if (Content == TargetLoweringBase::ZeroOrOneBooleanContent) +      ExtOp = TargetOpcode::G_ZEXT; +    else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent) +      ExtOp = TargetOpcode::G_SEXT; +    else +      ExtOp = TargetOpcode::G_ANYEXT;   
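// Illustrative note (not part of the patch): when widening an i1 G_STORE, the
// extension opcode chosen above follows the target's boolean contents: a
// zero-or-one boolean is zero-extended, a zero-or-minus-one boolean is
// sign-extended, and anything else may be any-extended. On plain integers:
#include <cassert>
#include <cstdint>

uint8_t widenBoolZeroOrOne(bool B) { return B ? 0x01 : 0x00; }      // G_ZEXT
uint8_t widenBoolZeroOrMinusOne(bool B) { return B ? 0xff : 0x00; } // G_SEXT

int main() {
  assert(widenBoolZeroOrOne(true) == 0x01);
  assert(widenBoolZeroOrMinusOne(true) == 0xff);
}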
   unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); -    MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg()); +    MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse( +        MI.getOperand(0).getReg());      MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(),                            **MI.memoperands_begin());      MI.eraseFromParent(); @@ -315,6 +523,83 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {      MI.eraseFromParent();      return Legalized;    } +  case TargetOpcode::G_SMULO: +  case TargetOpcode::G_UMULO: { +    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the +    // result. +    unsigned Res = MI.getOperand(0).getReg(); +    unsigned Overflow = MI.getOperand(1).getReg(); +    unsigned LHS = MI.getOperand(2).getReg(); +    unsigned RHS = MI.getOperand(3).getReg(); + +    MIRBuilder.buildMul(Res, LHS, RHS); + +    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO +                          ? TargetOpcode::G_SMULH +                          : TargetOpcode::G_UMULH; + +    unsigned HiPart = MRI.createGenericVirtualRegister(Ty); +    MIRBuilder.buildInstr(Opcode) +      .addDef(HiPart) +      .addUse(LHS) +      .addUse(RHS); + +    unsigned Zero = MRI.createGenericVirtualRegister(Ty); +    MIRBuilder.buildConstant(Zero, 0); +    MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_FNEG: { +    // TODO: Handle vector types once we are able to +    // represent them. +    if (Ty.isVector()) +      return UnableToLegalize; +    unsigned Res = MI.getOperand(0).getReg(); +    Type *ZeroTy; +    LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); +    switch (Ty.getSizeInBits()) { +    case 16: +      ZeroTy = Type::getHalfTy(Ctx); +      break; +    case 32: +      ZeroTy = Type::getFloatTy(Ctx); +      break; +    case 64: +      ZeroTy = Type::getDoubleTy(Ctx); +      break; +    default: +      llvm_unreachable("unexpected floating-point type"); +    } +    ConstantFP &ZeroForNegation = +        *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy)); +    unsigned Zero = MRI.createGenericVirtualRegister(Ty); +    MIRBuilder.buildFConstant(Zero, ZeroForNegation); +    MIRBuilder.buildInstr(TargetOpcode::G_FSUB) +        .addDef(Res) +        .addUse(Zero) +        .addUse(MI.getOperand(1).getReg()); +    MI.eraseFromParent(); +    return Legalized; +  } +  case TargetOpcode::G_FSUB: { +    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). +    // First, check if G_FNEG is marked as Lower. If so, we may +    // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
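// Illustrative note (not part of the patch): the two lowerings above, written
// out on plain doubles. fneg(x) is a subtraction from a signed zero so that
// negating +0.0 still yields -0.0, and fsub(a, b) becomes a + fneg(b); the
// check described in the comment above is what keeps the two rewrites from
// feeding each other forever.
#include <cassert>
#include <cmath>

double lowerFNeg(double X) { return -0.0 - X; }
double lowerFSub(double A, double B) { return A + lowerFNeg(B); }

int main() {
  assert(std::signbit(lowerFNeg(0.0))); // +0.0 negates to -0.0
  assert(lowerFSub(5.0, 3.0) == 2.0);
}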
+    if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower) +      return UnableToLegalize; +    unsigned Res = MI.getOperand(0).getReg(); +    unsigned LHS = MI.getOperand(1).getReg(); +    unsigned RHS = MI.getOperand(2).getReg(); +    unsigned Neg = MRI.createGenericVirtualRegister(Ty); +    MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS); +    MIRBuilder.buildInstr(TargetOpcode::G_FADD) +        .addDef(Res) +        .addUse(LHS) +        .addUse(Neg); +    MI.eraseFromParent(); +    return Legalized; +  }    }  } @@ -335,7 +620,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,      MIRBuilder.setInstr(MI);      SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; -    SmallVector<uint64_t, 2> Indexes;      extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);      extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -343,10 +627,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,        unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);        MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]);        DstRegs.push_back(DstReg); -      Indexes.push_back(i * NarrowSize);      } -    MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); +    MIRBuilder.buildMerge(DstReg, DstRegs);      MI.eraseFromParent();      return Legalized;    } diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index e49662075ed5..eaf4056e47ea 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -41,6 +41,8 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {    DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;    DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; +  DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; +  DefaultActions[TargetOpcode::G_FNEG] = Lower;  }  void LegalizerInfo::computeTables() { @@ -71,28 +73,36 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {    // These *have* to be implemented for now, they're the fundamental basis of    // how everything else is transformed. -  // Nothing is going to go well with types that aren't a power of 2 yet, so -  // don't even try because we might make things worse. -  if (!isPowerOf2_64(Aspect.Type.getSizeInBits())) -      return std::make_pair(Unsupported, LLT()); -    // FIXME: the long-term plan calls for expansion in terms of load/store (if    // they're not legal).    if (Aspect.Opcode == TargetOpcode::G_SEQUENCE || -      Aspect.Opcode == TargetOpcode::G_EXTRACT) +      Aspect.Opcode == TargetOpcode::G_EXTRACT || +      Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || +      Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)      return std::make_pair(Legal, Aspect.Type); +  LLT Ty = Aspect.Type;    LegalizeAction Action = findInActions(Aspect); +  // LegalizerHelper is not able to handle non-power-of-2 types right now, so do +  // not try to legalize them unless they are marked as Legal or Custom. +  // FIXME: This is a temporary hack until the general non-power-of-2 +  // legalization works. 
+  if (!isPowerOf2_64(Ty.getSizeInBits()) && +      !(Action == Legal || Action == Custom)) +    return std::make_pair(Unsupported, LLT()); +    if (Action != NotFound)      return findLegalAction(Aspect, Action);    unsigned Opcode = Aspect.Opcode; -  LLT Ty = Aspect.Type;    if (!Ty.isVector()) {      auto DefaultAction = DefaultActions.find(Aspect.Opcode);      if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)        return std::make_pair(Legal, Ty); +    if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower) +      return std::make_pair(Lower, Ty); +      if (DefaultAction == DefaultActions.end() ||          DefaultAction->second != NarrowScalar)        return std::make_pair(Unsupported, LLT()); @@ -160,6 +170,7 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,    case Legal:    case Lower:    case Libcall: +  case Custom:      return Aspect.Type;    case NarrowScalar: {      return findLegalType(Aspect, @@ -180,3 +191,9 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,    }    }  } + +bool LegalizerInfo::legalizeCustom(MachineInstr &MI, +                                   MachineRegisterInfo &MRI, +                                   MachineIRBuilder &MIRBuilder) const { +  return false; +} diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c04f6e4ae897..8d1a263395a0 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -15,6 +15,7 @@  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBuilder.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugInfo.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetOpcodes.h"  #include "llvm/Target/TargetSubtargetInfo.h" @@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,  void MachineIRBuilder::recordInsertions(      std::function<void(MachineInstr *)> Inserted) { -  InsertedInstr = Inserted; +  InsertedInstr = std::move(Inserted);  }  void MachineIRBuilder::stopRecordingInsertions() { @@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {    return MIB;  } +MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue( +    unsigned Reg, const MDNode *Variable, const MDNode *Expr) { +  assert(isa<DILocalVariable>(Variable) && "not a variable"); +  assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); +  assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && +         "Expected inlined-at fields to agree"); +  return buildInstr(TargetOpcode::DBG_VALUE) +      .addReg(Reg, RegState::Debug) +      .addReg(0, RegState::Debug) +      .addMetadata(Variable) +      .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue( +    unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) { +  assert(isa<DILocalVariable>(Variable) && "not a variable"); +  assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); +  assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && +         "Expected inlined-at fields to agree"); +  return buildInstr(TargetOpcode::DBG_VALUE) +      .addReg(Reg, RegState::Debug) +      .addImm(Offset) +      .addMetadata(Variable) +      .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, +                                                      const MDNode *Variable, +                 
                                     const MDNode *Expr) { +  assert(isa<DILocalVariable>(Variable) && "not a variable"); +  assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); +  assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && +         "Expected inlined-at fields to agree"); +  return buildInstr(TargetOpcode::DBG_VALUE) +      .addFrameIndex(FI) +      .addImm(0) +      .addMetadata(Variable) +      .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, +                                                         unsigned Offset, +                                                         const MDNode *Variable, +                                                         const MDNode *Expr) { +  assert(isa<DILocalVariable>(Variable) && "not a variable"); +  assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); +  assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && +         "Expected inlined-at fields to agree"); +  auto MIB = buildInstr(TargetOpcode::DBG_VALUE); +  if (auto *CI = dyn_cast<ConstantInt>(&C)) { +    if (CI->getBitWidth() > 64) +      MIB.addCImm(CI); +    else +      MIB.addImm(CI->getZExtValue()); +  } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) { +    MIB.addFPImm(CFP); +  } else { +    // Insert %noreg if we didn't find a usable constant and had to drop it. +    MIB.addReg(0U); +  } + +  return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr); +} +  MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {    assert(MRI->getType(Res).isPointer() && "invalid operand type");    return buildInstr(TargetOpcode::G_FRAME_INDEX) @@ -126,6 +191,17 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,        .addUse(Op1);  } +MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0, +                                                   uint32_t NumBits) { +  assert(MRI->getType(Res).isPointer() && +         MRI->getType(Res) == MRI->getType(Op0) && "type mismatch"); + +  return buildInstr(TargetOpcode::G_PTR_MASK) +      .addDef(Res) +      .addUse(Op0) +      .addImm(NumBits); +} +  MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,                                                 unsigned Op1) {    assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && @@ -152,10 +228,27 @@ MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,        .addUse(Op1);  } +MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0, +                                               unsigned Op1) { +  assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && +         "invalid operand type"); +  assert(MRI->getType(Res) == MRI->getType(Op0) && +         MRI->getType(Res) == MRI->getType(Op1) && "type mismatch"); + +  return buildInstr(TargetOpcode::G_AND) +      .addDef(Res) +      .addUse(Op0) +      .addUse(Op1); +} +  MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {    return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);  } +MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { +  return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); +} +  MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) {    return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);  } @@ -262,34 +355,56 @@ MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,  
  return buildInstr(Opcode).addDef(Res).addUse(Op);  } -MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results, -                                                   ArrayRef<uint64_t> Indices, -                                                   unsigned Src) { -#ifndef NDEBUG -  assert(Results.size() == Indices.size() && "inconsistent number of regs"); -  assert(!Results.empty() && "invalid trivial extract"); -  assert(std::is_sorted(Indices.begin(), Indices.end()) && -         "extract offsets must be in ascending order"); +MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res, +                                                       unsigned Op) { +  unsigned Opcode = TargetOpcode::COPY; +  if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) +    Opcode = TargetOpcode::G_ZEXT; +  else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) +    Opcode = TargetOpcode::G_TRUNC; -  assert(MRI->getType(Src).isValid() && "invalid operand type"); -  for (auto Res : Results) -    assert(MRI->getType(Res).isValid() && "invalid operand type"); -#endif +  return buildInstr(Opcode).addDef(Res).addUse(Op); +} -  auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT)); -  for (auto Res : Results) -    MIB.addDef(Res); -  MIB.addUse(Src); +MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) { +  LLT SrcTy = MRI->getType(Src); +  LLT DstTy = MRI->getType(Dst); +  if (SrcTy == DstTy) +    return buildCopy(Dst, Src); + +  unsigned Opcode; +  if (SrcTy.isPointer() && DstTy.isScalar()) +    Opcode = TargetOpcode::G_PTRTOINT; +  else if (DstTy.isPointer() && SrcTy.isScalar()) +    Opcode = TargetOpcode::G_INTTOPTR; +  else { +    assert(!SrcTy.isPointer() && !DstTy.isPointer() && "n G_ADDRCAST yet"); +    Opcode = TargetOpcode::G_BITCAST; +  } -  for (auto Idx : Indices) -    MIB.addImm(Idx); +  return buildInstr(Opcode).addDef(Dst).addUse(Src); +} -  getMBB().insert(getInsertPt(), MIB); -  if (InsertedInstr) -    InsertedInstr(MIB); +MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, +                                                   uint64_t Index) { +#ifndef NDEBUG +  assert(MRI->getType(Src).isValid() && "invalid operand type"); +  assert(MRI->getType(Res).isValid() && "invalid operand type"); +  assert(Index + MRI->getType(Res).getSizeInBits() <= +             MRI->getType(Src).getSizeInBits() && +         "extracting off end of register"); +#endif -  return MIB; +  if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) { +    assert(Index == 0 && "insertion past the end of a register"); +    return buildCast(Res, Src); +  } + +  return buildInstr(TargetOpcode::G_EXTRACT) +      .addDef(Res) +      .addUse(Src) +      .addImm(Index);  }  MachineInstrBuilder @@ -316,6 +431,64 @@ MachineIRBuilder::buildSequence(unsigned Res,    return MIB;  } +MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { +  return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res); +} + +MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, +                                                 ArrayRef<unsigned> Ops) { + +#ifndef NDEBUG +  assert(!Ops.empty() && "invalid trivial sequence"); +  LLT Ty = MRI->getType(Ops[0]); +  for (auto Reg : Ops) +    assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); +  assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() == +             MRI->getType(Res).getSizeInBits() && +         "input 
operands do not cover output register"); +#endif + +  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); +  MIB.addDef(Res); +  for (unsigned i = 0; i < Ops.size(); ++i) +    MIB.addUse(Ops[i]); +  return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, +                                                   unsigned Op) { + +#ifndef NDEBUG +  assert(!Res.empty() && "invalid trivial sequence"); +  LLT Ty = MRI->getType(Res[0]); +  for (auto Reg : Res) +    assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); +  assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() == +             MRI->getType(Op).getSizeInBits() && +         "input operands do not cover output register"); +#endif + +  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES); +  for (unsigned i = 0; i < Res.size(); ++i) +    MIB.addDef(Res[i]); +  MIB.addUse(Op); +  return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, +                                                  unsigned Op, unsigned Index) { +  if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) { +    assert(Index == 0 && "insertion past the end of a register"); +    return buildCast(Res, Op); +  } + +  return buildInstr(TargetOpcode::G_INSERT) +      .addDef(Res) +      .addUse(Src) +      .addUse(Op) +      .addImm(Index); +} +  MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,                                                       unsigned Res,                                                       bool HasSideEffects) { @@ -395,9 +568,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,    if (ResTy.isScalar() || ResTy.isPointer())      assert(MRI->getType(Tst).isScalar() && "type mismatch");    else -    assert(MRI->getType(Tst).isVector() && -           MRI->getType(Tst).getNumElements() == -               MRI->getType(Op0).getNumElements() && +    assert((MRI->getType(Tst).isScalar() || +            (MRI->getType(Tst).isVector() && +             MRI->getType(Tst).getNumElements() == +                 MRI->getType(Op0).getNumElements())) &&             "type mismatch");  #endif @@ -408,6 +582,46 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,        .addUse(Op1);  } +MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res, +                                                               unsigned Val, +                                                               unsigned Elt, +                                                               unsigned Idx) { +#ifndef NDEBUG +  LLT ResTy = MRI->getType(Res); +  LLT ValTy = MRI->getType(Val); +  LLT EltTy = MRI->getType(Elt); +  LLT IdxTy = MRI->getType(Idx); +  assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type"); +  assert(EltTy.isScalar() && IdxTy.isScalar() && "invalid operand type"); +  assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch"); +  assert(ResTy.getElementType() == EltTy && "type mismatch"); +#endif + +  return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT) +      .addDef(Res) +      .addUse(Val) +      .addUse(Elt) +      .addUse(Idx); +} + +MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res, +                                                                unsigned Val, +                                                                unsigned Idx) { +#ifndef NDEBUG +  LLT ResTy = 
MRI->getType(Res); +  LLT ValTy = MRI->getType(Val); +  LLT IdxTy = MRI->getType(Idx); +  assert(ValTy.isVector() && "invalid operand type"); +  assert(ResTy.isScalar() && IdxTy.isScalar() && "invalid operand type"); +  assert(ValTy.getElementType() == ResTy && "type mismatch"); +#endif + +  return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT) +      .addDef(Res) +      .addUse(Val) +      .addUse(Idx); +} +  void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src,                                          bool IsExtend) {  #ifndef NDEBUG diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index cc026ef27296..f935390a8d1b 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -14,6 +14,7 @@  #include "llvm/ADT/PostOrderIterator.h"  #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"  #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/Utils.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -71,6 +72,7 @@ void RegBankSelect::init(MachineFunction &MF) {      MBPI = nullptr;    }    MIRBuilder.setMF(MF); +  MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);  }  void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -585,18 +587,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {    // LegalizerInfo as it's currently in the separate GlobalISel library.    const MachineRegisterInfo &MRI = MF.getRegInfo();    if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) { -    for (const MachineBasicBlock &MBB : MF) { -      for (const MachineInstr &MI : MBB) { +    for (MachineBasicBlock &MBB : MF) { +      for (MachineInstr &MI : MBB) {          if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { -          if (!TPC->isGlobalISelAbortEnabled()) { -            MF.getProperties().set( -                MachineFunctionProperties::Property::FailedISel); -            return false; -          } -          std::string ErrStorage; -          raw_string_ostream Err(ErrStorage); -          Err << "Instruction is not legal: " << MI << '\n'; -          report_fatal_error(Err.str()); +          reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", +                             "instruction is not legal", MI); +          return false;          }        }      } @@ -622,9 +618,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {          continue;        if (!assignInstr(MI)) { -        if (TPC->isGlobalISelAbortEnabled()) -          report_fatal_error("Unable to map instruction"); -        MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); +        reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", +                           "unable to map instruction", MI);          return false;        }      } @@ -968,10 +963,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {           LocalFreq == Cost.LocalFreq;  } -void RegBankSelect::MappingCost::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const {    print(dbgs());    dbgs() << '\n';  } +#endif  void RegBankSelect::MappingCost::print(raw_ostream &OS) const {    if (*this == ImpossibleCost()) { diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 
49d676f11da6..940957d02152 100644
--- a/lib/CodeGen/GlobalISel/RegisterBank.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -19,10 +19,11 @@ using namespace llvm;

 const unsigned RegisterBank::InvalidID = UINT_MAX;

-RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size,
-                           const uint32_t *CoveredClasses)
+RegisterBank::RegisterBank(
+    unsigned ID, const char *Name, unsigned Size,
+    const uint32_t *CoveredClasses, unsigned NumRegClasses)
     : ID(ID), Name(Name), Size(Size) {
-  ContainedRegClasses.resize(200);
+  ContainedRegClasses.resize(NumRegClasses);
   ContainedRegClasses.setBitsInMask(CoveredClasses);
 }

@@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
   return &OtherRB == this;
 }

+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
   print(dbgs(), /* IsForDebug */ true, TRI);
 }
+#endif

 void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
                          const TargetRegisterInfo *TRI) const {
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index da5ab0b9fb7b..b2df2f159676 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -63,13 +63,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks,
 #endif // NDEBUG
 }

-RegisterBankInfo::~RegisterBankInfo() {
-  for (auto It : MapOfPartialMappings)
-    delete It.second;
-  for (auto It : MapOfValueMappings)
-    delete It.second;
-}
-
 bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
 #ifndef NDEBUG
   for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
@@ -133,15 +126,26 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
   return &RC;
 }

+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for mapping because
+/// they don't have actual register constraints. Moreover,
+/// they sometimes have register classes assigned and we can
+/// just use that instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+  return MI.isCopy() || MI.isPHI() ||
+         MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
 RegisterBankInfo::InstructionMapping
 RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
   // For copies we want to walk over the operands and try to find one
   // that has a register bank since the instruction itself will not get
   // us any constraint.
-  bool isCopyLike = MI.isCopy() || MI.isPHI();
+  bool IsCopyLike = isCopyLike(MI);
   // For copy-like instructions, only the mapping of the definition
   // is important. The rest is not constrained.
-  unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands();
+  unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands();

   RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
                                                /*OperandsMapping*/ nullptr,
@@ -175,7 +179,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
     // For copy-like instructions, we want to reuse the register bank
     // that is already set on Reg, if any, since those instructions do
     // not have any constraints.
-    const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+    const RegisterBank *CurRegBank = IsCopyLike ?
AltRegBank : nullptr;      if (!CurRegBank) {        // If this is a target specific instruction, we can deduce        // the register bank from the encoding constraints. @@ -184,7 +188,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {          // All our attempts failed, give up.          CompleteMapping = false; -        if (!isCopyLike) +        if (!IsCopyLike)            // MI does not carry enough information to guess the mapping.            return InstructionMapping();          continue; @@ -192,7 +196,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {      }      const ValueMapping *ValMapping =          &getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank); -    if (isCopyLike) { +    if (IsCopyLike) {        OperandsMapping[0] = ValMapping;        CompleteMapping = true;        break; @@ -200,7 +204,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {      OperandsMapping[OpIdx] = ValMapping;    } -  if (isCopyLike && !CompleteMapping) +  if (IsCopyLike && !CompleteMapping)      // No way to deduce the type from what we have.      return InstructionMapping(); @@ -234,8 +238,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length,    ++NumPartialMappingsCreated; -  const PartialMapping *&PartMapping = MapOfPartialMappings[Hash]; -  PartMapping = new PartialMapping{StartIdx, Length, RegBank}; +  auto &PartMapping = MapOfPartialMappings[Hash]; +  PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank);    return *PartMapping;  } @@ -268,8 +272,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown,    ++NumValueMappingsCreated; -  const ValueMapping *&ValMapping = MapOfValueMappings[Hash]; -  ValMapping = new ValueMapping{BreakDown, NumBreakDowns}; +  auto &ValMapping = MapOfValueMappings[Hash]; +  ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns);    return *ValMapping;  } @@ -282,9 +286,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {    // The addresses of the value mapping are unique.    // Therefore, we can use them directly to hash the operand mapping.    hash_code Hash = hash_combine_range(Begin, End); -  const auto &It = MapOfOperandsMappings.find(Hash); -  if (It != MapOfOperandsMappings.end()) -    return It->second; +  auto &Res = MapOfOperandsMappings[Hash]; +  if (Res) +    return Res.get();    ++NumOperandsMappingsCreated; @@ -293,8 +297,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {    // mapping, because we use the pointer of the ValueMapping    // to hash and we expect them to uniquely identify an instance    // of value mapping. 
-  ValueMapping *&Res = MapOfOperandsMappings[Hash]; -  Res = new ValueMapping[std::distance(Begin, End)]; +  Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End));    unsigned Idx = 0;    for (Iterator It = Begin; It != End; ++It, ++Idx) {      const ValueMapping *ValMap = *It; @@ -302,7 +305,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {        continue;      Res[Idx] = *ValMap;    } -  return Res; +  return Res.get();  }  const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( @@ -349,6 +352,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {  void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {    MachineInstr &MI = OpdMapper.getMI(); +  MachineRegisterInfo &MRI = OpdMapper.getMRI();    DEBUG(dbgs() << "Applying default-like mapping\n");    for (unsigned OpIdx = 0,                  EndIdx = OpdMapper.getInstrMapping().getNumOperands(); @@ -359,6 +363,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {        DEBUG(dbgs() << " is not a register, nothing to be done\n");        continue;      } +    if (!MO.getReg()) { +      DEBUG(dbgs() << " is %%noreg, nothing to be done\n"); +      continue; +    } +    assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns != +               0 && +           "Invalid mapping");      assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==                 1 &&             "This mapping is too complex for this function"); @@ -368,9 +379,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {        DEBUG(dbgs() << " has not been repaired, nothing to be done\n");        continue;      } -    DEBUG(dbgs() << " changed, replace " << MO.getReg()); -    MO.setReg(*NewRegs.begin()); -    DEBUG(dbgs() << " with " << MO.getReg()); +    unsigned OrigReg = MO.getReg(); +    unsigned NewReg = *NewRegs.begin(); +    DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr)); +    MO.setReg(NewReg); +    DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr)); + +    // The OperandsMapper creates plain scalar, we may have to fix that. +    // Check if the types match and if not, fix that. +    LLT OrigTy = MRI.getType(OrigReg); +    LLT NewTy = MRI.getType(NewReg); +    if (OrigTy != NewTy) { +      assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() && +             "Types with difference size cannot be handled by the default " +             "mapping"); +      DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to " +                   << OrigTy); +      MRI.setType(NewReg, OrigTy); +    } +    DEBUG(dbgs() << '\n');    }  } @@ -400,10 +427,12 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,  //------------------------------------------------------------------------------  // Helper classes implementation.  
//------------------------------------------------------------------------------ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const {    print(dbgs());    dbgs() << '\n';  } +#endif  bool RegisterBankInfo::PartialMapping::verify() const {    assert(RegBank && "Register bank not set"); @@ -451,10 +480,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const {    return true;  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const {    print(dbgs());    dbgs() << '\n';  } +#endif  void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {    OS << "#BreakDown: " << NumBreakDowns << " "; @@ -472,8 +503,7 @@ bool RegisterBankInfo::InstructionMapping::verify(    // Check that all the register operands are properly mapped.    // Check the constructor invariant.    // For PHI, we only care about mapping the definition. -  assert(NumOperands == -             ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) && +  assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) &&           "NumOperands must match, see constructor");    assert(MI.getParent() && MI.getParent()->getParent() &&           "MI must be connected to a MachineFunction"); @@ -503,10 +533,12 @@ bool RegisterBankInfo::InstructionMapping::verify(    return true;  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const {    print(dbgs());    dbgs() << '\n';  } +#endif  void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {    OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; @@ -576,6 +608,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {    for (unsigned &NewVReg : NewVRegsForOpIdx) {      assert(PartMap != ValMapping.end() && "Out-of-bound access");      assert(NewVReg == 0 && "Register has already been created"); +    // The new registers are always bound to scalar with the right size. +    // The actual type has to be set when the target does the mapping +    // of the instruction. +    // The rationale is that this generic code cannot guess how the +    // target plans to split the input type.      
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length));      MRI.setRegBank(NewVReg, *PartMap->RegBank);      ++PartMap; @@ -619,10 +656,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,    return Res;  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const {    print(dbgs(), true);    dbgs() << '\n';  } +#endif  void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,                                               bool ForDebug) const { diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index e50091833c26..606a59680a3d 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,10 +11,13 @@  //===----------------------------------------------------------------------===//  #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/ADT/Twine.h"  #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetRegisterInfo.h" @@ -43,3 +46,50 @@ unsigned llvm::constrainOperandRegClass(    return Reg;  } + +bool llvm::isTriviallyDead(const MachineInstr &MI, +                           const MachineRegisterInfo &MRI) { +  // If we can move an instruction, we can remove it.  Otherwise, it has +  // a side-effect of some sort. +  bool SawStore = false; +  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore)) +    return false; + +  // Instructions without side-effects are dead iff they only define dead vregs. +  for (auto &MO : MI.operands()) { +    if (!MO.isReg() || !MO.isDef()) +      continue; + +    unsigned Reg = MO.getReg(); +    if (TargetRegisterInfo::isPhysicalRegister(Reg) || +        !MRI.use_nodbg_empty(Reg)) +      return false; +  } +  return true; +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, +                              MachineOptimizationRemarkEmitter &MORE, +                              MachineOptimizationRemarkMissed &R) { +  MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + +  // Print the function name explicitly if we don't have a debug location (which +  // makes the diagnostic less useful) or if we're going to emit a raw error. 
+  if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) +    R << (" (in function: " + MF.getName() + ")").str(); + +  if (TPC.isGlobalISelAbortEnabled()) +    report_fatal_error(R.getMsg()); +  else +    MORE.emit(R); +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, +                              MachineOptimizationRemarkEmitter &MORE, +                              const char *PassName, StringRef Msg, +                              const MachineInstr &MI) { +  MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ", +                                    MI.getDebugLoc(), MI.getParent()); +  R << Msg << ": " << ore::MNV("Inst", MI); +  reportGISelFailure(MF, TPC, MORE, R); +} diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index b9f3d86eabd8..37fe41582333 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,    return TExit && TExit == FalseBBI.BB;  } -/// Shrink the provided inclusive range by one instruction. -/// If the range was one instruction (\p It == \p Begin), It is not modified, -/// but \p Empty is set to true. -static inline void shrinkInclusiveRange( -    MachineBasicBlock::iterator &Begin, -    MachineBasicBlock::iterator &It, -    bool &Empty) { -  if (It == Begin) -    Empty = true; -  else -    It--; -} -  /// Count duplicated instructions and move the iterators to show where they  /// are.  /// @param TIB True Iterator Begin @@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions(    while (TIB != TIE && FIB != FIE) {      // Skip dbg_value instructions. These do not count.      TIB = skipDebugInstructionsForward(TIB, TIE); -    if(TIB == TIE) -      break;      FIB = skipDebugInstructionsForward(FIB, FIE); -    if(FIB == FIE) +    if (TIB == TIE || FIB == FIE)        break;      if (!TIB->isIdenticalTo(*FIB))        break; @@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions(    if (TIB == TIE || FIB == FIE)      return true;    // Now, in preparation for counting duplicate instructions at the ends of the -  // blocks, move the end iterators up past any branch instructions. -  --TIE; -  --FIE; - -  // After this point TIB and TIE define an inclusive range, which means that -  // TIB == TIE is true when there is one more instruction to consider, not at -  // the end. Because we may not be able to go before TIB, we need a flag to -  // indicate a completely empty range. -  bool TEmpty = false, FEmpty = false; - -  // Upon exit TIE and FIE will both point at the last non-shared instruction. -  // They need to be moved forward to point past the last non-shared -  // instruction if the range they delimit is non-empty. -  auto IncrementEndIteratorsOnExit = make_scope_exit([&]() { -    if (!TEmpty) -      ++TIE; -    if (!FEmpty) -      ++FIE; -  }); +  // blocks, switch to reverse_iterators. Note that getReverse() returns an +  // iterator that points to the same instruction, unlike std::reverse_iterator. +  // We have to do our own shifting so that we get the same range. 
+  MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse()); +  MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse()); +  const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse()); +  const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse());    if (!TBB.succ_empty() || !FBB.succ_empty()) {      if (SkipUnconditionalBranches) { -      while (!TEmpty && TIE->isUnconditionalBranch()) -        shrinkInclusiveRange(TIB, TIE, TEmpty); -      while (!FEmpty && FIE->isUnconditionalBranch()) -        shrinkInclusiveRange(FIB, FIE, FEmpty); +      while (RTIE != RTIB && RTIE->isUnconditionalBranch()) +        ++RTIE; +      while (RFIE != RFIB && RFIE->isUnconditionalBranch()) +        ++RFIE;      }    } -  // If Dups1 includes all of a block, then don't count duplicate -  // instructions at the end of the blocks. -  if (TEmpty || FEmpty) -    return true; -    // Count duplicate instructions at the ends of the blocks. -  while (!TEmpty && !FEmpty) { +  while (RTIE != RTIB && RFIE != RFIB) {      // Skip dbg_value instructions. These do not count. -    TIE = skipDebugInstructionsBackward(TIE, TIB); -    FIE = skipDebugInstructionsBackward(FIE, FIB); -    TEmpty = TIE == TIB && TIE->isDebugValue(); -    FEmpty = FIE == FIB && FIE->isDebugValue(); -    if (TEmpty || FEmpty) +    // Note that these are reverse iterators going forward. +    RTIE = skipDebugInstructionsForward(RTIE, RTIB); +    RFIE = skipDebugInstructionsForward(RFIE, RFIB); +    if (RTIE == RTIB || RFIE == RFIB)        break; -    if (!TIE->isIdenticalTo(*FIE)) +    if (!RTIE->isIdenticalTo(*RFIE))        break;      // We have to verify that any branch instructions are the same, and then we      // don't count them toward the # of duplicate instructions. 
-    if (!TIE->isBranch()) +    if (!RTIE->isBranch())        ++Dups2; -    shrinkInclusiveRange(TIB, TIE, TEmpty); -    shrinkInclusiveRange(FIB, FIE, FEmpty); +    ++RTIE; +    ++RFIE;    } +  TIE = std::next(RTIE.getReverse()); +  FIE = std::next(RFIE.getReverse());    return true;  } @@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions(  static void verifySameBranchInstructions(      MachineBasicBlock *MBB1,      MachineBasicBlock *MBB2) { -  MachineBasicBlock::iterator B1 = MBB1->begin(); -  MachineBasicBlock::iterator B2 = MBB2->begin(); -  MachineBasicBlock::iterator E1 = std::prev(MBB1->end()); -  MachineBasicBlock::iterator E2 = std::prev(MBB2->end()); -  bool Empty1 = false, Empty2 = false; -  while (!Empty1 && !Empty2) { -    E1 = skipDebugInstructionsBackward(E1, B1); -    E2 = skipDebugInstructionsBackward(E2, B2); -    Empty1 = E1 == B1 && E1->isDebugValue(); -    Empty2 = E2 == B2 && E2->isDebugValue(); - -    if (Empty1 && Empty2) +  const MachineBasicBlock::reverse_iterator B1 = MBB1->rend(); +  const MachineBasicBlock::reverse_iterator B2 = MBB2->rend(); +  MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin(); +  MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin(); +  while (E1 != B1 && E2 != B2) { +    skipDebugInstructionsForward(E1, B1); +    skipDebugInstructionsForward(E2, B2); +    if (E1 == B1 && E2 == B2)        break; -    if (Empty1) { +    if (E1 == B1) {        assert(!E2->isBranch() && "Branch mis-match, one block is empty.");        break;      } -    if (Empty2) { +    if (E2 == B2) {        assert(!E1->isBranch() && "Branch mis-match, one block is empty.");        break;      } @@ -769,8 +734,8 @@ static void verifySameBranchInstructions(               "Branch mis-match, branch instructions don't match.");      else        break; -    shrinkInclusiveRange(B1, E1, Empty1); -    shrinkInclusiveRange(B2, E2, Empty2); +    ++E1; +    ++E2;    }  }  #endif @@ -2183,7 +2148,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {    // unknown probabilities into known ones.    // FIXME: This usage is too tricky and in the future we would like to    // eliminate all unknown probabilities in MBB. -  ToBBI.BB->normalizeSuccProbs(); +  if (ToBBI.IsBrAnalyzable) +    ToBBI.BB->normalizeSuccProbs();    SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(),                                                  FromMBB.succ_end()); @@ -2263,7 +2229,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {    // Normalize the probabilities of ToBBI.BB's successors with all adjustment    // we've done above. -  ToBBI.BB->normalizeSuccProbs(); +  if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable) +    ToBBI.BB->normalizeSuccProbs();    ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());    FromBBI.Predicate.clear(); diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 9588dfb72058..920c2a372a9b 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -22,6 +22,7 @@  // With the help of a runtime that understands the .fault_maps section,  // faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs  // a page fault. +// Store and LoadStore are also supported.  
//
//===----------------------------------------------------------------------===//

@@ -29,6 +30,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/FaultMaps.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass {
   const TargetRegisterInfo *TRI = nullptr;
   AliasAnalysis *AA = nullptr;
   MachineModuleInfo *MMI = nullptr;
+  MachineFrameInfo *MFI = nullptr;

   bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
                                  SmallVectorImpl<NullCheck> &NullCheckList);
-  MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
-                                   MachineBasicBlock *HandlerMBB);
+  MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                                    MachineBasicBlock *HandlerMBB);
   void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);

-  /// Is \p MI a memory operation that can be used to implicitly null check the
-  /// value in \p PointerReg?  \p PrevInsts is the set of instruction seen since
+  enum AliasResult {
+    AR_NoAlias,
+    AR_MayAlias,
+    AR_WillAliasEverything
+  };
+  /// Returns AR_NoAlias if the memory operation in \p MI does not alias with
+  /// \p PrevMI, AR_MayAlias if they may alias, and AR_WillAliasEverything if
+  /// they may alias and any further memory operation may alias with \p PrevMI.
+  AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI);
+
+  enum SuitabilityResult {
+    SR_Suitable,
+    SR_Unsuitable,
+    SR_Impossible
+  };
+  /// Return SR_Suitable if \p MI is a memory operation that can be used to
+  /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+  /// \p MI cannot be used to null check, and SR_Impossible if there is
+  /// no point continuing the lookup because no further instruction will be
+  /// usable. \p PrevInsts is the set of instructions seen since
   /// the explicit null check on \p PointerReg.
-  bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
-                          ArrayRef<MachineInstr *> PrevInsts);
+  SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts);

   /// Return true if \p FaultingMI can be hoisted from after the
   /// instructions in \p InstsSeenSoFar to before them.  Set \p Dependence to a
   /// non-null value if we also need to (and legally can) hoist a dependency.
-  bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg,
-                        ArrayRef<MachineInstr *> InstsSeenSoFar,
-                        MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+  bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+                    ArrayRef<MachineInstr *> InstsSeenSoFar,
+                    MachineBasicBlock *NullSucc, MachineInstr *&Dependence);

 public:
   static char ID;
@@ -193,7 +214,7 @@ public:
 }

 bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
-  if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects())
+  if (MI->isCall() || MI->hasUnmodeledSideEffects())
     return false;
   auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
   (void)IsRegMask;
@@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,

       unsigned RegB = MOB.getReg();

-      if (TRI->regsOverlap(RegA, RegB))
+      if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
         return false;
     }
   }
@@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getRegInfo().getTargetRegisterInfo();
   MMI = &MF.getMMI();
+  MFI = &MF.getFrameInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

   SmallVector<NullCheck, 16> NullCheckList;
@@ -283,36 +305,91 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
   return false;
 }

-bool ImplicitNullChecks::isSuitableMemoryOp(
-    MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) {
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
+                                        MachineInstr *PrevMI) {
+  // If it is not a memory access, skip the check.
+  if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+    return AR_NoAlias;
+  // Load-Load may alias
+  if (!(MI.mayStore() || PrevMI->mayStore()))
+    return AR_NoAlias;
+  // We lost the info, so conservatively assume they alias. If it was a store,
+  // there is no point continuing because we won't be able to check against it
+  // further.
+  if (MI.memoperands_empty())
+    return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+  if (PrevMI->memoperands_empty())
+    return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+  for (MachineMemOperand *MMO1 : MI.memoperands()) {
+    // MMO1 should have a value because it comes from the operation we'd like
+    // to use as an implicit null check.
+    assert(MMO1->getValue() && "MMO1 should have a Value!");
+    for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+      if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+        if (PSV->mayAlias(MFI))
+          return AR_MayAlias;
+        continue;
+      }
+      llvm::AliasResult AAResult = AA->alias(
+          MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+                         MMO1->getAAInfo()),
+          MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+                         MMO2->getAAInfo()));
+      if (AAResult != NoAlias)
+        return AR_MayAlias;
+    }
+  }
+  return AR_NoAlias;
+}
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts) {
   int64_t Offset;
   unsigned BaseReg;

   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
       BaseReg != PointerReg)
-    return false;
-
-  // We want the load to be issued at a sane offset from PointerReg, so that
-  // if PointerReg is null then the load reliably page faults.
-  if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize))
-    return false;
-
-  // Finally, we need to make sure that the load instruction actually is
-  // loading from PointerReg, and there isn't some re-definition of PointerReg
-  // between the compare and the load.
+    return SR_Unsuitable;
+
+  // We want the mem access to be issued at a sane offset from PointerReg,
+  // so that if PointerReg is null then the access reliably page faults.
+  if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() &&
+        Offset < PageSize))
+    return SR_Unsuitable;
+
+  // Finally, we need to make sure that the access instruction actually is
+  // accessing from PointerReg, and there isn't some re-definition of PointerReg
+  // between the compare and the memory access.
+  // If PointerReg has been redefined before then there is no point continuing
+  // the lookup, since this condition will fail for any further instruction.
+  SuitabilityResult Suitable = SR_Suitable;
   for (auto *PrevMI : PrevInsts)
-    for (auto &PrevMO : PrevMI->operands())
-      if (PrevMO.isReg() && PrevMO.getReg() &&
+    for (auto &PrevMO : PrevMI->operands()) {
+      if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() &&
           TRI->regsOverlap(PrevMO.getReg(), PointerReg))
-        return false;
-
-  return true;
+        return SR_Impossible;
+
+      // Check whether the current memory access aliases with the previous one.
+      // If we already found that it aliases then there is no need to continue.
+      // But we continue the base pointer check as it can result in SR_Impossible.
+      if (Suitable == SR_Suitable) { +        AliasResult AR = areMemoryOpsAliased(MI, PrevMI); +        if (AR == AR_WillAliasEverything) +          return SR_Impossible; +        if (AR == AR_MayAlias) +          Suitable = SR_Unsuitable; +      } +    } +  return Suitable;  } -bool ImplicitNullChecks::canHoistLoadInst( -    MachineInstr *FaultingMI, unsigned PointerReg, -    ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc, -    MachineInstr *&Dependence) { +bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI, +                                      unsigned PointerReg, +                                      ArrayRef<MachineInstr *> InstsSeenSoFar, +                                      MachineBasicBlock *NullSucc, +                                      MachineInstr *&Dependence) {    auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);    if (!DepResult.CanReorder)      return false; @@ -359,7 +436,8 @@ bool ImplicitNullChecks::canHoistLoadInst(      // The Dependency can't be re-defining the base register -- then we won't      // get the memory operation on the address we want.  This is already      // checked in \c IsSuitableMemoryOp. -    assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) && +    assert(!(DependenceMO.isDef() && +             TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&             "Should have been checked before!");    } @@ -481,9 +559,11 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(        return false;      MachineInstr *Dependence; -    if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) && -        canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, -                         Dependence)) { +    SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar); +    if (SR == SR_Impossible) +      return false; +    if (SR == SR_Suitable && +        canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {        NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,                                   NullSucc, Dependence);        return true; @@ -495,36 +575,42 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(    return false;  } -/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine -/// instruction.  The FAULTING_LOAD_OP instruction does the same load as LoadMI -/// (defining the same register), and branches to HandlerMBB if the load -/// faults.  The FAULTING_LOAD_OP instruction is inserted at the end of MBB. -MachineInstr * -ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI, -                                       MachineBasicBlock *MBB, -                                       MachineBasicBlock *HandlerMBB) { +/// Wrap a machine instruction, MI, into a FAULTING machine instruction. +/// The FAULTING instruction does the same load/store as MI +/// (defining the same register), and branches to HandlerMBB if the mem access +/// faults.  The FAULTING instruction is inserted at the end of MBB. +MachineInstr *ImplicitNullChecks::insertFaultingInstr( +    MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {    const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for                                   // all targets.    
DebugLoc DL; -  unsigned NumDefs = LoadMI->getDesc().getNumDefs(); +  unsigned NumDefs = MI->getDesc().getNumDefs();    assert(NumDefs <= 1 && "other cases unhandled!");    unsigned DefReg = NoRegister;    if (NumDefs != 0) { -    DefReg = LoadMI->defs().begin()->getReg(); -    assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && +    DefReg = MI->defs().begin()->getReg(); +    assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&             "expected exactly one def!");    } -  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg) +  FaultMaps::FaultKind FK; +  if (MI->mayLoad()) +    FK = +        MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad; +  else +    FK = FaultMaps::FaultingStore; + +  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg) +                 .addImm(FK)                   .addMBB(HandlerMBB) -                 .addImm(LoadMI->getOpcode()); +                 .addImm(MI->getOpcode()); -  for (auto &MO : LoadMI->uses()) -    MIB.addOperand(MO); +  for (auto &MO : MI->uses()) +    MIB.add(MO); -  MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end()); +  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());    return MIB;  } @@ -545,18 +631,18 @@ void ImplicitNullChecks::rewriteNullChecks(        NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);      } -    // Insert a faulting load where the conditional branch was originally.  We -    // check earlier ensures that this bit of code motion is legal.  We do not -    // touch the successors list for any basic block since we haven't changed -    // control flow, we've just made it implicit. -    MachineInstr *FaultingLoad = insertFaultingLoad( +    // Insert a faulting instruction where the conditional branch was +    // originally. We check earlier ensures that this bit of code motion +    // is legal.  We do not touch the successors list for any basic block +    // since we haven't changed control flow, we've just made it implicit. +    MachineInstr *FaultingInstr = insertFaultingInstr(          NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());      // Now the values defined by MemOperation, if any, are live-in of      // the block of MemOperation. -    // The original load operation may define implicit-defs alongside -    // the loaded value. +    // The original operation may define implicit-defs alongside +    // the value.      MachineBasicBlock *MBB = NC.getMemOperation()->getParent(); -    for (const MachineOperand &MO : FaultingLoad->operands()) { +    for (const MachineOperand &MO : FaultingInstr->operands()) {        if (!MO.isReg() || !MO.isDef())          continue;        unsigned Reg = MO.getReg(); diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 3d81184f774a..a1cb0a0695bf 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {        Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);    // We take the DebugLoc from MI, since OrigMI may be attributed to a -  // different source location.  +  // different source location.    
auto *NewMI = LIS.getInstructionFromIndex(DefIdx);    NewMI->setDebugLoc(MI.getDebugLoc()); @@ -686,7 +686,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {    return true;  } -#if !defined(NDEBUG) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  // Dump the range of instructions from B to E with their slot indexes.  static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,                                                 MachineBasicBlock::iterator E, diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index afd24067ace7..c6cc909e25d3 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) {            Type::getInt8PtrTy(Context),                                Type::getInt8PtrTy(Context),                                 Type::getInt8PtrTy(Context),  -                              DL.getIntPtrType(Context), nullptr); +                              DL.getIntPtrType(Context));          break;        case Intrinsic::memmove:          M.getOrInsertFunction("memmove",            Type::getInt8PtrTy(Context),                                Type::getInt8PtrTy(Context),                                 Type::getInt8PtrTy(Context),  -                              DL.getIntPtrType(Context), nullptr); +                              DL.getIntPtrType(Context));          break;        case Intrinsic::memset:          M.getOrInsertFunction("memset",            Type::getInt8PtrTy(Context),                                Type::getInt8PtrTy(Context),                                 Type::getInt32Ty(M.getContext()),  -                              DL.getIntPtrType(Context), nullptr); +                              DL.getIntPtrType(Context));          break;        case Intrinsic::sqrt:          EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl"); diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt index 86d3624a9d6e..07ea9dcaea7a 100644 --- a/lib/CodeGen/LLVMBuild.txt +++ b/lib/CodeGen/LLVMBuild.txt @@ -22,4 +22,4 @@ subdirectories = AsmPrinter SelectionDAG MIRParser GlobalISel  type = Library  name = CodeGen  parent = Libraries -required_libraries = Analysis BitReader BitWriter Core MC Scalar Support Target TransformUtils +required_libraries = Analysis BitReader BitWriter Core MC ProfileData Scalar Support Target TransformUtils diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 26794e28020e..7b1706f0f4ba 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -42,8 +42,8 @@ static cl::opt<cl::boolOrDefault>  EnableFastISelOption("fast-isel", cl::Hidden,    cl::desc("Enable the \"fast\" instruction selector")); -static cl::opt<bool> -    EnableGlobalISel("global-isel", cl::Hidden, cl::init(false), +static cl::opt<cl::boolOrDefault> +    EnableGlobalISel("global-isel", cl::Hidden,                       cl::desc("Enable the \"global\" instruction selector"));  void LLVMTargetMachine::initAsmInfo() { @@ -85,7 +85,7 @@ void LLVMTargetMachine::initAsmInfo() {  LLVMTargetMachine::LLVMTargetMachine(const Target &T,                                       StringRef DataLayoutString,                                       const Triple &TT, StringRef CPU, -                                     StringRef FS, TargetOptions Options, +                                     StringRef FS, const TargetOptions &Options,                           
            Reloc::Model RM, CodeModel::Model CM,                                       CodeGenOpt::Level OL)      : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) { @@ -149,7 +149,9 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,      TM->setFastISel(true);    // Ask the target for an isel. -  if (LLVM_UNLIKELY(EnableGlobalISel)) { +  // Enable GlobalISel if the target wants to, but allow that to be overridden. +  if (EnableGlobalISel == cl::BOU_TRUE || (EnableGlobalISel == cl::BOU_UNSET && +                                           PassConfig->isGlobalISelEnabled())) {      if (PassConfig->addIRTranslator())        return nullptr; @@ -172,11 +174,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,      // Pass to reset the MachineFunction if the ISel failed.      PM.add(createResetMachineFunctionPass( -        PassConfig->reportDiagnosticWhenGlobalISelFallback())); +        PassConfig->reportDiagnosticWhenGlobalISelFallback(), +        PassConfig->isGlobalISelAbortEnabled()));      // Provide a fallback path when we do not want to abort on      // not-yet-supported input. -    if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) && +    if (!PassConfig->isGlobalISelAbortEnabled() &&          PassConfig->addInstSelector())        return nullptr; diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp new file mode 100644 index 000000000000..996d40ca6e1e --- /dev/null +++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -0,0 +1,97 @@ +///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===// +/// +///                     The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// This is an alternative analysis pass to MachineBlockFrequencyInfo.  The +/// difference is that with this pass the block frequencies are not computed +/// when the analysis pass is executed but rather when the BFI result is +/// explicitly requested by the analysis client. 
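[Editor's note — illustration only, not part of the patch: a minimal sketch of how a client machine pass might consume this lazily computed result. The client pass name is invented; LazyMachineBlockFrequencyInfoPass and its getBFI() accessor come from this file and its header, and getBFI() is assumed to forward to calculateIfNotAvailable().]

#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;

namespace {
// Hypothetical client pass, for illustration only.
struct LazyBFIClientPass : public MachineFunctionPass {
  static char ID;
  LazyBFIClientPass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Requiring the lazy wrapper is cheap; nothing is computed yet.
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Block frequencies are computed here, on first use, and only if no
    // MachineBlockFrequencyInfo result is already available.
    const MachineBlockFrequencyInfo &MBFI =
        getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
    (void)MBFI.getBlockFreq(&MF.front());
    return false;
  }
};
char LazyBFIClientPass::ID = 0;
} // end anonymous namespace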
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "lazy-machine-block-freq" + +INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, +                      "Lazy Machine Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, +                    "Lazy Machine Block Frequency Analysis", true, true) + +char LazyMachineBlockFrequencyInfoPass::ID = 0; + +LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass() +    : MachineFunctionPass(ID) { +  initializeLazyMachineBlockFrequencyInfoPassPass( +      *PassRegistry::getPassRegistry()); +} + +void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS, +                                              const Module *M) const { +  getBFI().print(OS, M); +} + +void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage( +    AnalysisUsage &AU) const { +  AU.addRequired<MachineBranchProbabilityInfo>(); +  AU.setPreservesAll(); +  MachineFunctionPass::getAnalysisUsage(AU); +} + +void LazyMachineBlockFrequencyInfoPass::releaseMemory() { +  OwnedMBFI.reset(); +  OwnedMLI.reset(); +  OwnedMDT.reset(); +} + +MachineBlockFrequencyInfo & +LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { +  auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>(); +  if (MBFI) { +    DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n"); +    return *MBFI; +  } + +  auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>(); +  auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); +  auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>(); +  DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n"); +  DEBUG(if (MLI) dbgs() << "LoopInfo is available\n"); + +  if (!MLI) { +    DEBUG(dbgs() << "Building LoopInfo on the fly\n"); +    // First create a dominator tree. +    DEBUG(if (MDT) dbgs() << "DominatorTree is available\n"); + +    if (!MDT) { +      DEBUG(dbgs() << "Building DominatorTree on the fly\n"); +      OwnedMDT = make_unique<MachineDominatorTree>(); +      OwnedMDT->getBase().recalculate(*MF); +      MDT = OwnedMDT.get(); +    } + +    // Generate LoopInfo from it. 
+    OwnedMLI = make_unique<MachineLoopInfo>(); +    OwnedMLI->getBase().analyze(MDT->getBase()); +    MLI = OwnedMLI.get(); +  } + +  OwnedMBFI = make_unique<MachineBlockFrequencyInfo>(); +  OwnedMBFI->calculate(*MF, MBPI, *MLI); +  return *OwnedMBFI.get(); +} + +bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction( +    MachineFunction &F) { +  MF = &F; +  return false; +} diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 834ed5f06c94..275d84e2c185 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -14,14 +14,23 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <string> +#include <tuple> +#include <utility> +  using namespace llvm;  #define DEBUG_TYPE "lexicalscopes" @@ -38,6 +47,10 @@ void LexicalScopes::reset() {  /// initialize - Scan machine function and constuct lexical scope nest.  void LexicalScopes::initialize(const MachineFunction &Fn) { +  // Don't attempt any lexical scope creation for a NoDebug compile unit. +  if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() == +      DICompileUnit::NoDebug) +    return;    reset();    MF = &Fn;    SmallVector<InsnRange, 4> MIRanges; @@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) {  void LexicalScopes::extractLexicalScopes(      SmallVectorImpl<InsnRange> &MIRanges,      DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { -    // Scan each instruction and create scopes. First build working set of scopes.    for (const auto &MBB : *MF) {      const MachineInstr *RangeBeginMI = nullptr; @@ -127,6 +139,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {  LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,                                                       const DILocation *IA) {    if (IA) { +    // Skip scopes inlined from a NoDebug compile unit. +    if (Scope->getSubprogram()->getUnit()->getEmissionKind() == +        DICompileUnit::NoDebug) +      return getOrCreateLexicalScope(IA);      // Create an abstract scope for inlined function.      getOrCreateAbstractScope(Scope);      // Create an inlined scope for inlined function. 
@@ -181,10 +197,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,    else      Parent = getOrCreateLexicalScope(InlinedAt); -  I = InlinedLexicalScopeMap.emplace(std::piecewise_construct, -                                     std::forward_as_tuple(P), -                                     std::forward_as_tuple(Parent, Scope, -                                                           InlinedAt, false)) +  I = InlinedLexicalScopeMap +          .emplace(std::piecewise_construct, std::forward_as_tuple(P), +                   std::forward_as_tuple(Parent, Scope, InlinedAt, false))            .first;    return &I->second;  } @@ -241,7 +256,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) {  void LexicalScopes::assignInstructionRanges(      SmallVectorImpl<InsnRange> &MIRanges,      DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { -    LexicalScope *PrevLexicalScope = nullptr;    for (const auto &R : MIRanges) {      LexicalScope *S = MI2ScopeMap.lookup(R.first); @@ -299,9 +313,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {    return Result;  } -/// dump - Print data structures. -void LexicalScope::dump(unsigned Indent) const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const {    raw_ostream &err = dbgs();    err.indent(Indent);    err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n"; @@ -316,5 +329,5 @@ void LexicalScope::dump(unsigned Indent) const {    for (unsigned i = 0, e = Children.size(); i != e; ++i)      if (Children[i] != this)        Children[i]->dump(Indent + 2); -#endif  } +#endif diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index c945376560f7..f956974b1aaf 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -24,13 +24,16 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/UniqueVector.h"  #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h"  #include "llvm/CodeGen/Passes.h"  #include "llvm/IR/DebugInfo.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetLowering.h"  #include "llvm/Target/TargetRegisterInfo.h" @@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass {  private:    const TargetRegisterInfo *TRI;    const TargetInstrInfo *TII; +  const TargetFrameLowering *TFI;    LexicalScopes LS;    /// Keeps track of lexical scopes associated with a user value's source @@ -127,11 +131,13 @@ private:        if (int RegNo = isDbgValueDescribedByReg(MI)) {          Kind = RegisterKind;          Loc.RegisterLoc.RegNo = RegNo; -        uint64_t Offset = +        int64_t Offset =              MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;          // We don't support offsets larger than 4GiB here. They are          // slated to be replaced with DIExpressions anyway. -        if (Offset >= (1ULL << 32)) +        // With indirect debug values used for spill locations, Offset  +        // can be negative. 
+        if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32))            Kind = InvalidKind;          else            Loc.RegisterLoc.Offset = Offset; @@ -150,7 +156,9 @@ private:      /// dominates MBB.      bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } -    void dump() const { MI.dump(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +    LLVM_DUMP_METHOD void dump() const { MI.dump(); } +#endif      bool operator==(const VarLoc &Other) const {        return Var == Other.Var && Loc.Hash == Other.Loc.Hash; @@ -167,6 +175,11 @@ private:    typedef UniqueVector<VarLoc> VarLocMap;    typedef SparseBitVector<> VarLocSet;    typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB; +  struct SpillDebugPair { +    MachineInstr *SpillInst; +    MachineInstr *DebugInst; +  }; +  typedef SmallVector<SpillDebugPair, 4> SpillMap;    /// This holds the working set of currently open ranges. For fast    /// access, this is done both as a set of VarLocIDs, and a map of @@ -216,14 +229,21 @@ private:      }    }; +  bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF, +                          unsigned &Reg); +  int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg); +    void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,                            VarLocMap &VarLocIDs); +  void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges, +                         VarLocMap &VarLocIDs, SpillMap &Spills);    void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,                             const VarLocMap &VarLocIDs);    bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,                                VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);    bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, -                VarLocInMBB &OutLocs, VarLocMap &VarLocIDs); +                VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills, +                bool transferSpills);    bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,              const VarLocMap &VarLocIDs, @@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {  //            Debug Range Extension Implementation  //===----------------------------------------------------------------------===// +#ifndef NDEBUG  void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,                                         const VarLocInMBB &V,                                         const VarLocMap &VarLocIDs, @@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,    }    Out << "\n";  } +#endif + +/// Given a spill instruction, extract the register and offset used to +/// address the spill location in a target independent way. 
+int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI, +                                                  unsigned &Reg) { +  assert(MI.hasOneMemOperand() &&  +         "Spill instruction does not have exactly one memory operand?"); +  auto MMOI = MI.memoperands_begin(); +  const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue(); +  assert(PVal->kind() == PseudoSourceValue::FixedStack && +         "Inconsistent memory operand in spill instruction"); +  int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex(); +  const MachineBasicBlock *MBB = MI.getParent(); +  return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg); +}  /// End all previous ranges related to @MI and start a new range from @MI  /// if it is a DBG_VALUE instr. @@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,    unsigned SP = TLI->getStackPointerRegisterToSaveRestore();    SparseBitVector<> KillSet;    for (const MachineOperand &MO : MI.operands()) { +    // Determine whether the operand is a register def.  Assume that call +    // instructions never clobber SP, because some backends (e.g., AArch64) +    // never list SP in the regmask.      if (MO.isReg() && MO.isDef() && MO.getReg() && -        TRI->isPhysicalRegister(MO.getReg())) { +        TRI->isPhysicalRegister(MO.getReg()) && +        !(MI.isCall() && MO.getReg() == SP)) {        // Remove ranges of all aliased registers.        for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)          for (unsigned ID : OpenRanges.getVarLocs()) @@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,    OpenRanges.erase(KillSet, VarLocIDs);  } +/// Decide if @MI is a spill instruction and return true if it is. We use 2 +/// criteria to make this decision: +/// - Is this instruction a store to a spill slot? +/// - Is there a register operand that is both used and killed? +/// TODO: Store optimization can fold spills into other stores (including +/// other spills). We do not handle this yet (more than one memory operand). +bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI, +                                         MachineFunction *MF, unsigned &Reg) { +  const MachineFrameInfo &FrameInfo = MF->getFrameInfo(); +  int FI; +  const MachineMemOperand *MMO; + +  // TODO: Handle multiple stores folded into one.  +  if (!MI.hasOneMemOperand()) +    return false; + +  // To identify a spill instruction, use the same criteria as in AsmPrinter. +  if (!((TII->isStoreToStackSlotPostFE(MI, FI) || +         TII->hasStoreToStackSlot(MI, MMO, FI)) && +        FrameInfo.isSpillSlotObjectIndex(FI))) +    return false; + +  // In a spill instruction generated by the InlineSpiller the spilled register +  // has its kill flag set. Return false if we don't find such a register. +  Reg = 0; +  for (const MachineOperand &MO : MI.operands()) { +    if (MO.isReg() && MO.isUse() && MO.isKill()) { +      Reg = MO.getReg(); +      break; +    } +  } +  return Reg != 0; +} + +/// A spilled register may indicate that we have to end the current range of +/// a variable and create a new one for the spill location. +/// We don't want to insert any instructions in transfer(), so we just create +/// the DBG_VALUE without inserting it and keep track of it in @Spills. +/// It will be inserted into the BB when we're done iterating over the +/// instructions. 
+void LiveDebugValues::transferSpillInst(MachineInstr &MI, +                                        OpenRangesSet &OpenRanges, +                                        VarLocMap &VarLocIDs, +                                        SpillMap &Spills) { +  unsigned Reg; +  MachineFunction *MF = MI.getParent()->getParent(); +  if (!isSpillInstruction(MI, MF, Reg)) +    return; + +  // Check if the register is the location of a debug value. +  for (unsigned ID : OpenRanges.getVarLocs()) { +    if (VarLocIDs[ID].isDescribedByReg() == Reg) { +      DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '(' +                   << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); + +      // Create a DBG_VALUE instruction to describe the Var in its spilled +      // location, but don't insert it yet to avoid invalidating the +      // iterator in our caller. +      unsigned SpillBase; +      int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase); +      const MachineInstr *DMI = &VarLocIDs[ID].MI; +      MachineInstr *SpDMI = +          BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0, +                  DMI->getDebugVariable(), DMI->getDebugExpression()); +      SpDMI->getOperand(1).setImm(SpillOffset); +      DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; +            SpDMI->print(dbgs(), false, TII)); + +      // The newly created DBG_VALUE instruction SpDMI must be inserted after +      // MI. Keep track of the pairing. +      SpillDebugPair MIP = {&MI, SpDMI}; +      Spills.push_back(MIP); + +      // End all previous ranges of Var. +      OpenRanges.erase(VarLocIDs[ID].Var); + +      // Add the VarLoc to OpenRanges. +      VarLoc VL(*SpDMI, LS); +      unsigned SpillLocID = VarLocIDs.insert(VL); +      OpenRanges.insert(SpillLocID, VL.Var); +      return; +    } +  } +} +  /// Terminate all open ranges at the end of the current basic block.  bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,                                               OpenRangesSet &OpenRanges, @@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,  /// This routine creates OpenRanges and OutLocs.  bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, -                               VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) { +                               VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, +                               SpillMap &Spills, bool transferSpills) {    bool Changed = false;    transferDebugValue(MI, OpenRanges, VarLocIDs);    transferRegisterDef(MI, OpenRanges, VarLocIDs); +  if (transferSpills) +    transferSpillInst(MI, OpenRanges, VarLocIDs, Spills);    Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);    return Changed;  } @@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {    bool OLChanged = false;    bool MBBJoined = false; -  VarLocMap VarLocIDs;   // Map VarLoc<>unique ID for use in bitvectors. +  VarLocMap VarLocIDs;      // Map VarLoc<>unique ID for use in bitvectors.    OpenRangesSet OpenRanges; // Ranges that are open until end of bb. -  VarLocInMBB OutLocs;   // Ranges that exist beyond bb. -  VarLocInMBB InLocs;    // Ranges that are incoming after joining. +  VarLocInMBB OutLocs;      // Ranges that exist beyond bb. +  VarLocInMBB InLocs;       // Ranges that are incoming after joining. +  SpillMap Spills;          // DBG_VALUEs associated with spills.    
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;    DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; @@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {        Pending;    // Initialize every mbb with OutLocs. +  // We are not looking at any spill instructions during the initial pass +  // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE +  // instructions for spills of registers that are known to be user variables +  // within the BB in which the spill occurs.    for (auto &MBB : MF)      for (auto &MI : MBB) -      transfer(MI, OpenRanges, OutLocs, VarLocIDs); +      transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, +               /*transferSpills=*/false);    DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",                           dbgs())); @@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {        if (MBBJoined) {          MBBJoined = false;          Changed = true; +        // Now that we have started to extend ranges across BBs we need to +        // examine spill instructions to see whether they spill registers that +        // correspond to user variables.          for (auto &MI : *MBB) -          OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs); +          OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, +                                /*transferSpills=*/true); + +        // Add any DBG_VALUE instructions necessitated by spills. +        for (auto &SP : Spills) +          MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst), +                           SP.DebugInst); +        Spills.clear();          DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,                                 "OutLocs after propagating", dbgs())); @@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {    TRI = MF.getSubtarget().getRegisterInfo();    TII = MF.getSubtarget().getInstrInfo(); +  TFI = MF.getSubtarget().getFrameLowering();    LS.initialize(MF);    bool Changed = ExtendRanges(MF); diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 0934d8cfeaa1..bcf7c8e99c7f 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,              IsIndirect, Loc.getReg(), offset, Variable, Expression);    else      BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE)) -        .addOperand(Loc) +        .add(Loc)          .addImm(offset)          .addMetadata(Variable)          .addMetadata(Expression); @@ -1005,7 +1005,7 @@ bool LiveDebugVariables::doInitialization(Module &M) {    return Pass::doInitialization(M);  } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void LiveDebugVariables::dump() {    if (pImpl)      static_cast<LDVImpl*>(pImpl)->print(dbgs()); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 623af492fcd4..9ef9f238fdce 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() {    SubRanges = nullptr;  } +void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator, +    LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) { + +  LaneBitmask ToApply = LaneMask; +  for (SubRange &SR : subranges()) { +    LaneBitmask SRMask = SR.LaneMask; +    LaneBitmask Matching = SRMask & LaneMask; + 
   if (Matching.none()) +      continue; + +    SubRange *MatchingRange; +    if (SRMask == Matching) { +      // The subrange fits (it does not cover bits outside \p LaneMask). +      MatchingRange = &SR; +    } else { +      // We have to split the subrange into a matching and non-matching part. +      // Reduce lanemask of existing lane to non-matching part. +      SR.LaneMask = SRMask & ~Matching; +      // Create a new subrange for the matching part +      MatchingRange = createSubRangeFrom(Allocator, Matching, SR); +    } +    Apply(*MatchingRange); +    ToApply &= ~Matching; +  } +  // Create a new subrange if there are uncovered bits left. +  if (ToApply.any()) { +    SubRange *NewRange = createSubRange(Allocator, ToApply); +    Apply(*NewRange); +  } +} +  unsigned LiveInterval::getSize() const {    unsigned Sum = 0;    for (const Segment &S : segments) @@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const {  // When they exist, Spills.back().start <= LastStart,  //                 and WriteI[-1].start <= LastStart. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  void LiveRangeUpdater::print(raw_ostream &OS) const {    if (!isDirty()) {      if (LR) @@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {  LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {    print(errs());  } +#endif  // Determine if A and B should be coalesced.  static inline bool coalescable(const LiveRange::Segment &A, diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 70d34838b237..3f5b8e19d1f0 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -7,10 +7,10 @@  //  //===----------------------------------------------------------------------===//  // -// This file implements the LiveInterval analysis pass which is used -// by the Linear Scan Register allocator. This pass linearizes the -// basic blocks of the function in DFS order and computes live intervals for -// each virtual and physical register. +/// \file This file implements the LiveInterval analysis pass which is used +/// by the Linear Scan Register allocator. This pass linearizes the +/// basic blocks of the function in DFS order and computes live intervals for +/// each virtual and physical register.  //  //===----------------------------------------------------------------------===// @@ -96,16 +96,14 @@ void LiveIntervals::releaseMemory() {    RegMaskBits.clear();    RegMaskBlocks.clear(); -  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) -    delete RegUnitRanges[i]; +  for (LiveRange *LR : RegUnitRanges) +    delete LR;    RegUnitRanges.clear();    // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.    VNInfoAllocator.Reset();  } -/// runOnMachineFunction - calculates LiveIntervals -///  bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {    MF = &fn;    MRI = &MF->getRegInfo(); @@ -135,14 +133,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {    return true;  } -/// print - Implement the dump method.  void LiveIntervals::print(raw_ostream &OS, const Module* ) const {    OS << "********** INTERVALS **********\n";    // Dump the regunits. 
-  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) -    if (LiveRange *LR = RegUnitRanges[i]) -      OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n'; +  for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit) +    if (LiveRange *LR = RegUnitRanges[Unit]) +      OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n';    // Dump the virtregs.    for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { @@ -152,8 +149,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {    }    OS << "RegMasks:"; -  for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i) -    OS << ' ' << RegMaskSlots[i]; +  for (SlotIndex Idx : RegMaskSlots) +    OS << ' ' << Idx;    OS << '\n';    printInstrs(OS); @@ -165,7 +162,7 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const {  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void LiveIntervals::dumpInstrs() const { +LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const {    printInstrs(dbgs());  }  #endif @@ -177,8 +174,7 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) {  } -/// computeVirtRegInterval - Compute the live interval of a virtual register, -/// based on defs and uses. +/// Compute the live interval of a virtual register, based on defs and uses.  void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {    assert(LRCalc && "LRCalc not initialized.");    assert(LI.empty() && "Should only compute empty intervals."); @@ -200,7 +196,7 @@ void LiveIntervals::computeRegMasks() {    RegMaskBlocks.resize(MF->getNumBlockIDs());    // Find all instructions with regmask operands. -  for (MachineBasicBlock &MBB : *MF) { +  for (const MachineBasicBlock &MBB : *MF) {      std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()];      RMB.first = RegMaskSlots.size(); @@ -210,7 +206,7 @@ void LiveIntervals::computeRegMasks() {        RegMaskBits.push_back(Mask);      } -    for (MachineInstr &MI : MBB) { +    for (const MachineInstr &MI : MBB) {        for (const MachineOperand &MO : MI.operands()) {          if (!MO.isRegMask())            continue; @@ -245,9 +241,9 @@ void LiveIntervals::computeRegMasks() {  // interference.  // -/// computeRegUnitInterval - Compute the live range of a register unit, based -/// on the uses and defs of aliasing registers.  The range should be empty, -/// or contain only dead phi-defs from ABI blocks. +/// Compute the live range of a register unit, based on the uses and defs of +/// aliasing registers.  The range should be empty, or contain only dead +/// phi-defs from ABI blocks.  void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {    assert(LRCalc && "LRCalc not initialized.");    LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -257,22 +253,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {    // may share super-registers. That's OK because createDeadDefs() is    // idempotent. It is very rare for a register unit to have multiple roots, so    // uniquing super-registers is probably not worthwhile. 
-  for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { -    for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); -         Supers.isValid(); ++Supers) { -      if (!MRI->reg_empty(*Supers)) -        LRCalc->createDeadDefs(LR, *Supers); +  bool IsReserved = true; +  for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { +    for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); +         Super.isValid(); ++Super) { +      unsigned Reg = *Super; +      if (!MRI->reg_empty(Reg)) +        LRCalc->createDeadDefs(LR, Reg); +      // A register unit is considered reserved if all its roots and all their +      // super registers are reserved. +      if (!MRI->isReserved(Reg)) +        IsReserved = false;      }    }    // Now extend LR to reach all uses.    // Ignore uses of reserved registers. We only track defs of those. -  for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { -    for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); -         Supers.isValid(); ++Supers) { -      unsigned Reg = *Supers; -      if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg)) -        LRCalc->extendToUses(LR, Reg); +  if (!IsReserved) { +    for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { +      for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); +           Super.isValid(); ++Super) { +        unsigned Reg = *Super; +        if (!MRI->reg_empty(Reg)) +          LRCalc->extendToUses(LR, Reg); +      }      }    } @@ -281,11 +285,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {      LR.flushSegmentSet();  } - -/// computeLiveInRegUnits - Precompute the live ranges of any register units -/// that are live-in to an ABI block somewhere. Register values can appear -/// without a corresponding def when entering the entry block or a landing pad. -/// +/// Precompute the live ranges of any register units that are live-in to an ABI +/// block somewhere. Register values can appear without a corresponding def when +/// entering the entry block or a landing pad.  void LiveIntervals::computeLiveInRegUnits() {    RegUnitRanges.resize(TRI->getNumRegUnits());    DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); @@ -294,18 +296,15 @@ void LiveIntervals::computeLiveInRegUnits() {    SmallVector<unsigned, 8> NewRanges;    // Check all basic blocks for live-ins. -  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); -       MFI != MFE; ++MFI) { -    const MachineBasicBlock *MBB = &*MFI; - +  for (const MachineBasicBlock &MBB : *MF) {      // We only care about ABI blocks: Entry + landing pads. -    if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) +    if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty())        continue;      // Create phi-defs at Begin for all live-in registers. 
-    SlotIndex Begin = Indexes->getMBBStartIdx(MBB); -    DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); -    for (const auto &LI : MBB->liveins()) { +    SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); +    DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber()); +    for (const auto &LI : MBB.liveins()) {        for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) {          unsigned Unit = *Units;          LiveRange *LR = RegUnitRanges[Unit]; @@ -324,16 +323,13 @@ void LiveIntervals::computeLiveInRegUnits() {    DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");    // Compute the 'normal' part of the ranges. -  for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) { -    unsigned Unit = NewRanges[i]; +  for (unsigned Unit : NewRanges)      computeRegUnitRange(*RegUnitRanges[Unit], Unit); -  }  } -  static void createSegmentsForValues(LiveRange &LR, -      iterator_range<LiveInterval::vni_iterator> VNIs) { -  for (auto VNI : VNIs) { +    iterator_range<LiveInterval::vni_iterator> VNIs) { +  for (VNInfo *VNI : VNIs) {      if (VNI->isUnused())        continue;      SlotIndex Def = VNI->def; @@ -349,7 +345,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,    // Keep track of the PHIs that are in use.    SmallPtrSet<VNInfo*, 8> UsedPHIs;    // Blocks that have already been added to WorkList as live-out. -  SmallPtrSet<MachineBasicBlock*, 16> LiveOut; +  SmallPtrSet<const MachineBasicBlock*, 16> LiveOut;    // Extend intervals to reach all uses in WorkList.    while (!WorkList.empty()) { @@ -368,7 +364,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,            !UsedPHIs.insert(VNI).second)          continue;        // The PHI is live, make sure the predecessors are live-out. -      for (auto &Pred : MBB->predecessors()) { +      for (const MachineBasicBlock *Pred : MBB->predecessors()) {          if (!LiveOut.insert(Pred).second)            continue;          SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -384,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,      LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));      // Make sure VNI is live-out from the predecessors. -    for (auto &Pred : MBB->predecessors()) { +    for (const MachineBasicBlock *Pred : MBB->predecessors()) {        if (!LiveOut.insert(Pred).second)          continue;        SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -415,22 +411,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,    ShrinkToUsesWorkList WorkList;    // Visit all instructions reading li->reg. -  for (MachineRegisterInfo::reg_instr_iterator -       I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end(); -       I != E; ) { -    MachineInstr *UseMI = &*(I++); -    if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg)) +  unsigned Reg = li->reg; +  for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) { +    if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg))        continue; -    SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); +    SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();      LiveQueryResult LRQ = li->Query(Idx);      VNInfo *VNI = LRQ.valueIn();      if (!VNI) {        // This shouldn't happen: readsVirtualRegister returns true, but there is        // no live value. It is likely caused by a target getting <undef> flags        // wrong. 
-      DEBUG(dbgs() << Idx << '\t' << *UseMI +      DEBUG(dbgs() << Idx << '\t' << UseMI                     << "Warning: Instr claims to read non-existent value in " -                    << *li << '\n'); +                   << *li << '\n');        continue;      }      // Special case: An early-clobber tied operand reads and writes the @@ -458,7 +452,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,  bool LiveIntervals::computeDeadValues(LiveInterval &LI,                                        SmallVectorImpl<MachineInstr*> *dead) {    bool MayHaveSplitComponents = false; -  for (auto VNI : LI.valnos) { +  for (VNInfo *VNI : LI.valnos) {      if (VNI->isUnused())        continue;      SlotIndex Def = VNI->def; @@ -548,7 +542,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {    SR.segments.swap(NewLR.segments);    // Remove dead PHI value numbers -  for (auto VNI : SR.valnos) { +  for (VNInfo *VNI : SR.valnos) {      if (VNI->isUnused())        continue;      const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def); @@ -571,8 +565,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR,                                      ArrayRef<SlotIndex> Undefs) {    assert(LRCalc && "LRCalc not initialized.");    LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); -  for (unsigned i = 0, e = Indices.size(); i != e; ++i) -    LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs); +  for (SlotIndex Idx : Indices) +    LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);  }  void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, @@ -601,11 +595,9 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,    // from each successor.    typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy;    VisitedTy Visited; -  for (MachineBasicBlock::succ_iterator -       SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end(); -       SuccI != SuccE; ++SuccI) { +  for (MachineBasicBlock *Succ : KillMBB->successors()) {      for (df_ext_iterator<MachineBasicBlock*, VisitedTy> -         I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited); +         I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited);           I != E;) {        MachineBasicBlock *MBB = *I; @@ -657,9 +649,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {      // Find the regunit intervals for the assigned register. They may overlap      // the virtual register live range, cancelling any kills.      RU.clear(); -    for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid(); -         ++Units) { -      const LiveRange &RURange = getRegUnit(*Units); +    for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); +         ++Unit) { +      const LiveRange &RURange = getRegUnit(*Unit);        if (RURange.empty())          continue;        RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); @@ -802,9 +794,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {      // Conservatively return true instead of scanning huge predecessor lists.      
if (PHIMBB->pred_size() > 100)        return true; -    for (MachineBasicBlock::const_pred_iterator -         PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI) -      if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI))) +    for (const MachineBasicBlock *Pred : PHIMBB->predecessors()) +      if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred)))          return true;    }    return false; @@ -895,7 +886,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,  //                         IntervalUpdate class.  //===----------------------------------------------------------------------===// -// HMEditor is a toolkit used by handleMove to trim or extend live intervals. +/// Toolkit used by handleMove to trim or extend live intervals.  class LiveIntervals::HMEditor {  private:    LiveIntervals& LIS; @@ -1241,10 +1232,12 @@ private:            LiveRange::iterator NewIdxIn = NewIdxOut;            assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));            const SlotIndex SplitPos = NewIdxDef; +          OldIdxVNI = OldIdxIn->valno;            // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. +          OldIdxOut->valno->def = OldIdxIn->start;            *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, -                                          OldIdxIn->valno); +                                          OldIdxOut->valno);            // OldIdxIn and OldIdxVNI are now undef and can be overridden.            // We Slide [NewIdxIn, OldIdxIn) down one position.            //    |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -| @@ -1514,8 +1507,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,      }    } -  for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) { -    unsigned Reg = OrigRegs[i]; +  for (unsigned Reg : OrigRegs) {      if (!TargetRegisterInfo::isVirtualRegister(Reg))        continue; @@ -1524,16 +1516,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,      if (!LI.hasAtLeastOneValue())        continue; -    for (LiveInterval::SubRange &S : LI.subranges()) { +    for (LiveInterval::SubRange &S : LI.subranges())        repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask); -    } +      repairOldRegInRange(Begin, End, endIdx, LI, Reg);    }  }  void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) { -  for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { -    if (LiveRange *LR = getCachedRegUnit(*Units)) +  for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { +    if (LiveRange *LR = getCachedRegUnit(*Unit))        if (VNInfo *VNI = LR->getVNInfoAt(Pos))          LR->removeValNo(VNI);    } diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index fc2f233f6d68..b4aa0dc326a5 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===// +//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -13,19 +13,19 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalUnion.h" -#include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SparseBitVector.h" -#include "llvm/Support/Debug.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalUnion.h"  #include 
"llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetRegisterInfo.h" -#include <algorithm> +#include <cassert> +#include <cstdlib>  using namespace llvm;  #define DEBUG_TYPE "regalloc" -  // Merge a LiveInterval's segments. Guarantee no overlaps.  void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {    if (Range.empty()) @@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {    LiveRange::const_iterator RegEnd = Range.end();    SegmentIter SegPos = Segments.find(RegPos->start); -  for (;;) { +  while (true) {      assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");      SegPos.erase();      if (!SegPos.valid()) @@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {      CheckedFirstInterference = true;      // Quickly skip interference check for empty sets. -    if (VirtReg->empty() || LiveUnion->empty()) { +    if (LR->empty() || LiveUnion->empty()) {        SeenAllInterferences = true;        return 0;      } -    // In most cases, the union will start before VirtReg. -    VirtRegI = VirtReg->begin(); +    // In most cases, the union will start before LR. +    LRI = LR->begin();      LiveUnionI.setMap(LiveUnion->getMap()); -    LiveUnionI.find(VirtRegI->start); +    LiveUnionI.find(LRI->start);    } -  LiveInterval::iterator VirtRegEnd = VirtReg->end(); +  LiveRange::const_iterator LREnd = LR->end();    LiveInterval *RecentReg = nullptr;    while (LiveUnionI.valid()) { -    assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg"); +    assert(LRI != LREnd && "Reached end of LR");      // Check for overlapping interference. -    while (VirtRegI->start < LiveUnionI.stop() && -           VirtRegI->end > LiveUnionI.start()) { +    while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) {        // This is an overlap, record the interfering register.        LiveInterval *VReg = LiveUnionI.value();        if (VReg != RecentReg && !isSeenInterference(VReg)) { @@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {      }      // The iterators are now not overlapping, LiveUnionI has been advanced -    // beyond VirtRegI. -    assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap"); +    // beyond LRI. +    assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap");      // Advance the iterator that ends first. -    VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start()); -    if (VirtRegI == VirtRegEnd) +    LRI = LR->advanceTo(LRI, LiveUnionI.start()); +    if (LRI == LREnd)        break;      // Detect overlap, handle above. -    if (VirtRegI->start < LiveUnionI.stop()) +    if (LRI->start < LiveUnionI.stop())        continue;      // Still not overlapping. Catch up LiveUnionI. -    LiveUnionI.advanceTo(VirtRegI->start); +    LiveUnionI.advanceTo(LRI->start);    }    SeenAllInterferences = true;    return InterferingVRegs.size(); diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index dcc41c1718a6..9f7d7cf54848 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const {    OS << "\n";  } -/// Dumps the currently live registers to the debug output. 
-LLVM_DUMP_METHOD void LivePhysRegs::dump() const {  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LivePhysRegs::dump() const {    dbgs() << "  " << *this; -#endif  } +#endif  bool LivePhysRegs::available(const MachineRegisterInfo &MRI,                               unsigned Reg) const { @@ -161,7 +160,9 @@ void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {  static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,                           const MachineFrameInfo &MFI,                           const TargetRegisterInfo &TRI) { -  for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) +  const MachineRegisterInfo &MRI = MF.getRegInfo(); +  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; +       ++CSR)      LiveRegs.addReg(*CSR);    for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())      LiveRegs.removeReg(Info.getReg()); @@ -180,7 +181,8 @@ void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {      if (MBB.isReturnBlock()) {        // The return block has no successors whose live-ins we could merge        // below. So instead we add the callee saved registers manually. -      for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) +      const MachineRegisterInfo &MRI = MF.getRegInfo(); +      for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I)          addReg(*I);      } else {        addPristines(*this, MF, MFI, *TRI); diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index 012837608628..398066bf8903 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -75,34 +75,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {          LI.createSubRangeFrom(*Alloc, ClassMask, LI);        } -      LaneBitmask Mask = SubMask; -      for (LiveInterval::SubRange &S : LI.subranges()) { -        // A Mask for subregs common to the existing subrange and current def. -        LaneBitmask Common = S.LaneMask & Mask; -        if (Common.none()) -          continue; -        LiveInterval::SubRange *CommonRange; -        // A Mask for subregs covered by the subrange but not the current def. -        LaneBitmask RM = S.LaneMask & ~Mask; -        if (RM.any()) { -          // Split the subrange S into two parts: one covered by the current -          // def (CommonRange), and the one not affected by it (updated S). -          S.LaneMask = RM; -          CommonRange = LI.createSubRangeFrom(*Alloc, Common, S); -        } else { -          assert(Common == S.LaneMask); -          CommonRange = &S; -        } +      LI.refineSubRanges(*Alloc, SubMask, +          [&MO, this](LiveInterval::SubRange &SR) {          if (MO.isDef()) -          createDeadDef(*Indexes, *Alloc, *CommonRange, MO); -        Mask &= ~Common; -      } -      // Create a new SubRange for subregs we did not cover yet. -      if (Mask.any()) { -        LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask); -        if (MO.isDef()) -          createDeadDef(*Indexes, *Alloc, *NewRange, MO); -      } +          createDeadDef(*Indexes, *Alloc, SR, MO); +      });      }      // Create the def in the main liverange. 
We do not have to do this if @@ -289,8 +266,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,    if (UndefOnEntry[BN])      return false; -  auto MarkDefined = -        [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool { +  auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool {      for (MachineBasicBlock *S : B.successors())        DefOnEntry[S->getNumber()] = true;      DefOnEntry[BN] = true; @@ -311,7 +287,12 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,        return MarkDefined(B);      SlotIndex Begin, End;      std::tie(Begin, End) = Indexes->getMBBRange(&B); -    LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End); +    // Treat End as not belonging to B. +    // If LR has a segment S that starts at the next block, i.e. [End, ...), +    // std::upper_bound will return the segment following S. Instead, +    // S should be treated as the first segment that does not overlap B. +    LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), +                                              End.getPrevSlot());      if (UB != LR.begin()) {        LiveRange::Segment &Seg = *std::prev(UB);        if (Seg.end > Begin) { diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 7f1c69c0b4a2..92cca1a54951 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {      VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));    }    LiveInterval &LI = LIS.createEmptyInterval(VReg); +  if (Parent && !Parent->isSpillable()) +    LI.markNotSpillable();    // Create empty subranges if the OldReg's interval has them. Do not create    // the main range here---it will be constructed later after the subranges    // have been finalized. @@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) {    if (VRM) {      VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));    } +  // FIXME: Getting the interval here actually computes it. +  // In theory, this may not be what we want, but in practice +  // the createEmptyIntervalFrom API is used when this is not +  // the case. Generally speaking we just want to annotate the +  // LiveInterval when it gets created but we cannot do that at +  // the moment. 
+  if (Parent && !Parent->isSpillable()) +    LIS.getInterval(VReg).markNotSpillable();    return VReg;  } @@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)    if (VRM)      VRM->grow(); -  if (Parent && !Parent->isSpillable()) -    LIS.getInterval(VReg).markNotSpillable(); -    NewRegs.push_back(VReg);  } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index 7a51386aa9ca..882de1a3fad9 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -1,4 +1,4 @@ -//===-- LiveRegMatrix.cpp - Track register interference -------------------===// +//===- LiveRegMatrix.cpp - Track register interference --------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -11,15 +11,22 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveRegMatrix.h"  #include "RegisterCoalescer.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRegMatrix.h"  #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Pass.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert>  using namespace llvm; @@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)  INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",                      "Live Register Matrix", false, false) -LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID), -  UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {} +LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}  void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {    AU.setPreservesAll(); @@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,    return Result;  } -LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, +LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,                                                 unsigned RegUnit) {    LiveIntervalUnion::Query &Q = Queries[RegUnit]; -  Q.init(UserTag, &VirtReg, &Matrix[RegUnit]); +  Q.init(UserTag, LR, Matrix[RegUnit]);    return Q;  } @@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {      return IK_RegUnit;    // Check the matrix for virtual register interference. -  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) -    if (query(VirtReg, *Units).checkInterference()) -      return IK_VirtReg; +  bool Interference = foreachUnit(TRI, VirtReg, PhysReg, +                                  [&](unsigned Unit, const LiveRange &LR) { +    return query(LR, Unit).checkInterference(); +  }); +  if (Interference) +    return IK_VirtReg;    return IK_Free;  } diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp new file mode 100644 index 000000000000..dff555f49565 --- /dev/null +++ b/lib/CodeGen/LiveRegUnits.cpp @@ -0,0 +1,126 @@ +//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file This file imlements the LiveRegUnits set. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) { +  for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { +    for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { +      if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) +        Units.reset(U); +    } +  } +} + +void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) { +  for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { +    for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { +      if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) +        Units.set(U); +    } +  } +} + +void LiveRegUnits::stepBackward(const MachineInstr &MI) { +  // Remove defined registers and regmask kills from the set. +  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { +    if (O->isReg()) { +      if (!O->isDef()) +        continue; +      unsigned Reg = O->getReg(); +      if (!TargetRegisterInfo::isPhysicalRegister(Reg)) +        continue; +      removeReg(Reg); +    } else if (O->isRegMask()) +      removeRegsNotPreserved(O->getRegMask()); +  } + +  // Add uses to the set. +  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { +    if (!O->isReg() || !O->readsReg()) +      continue; +    unsigned Reg = O->getReg(); +    if (!TargetRegisterInfo::isPhysicalRegister(Reg)) +      continue; +    addReg(Reg); +  } +} + +void LiveRegUnits::accumulateBackward(const MachineInstr &MI) { +  // Add defs, uses and regmask clobbers to the set. +  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { +    if (O->isReg()) { +      unsigned Reg = O->getReg(); +      if (!TargetRegisterInfo::isPhysicalRegister(Reg)) +        continue; +      if (!O->isDef() && !O->readsReg()) +        continue; +      addReg(Reg); +    } else if (O->isRegMask()) +      addRegsInMask(O->getRegMask()); +  } +} + +/// Add live-in registers of basic block \p MBB to \p LiveUnits. +static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { +  for (const auto &LI : MBB.liveins()) +    LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask); +} + +static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { +  // To get the live-outs we simply merge the live-ins of all successors. +  for (const MachineBasicBlock *Succ : MBB.successors()) +    addLiveIns(LiveUnits, *Succ); +} + +/// Add pristine registers to the given \p LiveUnits. This function removes +/// actually saved callee save registers when \p InPrologueEpilogue is false. 
+static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF, +                            const MachineFrameInfo &MFI, +                            const TargetRegisterInfo &TRI) { +  for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) +    LiveUnits.removeReg(Info.getReg()); +} + +void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) { +  const MachineFunction &MF = *MBB.getParent(); +  const MachineFrameInfo &MFI = MF.getFrameInfo(); +  if (MFI.isCalleeSavedInfoValid()) { +    for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) +      addReg(*I); +    if (!MBB.isReturnBlock()) +      removeSavedRegs(*this, MF, MFI, *TRI); +  } +  ::addLiveOuts(*this, MBB); +} + +void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) { +  const MachineFunction &MF = *MBB.getParent(); +  const MachineFrameInfo &MFI = MF.getFrameInfo(); +  if (MFI.isCalleeSavedInfoValid()) { +    for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) +      addReg(*I); +    if (&MBB != &MF.front()) +      removeSavedRegs(*this, MF, MFI, *TRI); +  } +  ::addLiveIns(*this, MBB); +} diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 269b990a3149..3568b0294ad9 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {    return nullptr;  } -LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {    dbgs() << "  Alive in blocks: ";    for (SparseBitVector<>::iterator I = AliveBlocks.begin(),             E = AliveBlocks.end(); I != E; ++I) @@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {        dbgs() << "\n    #" << i << ": " << *Kills[i];      dbgs() << "\n";    } -#endif  } +#endif  /// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.  LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) { diff --git a/lib/CodeGen/LowLevelType.cpp b/lib/CodeGen/LowLevelType.cpp index d74b7306e0f4..c4b9068fa905 100644 --- a/lib/CodeGen/LowLevelType.cpp +++ b/lib/CodeGen/LowLevelType.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===// +//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -18,54 +18,21 @@  #include "llvm/Support/raw_ostream.h"  using namespace llvm; -LLT::LLT(Type &Ty, const DataLayout &DL) { +LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {    if (auto VTy = dyn_cast<VectorType>(&Ty)) { -    SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); -    ElementsOrAddrSpace = VTy->getNumElements(); -    Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; +    auto NumElements = VTy->getNumElements(); +    auto ScalarSizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); +    if (NumElements == 1) +      return LLT::scalar(ScalarSizeInBits); +    return LLT::vector(NumElements, ScalarSizeInBits);    } else if (auto PTy = dyn_cast<PointerType>(&Ty)) { -    Kind = Pointer; -    SizeInBits = DL.getTypeSizeInBits(&Ty); -    ElementsOrAddrSpace = PTy->getAddressSpace(); +    return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty));    } else if (Ty.isSized()) {      // Aggregates are no different from real scalars as far as GlobalISel is      // concerned. 
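The new getLLTForType free function in this hunk folds the old LLT constructors into one mapping. Roughly, and assuming a 64-bit DataLayout (an illustration of the cases handled just above and below, not an exhaustive list):

  <4 x i32>     ->  LLT::vector(4, 32)
  <1 x i64>     ->  LLT::scalar(64)       (single-element vectors become scalars)
  i32*          ->  LLT::pointer(0, 64)   (address space 0, pointer size from the DataLayout)
  {i64, i64}    ->  LLT::scalar(128)      (sized aggregates are treated as plain scalars)
  unsized type  ->  LLT()                 (the invalid type)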
-    Kind = Scalar; -    SizeInBits = DL.getTypeSizeInBits(&Ty); -    ElementsOrAddrSpace = 1; +    auto SizeInBits = DL.getTypeSizeInBits(&Ty);      assert(SizeInBits != 0 && "invalid zero-sized type"); -  } else { -    Kind = Invalid; -    SizeInBits = ElementsOrAddrSpace = 0; +    return LLT::scalar(SizeInBits);    } -} - -LLT::LLT(MVT VT) { -  if (VT.isVector()) { -    SizeInBits = VT.getVectorElementType().getSizeInBits(); -    ElementsOrAddrSpace = VT.getVectorNumElements(); -    Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; -  } else if (VT.isValid()) { -    // Aggregates are no different from real scalars as far as GlobalISel is -    // concerned. -    Kind = Scalar; -    SizeInBits = VT.getSizeInBits(); -    ElementsOrAddrSpace = 1; -    assert(SizeInBits != 0 && "invalid zero-sized type"); -  } else { -    Kind = Invalid; -    SizeInBits = ElementsOrAddrSpace = 0; -  } -} - -void LLT::print(raw_ostream &OS) const { -  if (isVector()) -    OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">"; -  else if (isPointer()) -    OS << "p" << getAddressSpace(); -  else if (isValid()) { -    assert(isScalar() && "unexpected type"); -    OS << "s" << getScalarSizeInBits(); -  } else -    llvm_unreachable("trying to print an invalid type"); +  return LLT();  } diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c8bed0890dd6..cac22af32956 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -41,8 +41,11 @@  using namespace llvm;  PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF, -    SourceMgr &SM, const SlotMapping &IRSlots) -  : MF(MF), SM(&SM), IRSlots(IRSlots) { +    SourceMgr &SM, const SlotMapping &IRSlots, +    const Name2RegClassMap &Names2RegClasses, +    const Name2RegBankMap &Names2RegBanks) +  : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses), +    Names2RegBanks(Names2RegBanks) {  }  VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) { @@ -139,6 +142,7 @@ public:    bool parseVirtualRegister(VRegInfo *&Info);    bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);    bool parseRegisterFlag(unsigned &Flags); +  bool parseRegisterClassOrBank(VRegInfo &RegInfo);    bool parseSubRegisterIndex(unsigned &SubReg);    bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);    bool parseRegisterOperand(MachineOperand &Dest, @@ -172,6 +176,7 @@ public:    bool parseIntrinsicOperand(MachineOperand &Dest);    bool parsePredicateOperand(MachineOperand &Dest);    bool parseTargetIndexOperand(MachineOperand &Dest); +  bool parseCustomRegisterMaskOperand(MachineOperand &Dest);    bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);    bool parseMachineOperand(MachineOperand &Dest,                             Optional<unsigned> &TiedDefIdx); @@ -184,6 +189,7 @@ public:    bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);    bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);    bool parseMachinePointerInfo(MachinePointerInfo &Dest); +  bool parseOptionalAtomicOrdering(AtomicOrdering &Order);    bool parseMachineMemoryOperand(MachineMemOperand *&Dest);  private: @@ -878,6 +884,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) {    }  } +bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) { +  if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore)) +    return error("expected '_', register class, or register bank name"); +  StringRef::iterator Loc = Token.location(); +  
StringRef Name = Token.stringValue(); + +  // Was it a register class? +  auto RCNameI = PFS.Names2RegClasses.find(Name); +  if (RCNameI != PFS.Names2RegClasses.end()) { +    lex(); +    const TargetRegisterClass &RC = *RCNameI->getValue(); + +    switch (RegInfo.Kind) { +    case VRegInfo::UNKNOWN: +    case VRegInfo::NORMAL: +      RegInfo.Kind = VRegInfo::NORMAL; +      if (RegInfo.Explicit && RegInfo.D.RC != &RC) { +        const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); +        return error(Loc, Twine("conflicting register classes, previously: ") + +                     Twine(TRI.getRegClassName(RegInfo.D.RC))); +      } +      RegInfo.D.RC = &RC; +      RegInfo.Explicit = true; +      return false; + +    case VRegInfo::GENERIC: +    case VRegInfo::REGBANK: +      return error(Loc, "register class specification on generic register"); +    } +    llvm_unreachable("Unexpected register kind"); +  } + +  // Should be a register bank or a generic register. +  const RegisterBank *RegBank = nullptr; +  if (Name != "_") { +    auto RBNameI = PFS.Names2RegBanks.find(Name); +    if (RBNameI == PFS.Names2RegBanks.end()) +      return error(Loc, "expected '_', register class, or register bank name"); +    RegBank = RBNameI->getValue(); +  } + +  lex(); + +  switch (RegInfo.Kind) { +  case VRegInfo::UNKNOWN: +  case VRegInfo::GENERIC: +  case VRegInfo::REGBANK: +    RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC; +    if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank) +      return error(Loc, "conflicting generic register banks"); +    RegInfo.D.RegBank = RegBank; +    RegInfo.Explicit = true; +    return false; + +  case VRegInfo::NORMAL: +    return error(Loc, "register bank specification on normal register"); +  } +  llvm_unreachable("Unexpected register kind"); +} +  bool MIParser::parseRegisterFlag(unsigned &Flags) {    const unsigned OldFlags = Flags;    switch (Token.kind()) { @@ -1004,6 +1070,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,      if (!TargetRegisterInfo::isVirtualRegister(Reg))        return error("subregister index expects a virtual register");    } +  if (Token.is(MIToken::colon)) { +    if (!TargetRegisterInfo::isVirtualRegister(Reg)) +      return error("register class specification expects a virtual register"); +    lex(); +    if (parseRegisterClassOrBank(*RegInfo)) +        return true; +  }    MachineRegisterInfo &MRI = MF.getRegInfo();    if ((Flags & RegState::Define) == 0) {      if (consumeIfPresent(MIToken::lparen)) { @@ -1598,6 +1671,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {    return false;  } +bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { +  assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask"); +  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); +  assert(TRI && "Expected target register info"); +  lex(); +  if (expectAndConsume(MIToken::lparen)) +    return true; + +  uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); +  while (true) { +    if (Token.isNot(MIToken::NamedRegister)) +      return error("expected a named register"); +    unsigned Reg; +    if (parseNamedRegister(Reg)) +      return true; +    lex(); +    Mask[Reg / 32] |= 1U << (Reg % 32); +    // TODO: Report an error if the same register is used more than once. 
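The mask built here packs one bit per physical register into 32-bit words, so register number 37, for example, lands in word 37 / 32 = 1 at bit 37 % 32 = 5. A tiny stand-alone check of that indexing (illustration only, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Mask[2] = {0, 0};
    unsigned Reg = 37;
    Mask[Reg / 32] |= 1U << (Reg % 32); // word 1, bit 5
    assert(Mask[1] == 0x20u && Mask[0] == 0u);
    return 0;
  }

Together with the printCustomRegMask helper added to MIRPrinter.cpp further down, such an operand should round-trip through MIR text as something like CustomRegMask(%reg1,%reg2); the exact register spelling is whatever printReg emits for the target, so treat that form as an assumption rather than output copied from a test.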
+    if (Token.isNot(MIToken::comma)) +      break; +    lex(); +  } + +  if (expectAndConsume(MIToken::rparen)) +    return true; +  Dest = MachineOperand::CreateRegMask(Mask); +  return false; +} +  bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {    assert(Token.is(MIToken::kw_liveout));    const auto *TRI = MF.getSubtarget().getRegisterInfo(); @@ -1695,8 +1797,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,        Dest = MachineOperand::CreateRegMask(RegMask);        lex();        break; -    } -    LLVM_FALLTHROUGH; +    } else +      return parseCustomRegisterMaskOperand(Dest);    default:      // FIXME: Parse the MCSymbol machine operand.      return error("expected a machine operand"); @@ -1969,6 +2071,28 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {    return false;  } +bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) { +  Order = AtomicOrdering::NotAtomic; +  if (Token.isNot(MIToken::Identifier)) +    return false; + +  Order = StringSwitch<AtomicOrdering>(Token.stringValue()) +              .Case("unordered", AtomicOrdering::Unordered) +              .Case("monotonic", AtomicOrdering::Monotonic) +              .Case("acquire", AtomicOrdering::Acquire) +              .Case("release", AtomicOrdering::Release) +              .Case("acq_rel", AtomicOrdering::AcquireRelease) +              .Case("seq_cst", AtomicOrdering::SequentiallyConsistent) +              .Default(AtomicOrdering::NotAtomic); + +  if (Order != AtomicOrdering::NotAtomic) { +    lex(); +    return false; +  } + +  return error("expected an atomic scope, ordering or a size integer literal"); +} +  bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {    if (expectAndConsume(MIToken::lparen))      return true; @@ -1986,6 +2110,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {      Flags |= MachineMemOperand::MOStore;    lex(); +  // Optional "singlethread" scope. +  SynchronizationScope Scope = SynchronizationScope::CrossThread; +  if (Token.is(MIToken::Identifier) && Token.stringValue() == "singlethread") { +    Scope = SynchronizationScope::SingleThread; +    lex(); +  } + +  // Up to two atomic orderings (cmpxchg provides guarantees on failure). 
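Based on this parser and the matching MIRPrinter.cpp change below, a machine memory operand in MIR text can now carry an optional "singlethread" scope and up to two atomic orderings (the second being the cmpxchg failure ordering) ahead of the size, roughly like the following; the spellings are illustrative, not copied from a test:

  (load 4 from %ir.p)
  (load acquire 4 from %ir.p)
  (store singlethread release 8 into %ir.q)

A cmpxchg-style operand would list the success and failure orderings back to back, e.g. "seq_cst monotonic", before the size integer.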
+  AtomicOrdering Order, FailureOrder; +  if (parseOptionalAtomicOrdering(Order)) +    return true; + +  if (parseOptionalAtomicOrdering(FailureOrder)) +    return true; +    if (Token.isNot(MIToken::IntegerLiteral))      return error("expected the size integer literal after memory operation");    uint64_t Size; @@ -2040,8 +2179,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {    }    if (expectAndConsume(MIToken::rparen))      return true; -  Dest = -      MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); +  Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range, +                                 Scope, Order, FailureOrder);    return false;  } diff --git a/lib/CodeGen/MIRParser/MIParser.h b/lib/CodeGen/MIRParser/MIParser.h index 93a4d84ba62f..9b3879cf8377 100644 --- a/lib/CodeGen/MIRParser/MIParser.h +++ b/lib/CodeGen/MIRParser/MIParser.h @@ -45,11 +45,16 @@ struct VRegInfo {    unsigned PreferredReg = 0;  }; +typedef StringMap<const TargetRegisterClass*> Name2RegClassMap; +typedef StringMap<const RegisterBank*> Name2RegBankMap; +  struct PerFunctionMIParsingState {    BumpPtrAllocator Allocator;    MachineFunction &MF;    SourceMgr *SM;    const SlotMapping &IRSlots; +  const Name2RegClassMap &Names2RegClasses; +  const Name2RegBankMap &Names2RegBanks;    DenseMap<unsigned, MachineBasicBlock *> MBBSlots;    DenseMap<unsigned, VRegInfo*> VRegInfos; @@ -59,7 +64,9 @@ struct PerFunctionMIParsingState {    DenseMap<unsigned, unsigned> JumpTableSlots;    PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, -                            const SlotMapping &IRSlots); +                            const SlotMapping &IRSlots, +                            const Name2RegClassMap &Names2RegClasses, +                            const Name2RegBankMap &Names2RegBanks);    VRegInfo &getVRegInfo(unsigned VReg);  }; diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp index 3dff1147631b..a2773cccc5db 100644 --- a/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/lib/CodeGen/MIRParser/MIRParser.cpp @@ -55,9 +55,9 @@ class MIRParserImpl {    StringMap<std::unique_ptr<yaml::MachineFunction>> Functions;    SlotMapping IRSlots;    /// Maps from register class names to register classes. -  StringMap<const TargetRegisterClass *> Names2RegClasses; +  Name2RegClassMap Names2RegClasses;    /// Maps from register bank names to register banks. -  StringMap<const RegisterBank *> Names2RegBanks; +  Name2RegBankMap Names2RegBanks;  public:    MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename, @@ -325,11 +325,15 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {      return error(Twine("no machine function information for function '") +                   MF.getName() + "' in the MIR file");    // TODO: Recreate the machine function. 
+  initNames2RegClasses(MF); +  initNames2RegBanks(MF);    const yaml::MachineFunction &YamlMF = *It->getValue();    if (YamlMF.Alignment)      MF.setAlignment(YamlMF.Alignment);    MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); +  if (YamlMF.NoVRegs) +    MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);    if (YamlMF.Legalized)      MF.getProperties().set(MachineFunctionProperties::Property::Legalized);    if (YamlMF.RegBankSelected) @@ -338,7 +342,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {    if (YamlMF.Selected)      MF.getProperties().set(MachineFunctionProperties::Property::Selected); -  PerFunctionMIParsingState PFS(MF, SM, IRSlots); +  PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses, +                                Names2RegBanks);    if (parseRegisterInfo(PFS, YamlMF))      return true;    if (!YamlMF.Constants.empty()) { @@ -362,9 +367,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {    }    PFS.SM = &SM; -  if (MF.empty()) -    return error(Twine("machine function '") + Twine(MF.getName()) + -                 "' requires at least one machine basic block in its body");    // Initialize the frame information after creating all the MBBs so that the    // MBB references in the frame information can be resolved.    if (initializeFrameInfo(PFS, YamlMF)) @@ -462,17 +464,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,      RegInfo.addLiveIn(Reg, VReg);    } -  // Parse the callee saved register mask. -  BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size()); -  if (!YamlMF.CalleeSavedRegisters) -    return false; -  for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { -    unsigned Reg = 0; -    if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) -      return error(Error, RegSource.SourceRange); -    CalleeSavedRegisterMask[Reg] = true; +  // Parse the callee saved registers (Registers that will +  // be saved for the caller). 
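With this hunk and the corresponding MIRPrinter.cpp change below, the per-function YAML round-trips the callee saved registers as an explicit list (taken from MachineRegisterInfo once its updated CSR set is initialized) instead of encoding them as an inverted used-physreg mask. The serialized form should look roughly like the line below; the key name follows the yaml::MachineFunction field and the register spelling is an assumption, not output copied from a test:

  calleeSavedRegisters: [ '%rbx', '%r12', '%r13', '%r14', '%r15', '%rbp' ]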
+  if (YamlMF.CalleeSavedRegisters) { +    SmallVector<MCPhysReg, 16> CalleeSavedRegisters; +    for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { +      unsigned Reg = 0; +      if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) +        return error(Error, RegSource.SourceRange); +      CalleeSavedRegisters.push_back(Reg); +    } +    RegInfo.setCalleeSavedRegs(CalleeSavedRegisters);    } -  RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); +    return false;  } @@ -505,14 +509,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,    }    // Compute MachineRegisterInfo::UsedPhysRegMask -  if (!YamlMF.CalleeSavedRegisters) { -    for (const MachineBasicBlock &MBB : MF) { -      for (const MachineInstr &MI : MBB) { -        for (const MachineOperand &MO : MI.operands()) { -          if (!MO.isRegMask()) -            continue; -          MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); -        } +  for (const MachineBasicBlock &MBB : MF) { +    for (const MachineInstr &MI : MBB) { +      for (const MachineOperand &MO : MI.operands()) { +        if (!MO.isRegMask()) +          continue; +        MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());        }      }    } @@ -818,7 +820,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) {  const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,                                                        StringRef Name) { -  initNames2RegClasses(MF);    auto RegClassInfo = Names2RegClasses.find(Name);    if (RegClassInfo == Names2RegClasses.end())      return nullptr; @@ -827,7 +828,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,  const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF,                                                StringRef Name) { -  initNames2RegBanks(MF);    auto RegBankInfo = Names2RegBanks.find(Name);    if (RegBankInfo == Names2RegBanks.end())      return nullptr; diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index db87092177ca..6da174a53666 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -175,6 +175,8 @@ void MIRPrinter::print(const MachineFunction &MF) {    YamlMF.Alignment = MF.getAlignment();    YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); +  YamlMF.NoVRegs = MF.getProperties().hasProperty( +      MachineFunctionProperties::Property::NoVRegs);    YamlMF.Legalized = MF.getProperties().hasProperty(        MachineFunctionProperties::Property::Legalized);    YamlMF.RegBankSelected = MF.getProperties().hasProperty( @@ -205,6 +207,25 @@ void MIRPrinter::print(const MachineFunction &MF) {    Out << YamlMF;  } +static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS, +                               const TargetRegisterInfo *TRI) { +  assert(RegMask && "Can't print an empty register mask"); +  OS << StringRef("CustomRegMask("); + +  bool IsRegInRegMaskFound = false; +  for (int I = 0, E = TRI->getNumRegs(); I < E; I++) { +    // Check whether the register is asserted in regmask. 
+    if (RegMask[I / 32] & (1u << (I % 32))) { +      if (IsRegInRegMaskFound) +        OS << ','; +      printReg(I, OS, TRI); +      IsRegInRegMaskFound = true; +    } +  } + +  OS << ')'; +} +  void MIRPrinter::convert(yaml::MachineFunction &MF,                           const MachineRegisterInfo &RegInfo,                           const TargetRegisterInfo *TRI) { @@ -239,20 +260,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,        printReg(I->second, LiveIn.VirtualRegister, TRI);      MF.LiveIns.push_back(LiveIn);    } -  // The used physical register mask is printed as an inverted callee saved -  // register mask. -  const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); -  if (UsedPhysRegMask.none()) -    return; -  std::vector<yaml::FlowStringValue> CalleeSavedRegisters; -  for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { -    if (!UsedPhysRegMask[I]) { + +  // Prints the callee saved registers. +  if (RegInfo.isUpdatedCSRsInitialized()) { +    const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs(); +    std::vector<yaml::FlowStringValue> CalleeSavedRegisters; +    for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) {        yaml::FlowStringValue Reg; -      printReg(I, Reg, TRI); +      printReg(*I, Reg, TRI);        CalleeSavedRegisters.push_back(Reg);      } +    MF.CalleeSavedRegisters = CalleeSavedRegisters;    } -  MF.CalleeSavedRegisters = CalleeSavedRegisters;  }  void MIRPrinter::convert(ModuleSlotTracker &MST, @@ -860,7 +879,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,      if (RegMaskInfo != RegisterMaskIds.end())        OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();      else -      llvm_unreachable("Can't print this machine register mask yet."); +      printCustomRegMask(Op.getRegMask(), OS, TRI);      break;    }    case MachineOperand::MO_RegisterLiveOut: { @@ -906,6 +925,9 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,         << CmpInst::getPredicateName(Pred) << ')';      break;    } +  case MachineOperand::MO_Placeholder: +    OS << "<placeholder>"; +    break;    }  } @@ -926,6 +948,15 @@ void MIPrinter::print(const MachineMemOperand &Op) {      assert(Op.isStore() && "Non load machine operand must be a store");      OS << "store ";    } + +  if (Op.getSynchScope() == SynchronizationScope::SingleThread) +    OS << "singlethread "; + +  if (Op.getOrdering() != AtomicOrdering::NotAtomic) +    OS << toIRString(Op.getOrdering()) << ' '; +  if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic) +    OS << toIRString(Op.getFailureOrdering()) << ' '; +    OS << Op.getSize();    if (const Value *Val = Op.getValue()) {      OS << (Op.isLoad() ? 
" from " : " into "); diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 3869f976854d..06112723497b 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -23,6 +23,7 @@  #include "llvm/CodeGen/SlotIndexes.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h"  #include "llvm/IR/ModuleSlotTracker.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCContext.h" @@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {  MachineBasicBlock::iterator  MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { +  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); +    iterator E = end(); -  while (I != E && (I->isPHI() || I->isPosition())) +  while (I != E && (I->isPHI() || I->isPosition() || +                    TII->isBasicBlockPrologue(*I)))      ++I;    // FIXME: This needs to change if we wish to bundle labels    // inside the bundle. @@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {  MachineBasicBlock::iterator  MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) { +  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); +    iterator E = end(); -  while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue())) +  while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() || +                    TII->isBasicBlockPrologue(*I)))      ++I;    // FIXME: This needs to change if we wish to bundle labels / dbg_values    // inside the bundle. @@ -225,7 +232,7 @@ StringRef MachineBasicBlock::getName() const {    if (const BasicBlock *LBB = getBasicBlock())      return LBB->getName();    else -    return "(null)"; +    return StringRef("", 0);  }  /// Return a hopefully unique identifier for this block. @@ -417,7 +424,7 @@ void MachineBasicBlock::updateTerminator() {    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;    SmallVector<MachineOperand, 4> Cond; -  DebugLoc DL;  // FIXME: this is nowhere +  DebugLoc DL = findBranchDebugLoc();    bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);    (void) B;    assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -485,7 +492,7 @@ void MachineBasicBlock::updateTerminator() {        // FIXME: This does not seem like a reasonable pattern to support, but it        // has been seen in the wild coming out of degenerate ARM test cases.        TII->removeBranch(*this); -   +        // Finally update the unconditional successor to be reached via a branch if        // it would not be reached by fallthrough.        if (!isLayoutSuccessor(TBB)) @@ -681,16 +688,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {    return std::next(I) == MachineFunction::const_iterator(MBB);  } -bool MachineBasicBlock::canFallThrough() { +MachineBasicBlock *MachineBasicBlock::getFallThrough() {    MachineFunction::iterator Fallthrough = getIterator();    ++Fallthrough;    // If FallthroughBlock is off the end of the function, it can't fall through.    if (Fallthrough == getParent()->end()) -    return false; +    return nullptr;    // If FallthroughBlock isn't a successor, no fallthrough is possible.    if (!isSuccessor(&*Fallthrough)) -    return false; +    return nullptr;    // Analyze the branches, if any, at the end of the block.    
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -702,25 +709,31 @@ bool MachineBasicBlock::canFallThrough() {      // is possible. The isPredicated check is needed because this code can be      // called during IfConversion, where an instruction which is normally a      // Barrier is predicated and thus no longer an actual control barrier. -    return empty() || !back().isBarrier() || TII->isPredicated(back()); +    return (empty() || !back().isBarrier() || TII->isPredicated(back())) +               ? &*Fallthrough +               : nullptr;    }    // If there is no branch, control always falls through. -  if (!TBB) return true; +  if (!TBB) return &*Fallthrough;    // If there is some explicit branch to the fallthrough block, it can obviously    // reach, even though the branch should get folded to fall through implicitly.    if (MachineFunction::iterator(TBB) == Fallthrough ||        MachineFunction::iterator(FBB) == Fallthrough) -    return true; +    return &*Fallthrough;    // If it's an unconditional branch to some block not the fall through, it    // doesn't fall through. -  if (Cond.empty()) return false; +  if (Cond.empty()) return nullptr;    // Otherwise, if it is conditional and has no explicit false block, it falls    // through. -  return FBB == nullptr; +  return (FBB == nullptr) ? &*Fallthrough : nullptr; +} + +bool MachineBasicBlock::canFallThrough() { +  return getFallThrough() != nullptr;  }  MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, @@ -1144,6 +1157,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {    return {};  } +/// Find and return the merged DebugLoc of the branch instructions of the block. +/// Return UnknownLoc if there is none. +DebugLoc +MachineBasicBlock::findBranchDebugLoc() { +  DebugLoc DL; +  auto TI = getFirstTerminator(); +  while (TI != end() && !TI->isBranch()) +    ++TI; + +  if (TI != end()) { +    DL = TI->getDebugLoc(); +    for (++TI ; TI != end() ; ++TI) +      if (TI->isBranch()) +        DL = DILocation::getMergedLocation(DL, TI->getDebugLoc()); +  } +  return DL; +} +  /// Return probability of the edge from this block to MBB.  
BranchProbability  MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 7d5124d30a04..9c7367b4c780 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -28,7 +28,6 @@ using namespace llvm;  #define DEBUG_TYPE "block-freq" -#ifndef NDEBUG  static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(      "view-machine-block-freq-propagation-dags", cl::Hidden, @@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(                            "integer fractional block frequency representation."),                 clEnumValN(GVDT_Count, "count", "display a graph using the real "                                                 "profile count if available."))); +// Similar option above, but used to control BFI display only after MBP pass +cl::opt<GVDAGType> ViewBlockLayoutWithBFI( +    "view-block-layout-with-bfi", cl::Hidden, +    cl::desc( +        "Pop up a window to show a dag displaying MBP layout and associated " +        "block frequencies of the CFG."), +    cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), +               clEnumValN(GVDT_Fraction, "fraction", +                          "display a graph using the " +                          "fractional block frequency representation."), +               clEnumValN(GVDT_Integer, "integer", +                          "display a graph using the raw " +                          "integer fractional block frequency representation."), +               clEnumValN(GVDT_Count, "count", +                          "display a graph using the real " +                          "profile count if available."))); +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=  extern cl::opt<std::string> ViewBlockFreqFuncName; +// Command line option to specify hot frequency threshold. +// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-hot-freq-perc=  extern cl::opt<unsigned> ViewHotFreqPercent; +static GVDAGType getGVDT() { +  if (ViewBlockLayoutWithBFI != GVDT_None) +    return ViewBlockLayoutWithBFI; + +  return ViewMachineBlockFreqPropagationDAG; +} +  namespace llvm {  template <> struct GraphTraits<MachineBlockFrequencyInfo *> { @@ -80,12 +106,32 @@ template <>  struct DOTGraphTraits<MachineBlockFrequencyInfo *>      : public MBFIDOTGraphTraitsBase {    explicit DOTGraphTraits(bool isSimple = false) -      : MBFIDOTGraphTraitsBase(isSimple) {} +      : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {} + +  const MachineFunction *CurFunc; +  DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;    std::string getNodeLabel(const MachineBasicBlock *Node,                             const MachineBlockFrequencyInfo *Graph) { -    return MBFIDOTGraphTraitsBase::getNodeLabel( -        Node, Graph, ViewMachineBlockFreqPropagationDAG); + +    int layout_order = -1; +    // Attach additional ordering information if 'isSimple' is false. 
+    if (!isSimple()) { +      const MachineFunction *F = Node->getParent(); +      if (!CurFunc || F != CurFunc) { +        if (CurFunc) +          LayoutOrderMap.clear(); + +        CurFunc = F; +        int O = 0; +        for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) { +          LayoutOrderMap[&*MBI] = O; +        } +      } +      layout_order = LayoutOrderMap[Node]; +    } +    return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(), +                                                layout_order);    }    std::string getNodeAttributes(const MachineBasicBlock *Node, @@ -102,7 +148,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>  };  } // end namespace llvm -#endif  INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",                        "Machine Block Frequency Analysis", true, true) @@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {    MachineFunctionPass::getAnalysisUsage(AU);  } -bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { -  MachineBranchProbabilityInfo &MBPI = -      getAnalysis<MachineBranchProbabilityInfo>(); -  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); +void MachineBlockFrequencyInfo::calculate( +    const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, +    const MachineLoopInfo &MLI) {    if (!MBFI)      MBFI.reset(new ImplType);    MBFI->calculate(F, MBPI, MLI); -#ifndef NDEBUG    if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&        (ViewBlockFreqFuncName.empty() ||         F.getName().equals(ViewBlockFreqFuncName))) { -    view(); +    view("MachineBlockFrequencyDAGS." + F.getName());    } -#endif +} + +bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { +  MachineBranchProbabilityInfo &MBPI = +      getAnalysis<MachineBranchProbabilityInfo>(); +  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); +  calculate(F, MBPI, MLI);    return false;  } @@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }  /// Pop up a ghostview window with the current block frequency propagation  /// rendered using dot. -void MachineBlockFrequencyInfo::view() const { -// This code is only for debugging. -#ifndef NDEBUG -  ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), -            "MachineBlockFrequencyDAGs"); -#else -  errs() << "MachineBlockFrequencyInfo::view is only available in debug builds " -            "on systems with Graphviz or gv!\n"; -#endif // NDEBUG +void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const { +  // This code is only for debugging. 
+  ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple);  }  BlockFrequency diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 40e3840e6b0b..4cfc128a8c1d 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -32,14 +32,15 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineLoopInfo.h"  #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h"  #include "llvm/CodeGen/TailDuplicator.h"  #include "llvm/Support/Allocator.h"  #include "llvm/Support/CommandLine.h" @@ -49,6 +50,8 @@  #include "llvm/Target/TargetLowering.h"  #include "llvm/Target/TargetSubtargetInfo.h"  #include <algorithm> +#include <functional> +#include <utility>  using namespace llvm;  #define DEBUG_TYPE "block-placement" @@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias(  // Definition:  // - Outlining: placement of a basic block outside the chain or hot path. -static cl::opt<bool> OutlineOptionalBranches( -    "outline-optional-branches", -    cl::desc("Outlining optional branches will place blocks that are optional " -              "branches, i.e. branches with a common post dominator, outside " -              "the hot path or chain"), -    cl::init(false), cl::Hidden); - -static cl::opt<unsigned> OutlineOptionalThreshold( -    "outline-optional-threshold", -    cl::desc("Don't outline optional branches that are a single block with an " -             "instruction count below this threshold"), -    cl::init(4), cl::Hidden); -  static cl::opt<unsigned> LoopToColdBlockRatio(      "loop-to-cold-block-ratio",      cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " @@ -136,20 +126,47 @@ BranchFoldPlacement("branch-fold-placement",                cl::init(true), cl::Hidden);  // Heuristic for tail duplication. -static cl::opt<unsigned> TailDuplicatePlacementThreshold( +static cl::opt<unsigned> TailDupPlacementThreshold(      "tail-dup-placement-threshold",      cl::desc("Instruction cutoff for tail duplication during layout. "               "Tail merging during layout is forced to have a threshold "               "that won't conflict."), cl::init(2),      cl::Hidden); +// Heuristic for tail duplication. +static cl::opt<unsigned> TailDupPlacementPenalty( +    "tail-dup-placement-penalty", +    cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " +             "Copying can increase fallthrough, but it also increases icache " +             "pressure. This parameter controls the penalty to account for that. " +             "Percent as integer."), +    cl::init(2), +    cl::Hidden); + +// Heuristic for triangle chains. +static cl::opt<unsigned> TriangleChainCount( +    "triangle-chain-count", +    cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " +             "triangle tail duplication heuristic to kick in. 
0 to disable."), +    cl::init(2), +    cl::Hidden); +  extern cl::opt<unsigned> StaticLikelyProb;  extern cl::opt<unsigned> ProfileLikelyProb; +// Internal option used to control BFI display only after MBP pass. +// Defined in CodeGen/MachineBlockFrequencyInfo.cpp: +// -view-block-layout-with-bfi= +extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI; + +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name= +extern cl::opt<std::string> ViewBlockFreqFuncName; +  namespace {  class BlockChain;  /// \brief Type for our function-wide basic block -> block chain mapping. -typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType; +typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;  }  namespace { @@ -193,12 +210,15 @@ public:    /// \brief Iterator over blocks within the chain.    typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator; +  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;    /// \brief Beginning of blocks within the chain.    iterator begin() { return Blocks.begin(); } +  const_iterator begin() const { return Blocks.begin(); }    /// \brief End of blocks within the chain.    iterator end() { return Blocks.end(); } +  const_iterator end() const { return Blocks.end(); }    bool remove(MachineBasicBlock* BB) {      for(iterator i = begin(); i != end(); ++i) { @@ -264,12 +284,28 @@ public:  namespace {  class MachineBlockPlacement : public MachineFunctionPass {    /// \brief A typedef for a block filter set. -  typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet; +  typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet; + +  /// Pair struct containing basic block and taildup profitiability +  struct BlockAndTailDupResult { +    MachineBasicBlock *BB; +    bool ShouldTailDup; +  }; + +  /// Triple struct containing edge weight and the edge. +  struct WeightedEdge { +    BlockFrequency Weight; +    MachineBasicBlock *Src; +    MachineBasicBlock *Dest; +  };    /// \brief work lists of blocks that are ready to be laid out    SmallVector<MachineBasicBlock *, 16> BlockWorkList;    SmallVector<MachineBasicBlock *, 16> EHPadWorkList; +  /// Edges that have already been computed as optimal. +  DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges; +    /// \brief Machine Function    MachineFunction *F; @@ -294,7 +330,7 @@ class MachineBlockPlacement : public MachineFunctionPass {    const TargetLoweringBase *TLI;    /// \brief A handle to the post dominator tree. -  MachineDominatorTree *MDT; +  MachinePostDominatorTree *MPDT;    /// \brief Duplicator used to duplicate tails during placement.    /// @@ -303,10 +339,6 @@ class MachineBlockPlacement : public MachineFunctionPass {    /// must be done inline.    TailDuplicator TailDup; -  /// \brief A set of blocks that are unavoidably execute, i.e. they dominate -  /// all terminators of the MachineFunction. -  SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks; -    /// \brief Allocator and owner of BlockChain structures.    ///    /// We build BlockChains lazily while processing the loop structure of @@ -322,7 +354,7 @@ class MachineBlockPlacement : public MachineFunctionPass {    /// BlockChain it participates in, if any. We use it to, among other things,    /// allow implicitly defining edges between chains as the existing edges    /// between basic blocks. 
-  DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; +  DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;  #ifndef NDEBUG    /// The set of basic blocks that have terminators that cannot be fully @@ -334,75 +366,107 @@ class MachineBlockPlacement : public MachineFunctionPass {    /// Decrease the UnscheduledPredecessors count for all blocks in chain, and    /// if the count goes to 0, add them to the appropriate work list. -  void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, -                           const BlockFilterSet *BlockFilter = nullptr); +  void markChainSuccessors( +      const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, +      const BlockFilterSet *BlockFilter = nullptr);    /// Decrease the UnscheduledPredecessors count for a single block, and    /// if the count goes to 0, add them to the appropriate work list.    void markBlockSuccessors( -      BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, +      const BlockChain &Chain, const MachineBasicBlock *BB, +      const MachineBasicBlock *LoopHeaderBB,        const BlockFilterSet *BlockFilter = nullptr); -    BranchProbability -  collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, -                          const BlockFilterSet *BlockFilter, -                          SmallVector<MachineBasicBlock *, 4> &Successors); -  bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ, -                                 BlockChain &Chain, -                                 const BlockFilterSet *BlockFilter, -                                 BranchProbability SuccProb, -                                 BranchProbability HotProb); +  collectViableSuccessors( +      const MachineBasicBlock *BB, const BlockChain &Chain, +      const BlockFilterSet *BlockFilter, +      SmallVector<MachineBasicBlock *, 4> &Successors); +  bool shouldPredBlockBeOutlined( +      const MachineBasicBlock *BB, const MachineBasicBlock *Succ, +      const BlockChain &Chain, const BlockFilterSet *BlockFilter, +      BranchProbability SuccProb, BranchProbability HotProb);    bool repeatedlyTailDuplicateBlock(        MachineBasicBlock *BB, MachineBasicBlock *&LPred, -      MachineBasicBlock *LoopHeaderBB, +      const MachineBasicBlock *LoopHeaderBB,        BlockChain &Chain, BlockFilterSet *BlockFilter,        MachineFunction::iterator &PrevUnplacedBlockIt); -  bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, -                               const BlockChain &Chain, -                               BlockFilterSet *BlockFilter, -                               MachineFunction::iterator &PrevUnplacedBlockIt, -                               bool &DuplicatedToPred); -  bool -  hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, -                             BlockChain &SuccChain, BranchProbability SuccProb, -                             BranchProbability RealSuccProb, BlockChain &Chain, -                             const BlockFilterSet *BlockFilter); -  MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, -                                         BlockChain &Chain, -                                         const BlockFilterSet *BlockFilter); -  MachineBasicBlock * -  selectBestCandidateBlock(BlockChain &Chain, -                           SmallVectorImpl<MachineBasicBlock *> &WorkList); -  MachineBasicBlock * -  getFirstUnplacedBlock(const BlockChain &PlacedChain, -                        
MachineFunction::iterator &PrevUnplacedBlockIt, -                        const BlockFilterSet *BlockFilter); +  bool maybeTailDuplicateBlock( +      MachineBasicBlock *BB, MachineBasicBlock *LPred, +      BlockChain &Chain, BlockFilterSet *BlockFilter, +      MachineFunction::iterator &PrevUnplacedBlockIt, +      bool &DuplicatedToPred); +  bool hasBetterLayoutPredecessor( +      const MachineBasicBlock *BB, const MachineBasicBlock *Succ, +      const BlockChain &SuccChain, BranchProbability SuccProb, +      BranchProbability RealSuccProb, const BlockChain &Chain, +      const BlockFilterSet *BlockFilter); +  BlockAndTailDupResult selectBestSuccessor( +      const MachineBasicBlock *BB, const BlockChain &Chain, +      const BlockFilterSet *BlockFilter); +  MachineBasicBlock *selectBestCandidateBlock( +      const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList); +  MachineBasicBlock *getFirstUnplacedBlock( +      const BlockChain &PlacedChain, +      MachineFunction::iterator &PrevUnplacedBlockIt, +      const BlockFilterSet *BlockFilter);    /// \brief Add a basic block to the work list if it is appropriate.    ///    /// If the optional parameter BlockFilter is provided, only MBB    /// present in the set will be added to the worklist. If nullptr    /// is provided, no filtering occurs. -  void fillWorkLists(MachineBasicBlock *MBB, +  void fillWorkLists(const MachineBasicBlock *MBB,                       SmallPtrSetImpl<BlockChain *> &UpdatedPreds,                       const BlockFilterSet *BlockFilter); -  void buildChain(MachineBasicBlock *BB, BlockChain &Chain, +  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,                    BlockFilterSet *BlockFilter = nullptr); -  MachineBasicBlock *findBestLoopTop(MachineLoop &L, -                                     const BlockFilterSet &LoopBlockSet); -  MachineBasicBlock *findBestLoopExit(MachineLoop &L, -                                      const BlockFilterSet &LoopBlockSet); -  BlockFilterSet collectLoopBlockSet(MachineLoop &L); -  void buildLoopChains(MachineLoop &L); -  void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, -                  const BlockFilterSet &LoopBlockSet); -  void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, -                             const BlockFilterSet &LoopBlockSet); -  void collectMustExecuteBBs(); +  MachineBasicBlock *findBestLoopTop( +      const MachineLoop &L, const BlockFilterSet &LoopBlockSet); +  MachineBasicBlock *findBestLoopExit( +      const MachineLoop &L, const BlockFilterSet &LoopBlockSet); +  BlockFilterSet collectLoopBlockSet(const MachineLoop &L); +  void buildLoopChains(const MachineLoop &L); +  void rotateLoop( +      BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, +      const BlockFilterSet &LoopBlockSet); +  void rotateLoopWithProfile( +      BlockChain &LoopChain, const MachineLoop &L, +      const BlockFilterSet &LoopBlockSet);    void buildCFGChains();    void optimizeBranches();    void alignBlocks(); +  /// Returns true if a block should be tail-duplicated to increase fallthrough +  /// opportunities. +  bool shouldTailDuplicate(MachineBasicBlock *BB); +  /// Check the edge frequencies to see if tail duplication will increase +  /// fallthroughs. +  bool isProfitableToTailDup( +    const MachineBasicBlock *BB, const MachineBasicBlock *Succ, +    BranchProbability AdjustedSumProb, +    const BlockChain &Chain, const BlockFilterSet *BlockFilter); +  /// Check for a trellis layout. 
+  bool isTrellis(const MachineBasicBlock *BB, +                 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, +                 const BlockChain &Chain, const BlockFilterSet *BlockFilter); +  /// Get the best successor given a trellis layout. +  BlockAndTailDupResult getBestTrellisSuccessor( +      const MachineBasicBlock *BB, +      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, +      BranchProbability AdjustedSumProb, const BlockChain &Chain, +      const BlockFilterSet *BlockFilter); +  /// Get the best pair of non-conflicting edges. +  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges( +      const MachineBasicBlock *BB, +      MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges); +  /// Returns true if a block can tail duplicate into all unplaced +  /// predecessors. Filters based on loop. +  bool canTailDuplicateUnplacedPreds( +      const MachineBasicBlock *BB, MachineBasicBlock *Succ, +      const BlockChain &Chain, const BlockFilterSet *BlockFilter); +  /// Find chains of triangles to tail-duplicate where a global analysis works, +  /// but a local analysis would not find them. +  void precomputeTriangleChains();  public:    static char ID; // Pass identification, replacement for typeid @@ -415,7 +479,8 @@ public:    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<MachineBranchProbabilityInfo>();      AU.addRequired<MachineBlockFrequencyInfo>(); -    AU.addRequired<MachineDominatorTree>(); +    if (TailDupPlacement) +      AU.addRequired<MachinePostDominatorTree>();      AU.addRequired<MachineLoopInfo>();      AU.addRequired<TargetPassConfig>();      MachineFunctionPass::getAnalysisUsage(AU); @@ -429,7 +494,7 @@ INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",                        "Branch Probability Basic Block Placement", false, false)  INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)  INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)  INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)  INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",                      "Branch Probability Basic Block Placement", false, false) @@ -438,7 +503,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",  /// \brief Helper to print the name of a MBB.  ///  /// Only used by debug logging. -static std::string getBlockName(MachineBasicBlock *BB) { +static std::string getBlockName(const MachineBasicBlock *BB) {    std::string Result;    raw_string_ostream OS(Result);    OS << "BB#" << BB->getNumber(); @@ -455,7 +520,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {  /// having one fewer active predecessor. It also adds any successors of this  /// chain which reach the zero-predecessor state to the appropriate worklist.  void MachineBlockPlacement::markChainSuccessors( -    BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, +    const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,      const BlockFilterSet *BlockFilter) {    // Walk all the blocks in this chain, marking their successors as having    // a predecessor placed. @@ -471,8 +536,8 @@ void MachineBlockPlacement::markChainSuccessors(  /// and was duplicated into the chain end, we need to redo markBlockSuccessors  /// for just that block.  
void MachineBlockPlacement::markBlockSuccessors( -    BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, -    const BlockFilterSet *BlockFilter) { +    const BlockChain &Chain, const MachineBasicBlock *MBB, +    const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {    // Add any successors for which this is the only un-placed in-loop    // predecessor to the worklist as a viable candidate for CFG-neutral    // placement. No subsequent placement of this block will violate the CFG @@ -504,7 +569,8 @@ void MachineBlockPlacement::markBlockSuccessors(  /// the total branch probability of edges from \p BB to those  /// blocks.  BranchProbability MachineBlockPlacement::collectViableSuccessors( -    MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, +    const MachineBasicBlock *BB, const BlockChain &Chain, +    const BlockFilterSet *BlockFilter,      SmallVector<MachineBasicBlock *, 4> &Successors) {    // Adjust edge probabilities by excluding edges pointing to blocks that is    // either not in BlockFilter or is already in the current chain. Consider the @@ -561,46 +627,573 @@ getAdjustedProbability(BranchProbability OrigProb,    return SuccProb;  } -/// When the option OutlineOptionalBranches is on, this method -/// checks if the fallthrough candidate block \p Succ (of block -/// \p BB) also has other unscheduled predecessor blocks which -/// are also successors of \p BB (forming triangular shape CFG). -/// If none of such predecessors are small, it returns true. -/// The caller can choose to select \p Succ as the layout successors -/// so that \p Succ's predecessors (optional branches) can be -/// outlined. -/// FIXME: fold this with more general layout cost analysis. -bool MachineBlockPlacement::shouldPredBlockBeOutlined( -    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, -    const BlockFilterSet *BlockFilter, BranchProbability SuccProb, -    BranchProbability HotProb) { -  if (!OutlineOptionalBranches) +/// Check if \p BB has exactly the successors in \p Successors. +static bool +hasSameSuccessors(MachineBasicBlock &BB, +                  SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { +  if (BB.succ_size() != Successors.size())      return false; -  // If we outline optional branches, look whether Succ is unavoidable, i.e. -  // dominates all terminators of the MachineFunction. If it does, other -  // successors must be optional. Don't do this for cold branches. -  if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { -    for (MachineBasicBlock *Pred : Succ->predecessors()) { -      // Check whether there is an unplaced optional branch. -      if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || -          BlockToChain[Pred] == &Chain) +  // We don't want to count self-loops +  if (Successors.count(&BB)) +    return false; +  for (MachineBasicBlock *Succ : BB.successors()) +    if (!Successors.count(Succ)) +      return false; +  return true; +} + +/// Check if a block should be tail duplicated to increase fallthrough +/// opportunities. +/// \p BB Block to check. +bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { +  // Blocks with single successors don't create additional fallthrough +  // opportunities. Don't duplicate them. TODO: When conditional exits are +  // analyzable, allow them to be duplicated. 
+  bool IsSimple = TailDup.isSimpleBB(BB); + +  if (BB->succ_size() == 1) +    return false; +  return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Compare 2 BlockFrequency's with a small penalty for \p A. +/// In order to be conservative, we apply a X% penalty to account for +/// increased icache pressure and static heuristics. For small frequencies +/// we use only the numerators to improve accuracy. For simplicity, we assume the +/// penalty is less than 100% +/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. +static bool greaterWithBias(BlockFrequency A, BlockFrequency B, +                            uint64_t EntryFreq) { +  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); +  BlockFrequency Gain = A - B; +  return (Gain / ThresholdProb).getFrequency() >= EntryFreq; +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. It only makes sense to call this function when +/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is +/// always locally profitable if we would have picked \p Succ without +/// considering duplication. +bool MachineBlockPlacement::isProfitableToTailDup( +    const MachineBasicBlock *BB, const MachineBasicBlock *Succ, +    BranchProbability QProb, +    const BlockChain &Chain, const BlockFilterSet *BlockFilter) { +  // We need to do a probability calculation to make sure this is profitable. +  // First: does succ have a successor that post-dominates? This affects the +  // calculation. The 2 relevant cases are: +  //    BB         BB +  //    | \Qout    | \Qout +  //   P|  C       |P C +  //    =   C'     =   C' +  //    |  /Qin    |  /Qin +  //    | /        | / +  //    Succ       Succ +  //    / \        | \  V +  //  U/   =V      |U \ +  //  /     \      =   D +  //  D      E     |  / +  //               | / +  //               |/ +  //               PDom +  //  '=' : Branch taken for that CFG edge +  // In the second case, Placing Succ while duplicating it into C prevents the +  // fallthrough of Succ into either D or PDom, because they now have C as an +  // unplaced predecessor + +  // Start by figuring out which case we fall into +  MachineBasicBlock *PDom = nullptr; +  SmallVector<MachineBasicBlock *, 4> SuccSuccs; +  // Only scan the relevant successors +  auto AdjustedSuccSumProb = +      collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs); +  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ); +  auto BBFreq = MBFI->getBlockFreq(BB); +  auto SuccFreq = MBFI->getBlockFreq(Succ); +  BlockFrequency P = BBFreq * PProb; +  BlockFrequency Qout = BBFreq * QProb; +  uint64_t EntryFreq = MBFI->getEntryFreq(); +  // If there are no more successors, it is profitable to copy, as it strictly +  // increases fallthrough. +  if (SuccSuccs.size() == 0) +    return greaterWithBias(P, Qout, EntryFreq); + +  auto BestSuccSucc = BranchProbability::getZero(); +  // Find the PDom or the best Succ if no PDom exists. +  for (MachineBasicBlock *SuccSucc : SuccSuccs) { +    auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); +    if (Prob > BestSuccSucc) +      BestSuccSucc = Prob; +    if (PDom == nullptr) +      if (MPDT->dominates(SuccSucc, Succ)) { +        PDom = SuccSucc; +        break; +      } +  } +  // For the comparisons, we need to know Succ's best incoming edge that isn't +  // from BB. 
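A minimal standalone sketch of the bias check in greaterWithBias above, using plain integers in place of LLVM's BlockFrequency and BranchProbability; the helper name and the example penalty are illustrative assumptions, not part of the patch:

  #include <cstdint>

  // Mirrors the comparison above: A only beats B when the gain is at least
  // PenaltyPercent percent of the function's entry frequency.
  static bool greaterWithBiasSketch(uint64_t A, uint64_t B, uint64_t EntryFreq,
                                    uint64_t PenaltyPercent) {
    uint64_t Gain = A > B ? A - B : 0;
    // Gain / (PenaltyPercent / 100) >= EntryFreq
    //   <=>  Gain * 100 >= EntryFreq * PenaltyPercent
    return Gain * 100 >= EntryFreq * PenaltyPercent;
  }

With a 2% penalty, for example, a layout change must recover taken-branch frequency worth at least 2% of the entry frequency before it counts as an improvement.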
+  auto SuccBestPred = BlockFrequency(0); +  for (MachineBasicBlock *SuccPred : Succ->predecessors()) { +    if (SuccPred == Succ || SuccPred == BB +        || BlockToChain[SuccPred] == &Chain +        || (BlockFilter && !BlockFilter->count(SuccPred))) +      continue; +    auto Freq = MBFI->getBlockFreq(SuccPred) +        * MBPI->getEdgeProbability(SuccPred, Succ); +    if (Freq > SuccBestPred) +      SuccBestPred = Freq; +  } +  // Qin is Succ's best unplaced incoming edge that isn't BB +  BlockFrequency Qin = SuccBestPred; +  // If it doesn't have a post-dominating successor, here is the calculation: +  //    BB        BB +  //    | \Qout   |  \ +  //   P|  C      |   = +  //    =   C'    |    C +  //    |  /Qin   |     | +  //    | /       |     C' (+Succ) +  //    Succ      Succ /| +  //    / \       |  \/ | +  //  U/   =V     |  == | +  //  /     \     | /  \| +  //  D      E    D     E +  //  '=' : Branch taken for that CFG edge +  //  Cost in the first case is: P + V +  //  For this calculation, we always assume P > Qout. If Qout > P +  //  The result of this function will be ignored at the caller. +  //  Let F = SuccFreq - Qin +  //  Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V + +  if (PDom == nullptr || !Succ->isSuccessor(PDom)) { +    BranchProbability UProb = BestSuccSucc; +    BranchProbability VProb = AdjustedSuccSumProb - UProb; +    BlockFrequency F = SuccFreq - Qin; +    BlockFrequency V = SuccFreq * VProb; +    BlockFrequency QinU = std::min(Qin, F) * UProb; +    BlockFrequency BaseCost = P + V; +    BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; +    return greaterWithBias(BaseCost, DupCost, EntryFreq); +  } +  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); +  BranchProbability VProb = AdjustedSuccSumProb - UProb; +  BlockFrequency U = SuccFreq * UProb; +  BlockFrequency V = SuccFreq * VProb; +  BlockFrequency F = SuccFreq - Qin; +  // If there is a post-dominating successor, here is the calculation: +  // BB         BB                 BB          BB +  // | \Qout    |   \               | \Qout     |  \ +  // |P C       |    =              |P C        |   = +  // =   C'     |P    C             =   C'      |P   C +  // |  /Qin    |      |            |  /Qin     |     | +  // | /        |      C' (+Succ)   | /         |     C' (+Succ) +  // Succ       Succ  /|            Succ        Succ /| +  // | \  V     |   \/ |            | \  V      |  \/ | +  // |U \       |U  /\ =?           |U =        |U /\ | +  // =   D      = =  =?|            |   D       | =  =| +  // |  /       |/     D            |  /        |/    D +  // | /        |     /             | =         |    / +  // |/         |    /              |/          |   = +  // Dom         Dom                Dom         Dom +  //  '=' : Branch taken for that CFG edge +  // The cost for taken branches in the first case is P + U +  // Let F = SuccFreq - Qin +  // The cost in the second case (assuming independence), given the layout: +  // BB, Succ, (C+Succ), D, Dom or the layout: +  // BB, Succ, D, Dom, (C+Succ) +  // is Qout + max(F, Qin) * U + min(F, Qin) +  // compare P + U vs Qout + P * U + Qin. +  // +  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. 
+  // +  // For the 3rd case, the cost is P + 2 * V +  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V +  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V +  if (UProb > AdjustedSuccSumProb / 2 && +      !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, +                                  Chain, BlockFilter)) +    // Cases 3 & 4 +    return greaterWithBias( +        (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), +        EntryFreq); +  // Cases 1 & 2 +  return greaterWithBias((P + U), +                         (Qout + std::min(Qin, F) * AdjustedSuccSumProb + +                          std::max(Qin, F) * UProb), +                         EntryFreq); +} + +/// Check for a trellis layout. \p BB is the upper part of a trellis if its +/// successors form the lower part of a trellis. A successor set S forms the +/// lower part of a trellis if all of the predecessors of S are either in S or +/// have all of S as successors. We ignore trellises where BB doesn't have 2 +/// successors because for fewer than 2, it's trivial, and for 3 or greater they +/// are very uncommon and complex to compute optimally. Allowing edges within S +/// is not strictly a trellis, but the same algorithm works, so we allow it. +bool MachineBlockPlacement::isTrellis( +    const MachineBasicBlock *BB, +    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, +    const BlockChain &Chain, const BlockFilterSet *BlockFilter) { +  // Technically BB could form a trellis with branching factor higher than 2. +  // But that's extremely uncommon. +  if (BB->succ_size() != 2 || ViableSuccs.size() != 2) +    return false; + +  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(), +                                                       BB->succ_end()); +  // To avoid reviewing the same predecessors twice. +  SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds; + +  for (MachineBasicBlock *Succ : ViableSuccs) { +    int PredCount = 0; +    for (auto SuccPred : Succ->predecessors()) { +      // Allow triangle successors, but don't count them. +      if (Successors.count(SuccPred)) { +        // Make sure that it is actually a triangle. +        for (MachineBasicBlock *CheckSucc : SuccPred->successors()) +          if (!Successors.count(CheckSucc)) +            return false;          continue; -      // Check whether the optional branch has exactly one BB. -      if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) +      } +      const BlockChain *PredChain = BlockToChain[SuccPred]; +      if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) || +          PredChain == &Chain || PredChain == BlockToChain[Succ])          continue; -      // Check whether the optional branch is small. -      if (Pred->size() < OutlineOptionalThreshold) +      ++PredCount; +      // Perform the successor check only once. +      if (!SeenPreds.insert(SuccPred).second) +        continue; +      if (!hasSameSuccessors(*SuccPred, Successors))          return false;      } -    return true; -  } else +    // If one of the successors has only BB as a predecessor, it is not a +    // trellis. +    if (PredCount < 1) +      return false; +  } +  return true; +} + +/// Pick the highest total weight pair of edges that can both be laid out. +/// The edges in \p Edges[0] are assumed to have a different destination than +/// the edges in \p Edges[1]. 
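The trellis property that isTrellis checks can be seen in isolation on a toy graph. This is a simplified sketch under stated assumptions: it uses plain standard containers instead of MachineBasicBlock and ignores the chain and BlockFilter bookkeeping that the real code performs:

  #include <map>
  #include <set>
  #include <string>

  // block name -> successor set / predecessor set of a toy CFG
  using EdgeMap = std::map<std::string, std::set<std::string>>;

  // BB's two viable successors S1 and S2 form the lower half of a trellis if
  // every predecessor of S1 or S2 is either in {S1, S2} itself or branches to
  // exactly {S1, S2}.
  static bool isTrellisSketch(const EdgeMap &Succs, const EdgeMap &Preds,
                              const std::string &S1, const std::string &S2) {
    const std::set<std::string> Lower = {S1, S2};
    for (const std::string &S : Lower)
      for (const std::string &P : Preds.at(S)) {
        if (Lower.count(P))
          continue;                 // edges inside the lower part are allowed
        if (Succs.at(P) != Lower)   // P must have exactly {S1, S2} as successors
          return false;
      }
    return true;
  }

For instance, BB plus one other unplaced block C that both branch to exactly {S1, S2} satisfy the check, which is the two-by-two diamond the comments above describe.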
Simple counting shows that the best pair is either +/// the individual highest weight edges to the 2 different destinations, or in +/// case of a conflict, one of them should be replaced with a 2nd best edge. +std::pair<MachineBlockPlacement::WeightedEdge, +          MachineBlockPlacement::WeightedEdge> +MachineBlockPlacement::getBestNonConflictingEdges( +    const MachineBasicBlock *BB, +    MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> +        Edges) { +  // Sort the edges, and then for each successor, find the best incoming +  // predecessor. If the best incoming predecessors aren't the same, +  // then that is clearly the best layout. If there is a conflict, one of the +  // successors will have to fallthrough from the second best predecessor. We +  // compare which combination is better overall. + +  // Sort for highest frequency. +  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; }; + +  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp); +  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp); +  auto BestA = Edges[0].begin(); +  auto BestB = Edges[1].begin(); +  // Arrange for the correct answer to be in BestA and BestB +  // If the 2 best edges don't conflict, the answer is already there. +  if (BestA->Src == BestB->Src) { +    // Compare the total fallthrough of (Best + Second Best) for both pairs +    auto SecondBestA = std::next(BestA); +    auto SecondBestB = std::next(BestB); +    BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight; +    BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight; +    if (BestAScore < BestBScore) +      BestA = SecondBestA; +    else +      BestB = SecondBestB; +  } +  // Arrange for the BB edge to be in BestA if it exists. +  if (BestB->Src == BB) +    std::swap(BestA, BestB); +  return std::make_pair(*BestA, *BestB); +} + +/// Get the best successor from \p BB based on \p BB being part of a trellis. +/// We only handle trellises with 2 successors, so the algorithm is +/// straightforward: Find the best pair of edges that don't conflict. We find +/// the best incoming edge for each successor in the trellis. If those conflict, +/// we consider which of them should be replaced with the second best. +/// Upon return the two best edges will be in \p BestEdges. If one of the edges +/// comes from \p BB, it will be in \p BestEdges[0] +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::getBestTrellisSuccessor( +    const MachineBasicBlock *BB, +    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, +    BranchProbability AdjustedSumProb, const BlockChain &Chain, +    const BlockFilterSet *BlockFilter) { + +  BlockAndTailDupResult Result = {nullptr, false}; +  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), +                                                       BB->succ_end()); + +  // We assume size 2 because it's common. For general n, we would have to do +  // the Hungarian algorithm, but it's not worth the complexity because more +  // than 2 successors is fairly uncommon, and a trellis even more so. +  if (Successors.size() != 2 || ViableSuccs.size() != 2) +    return Result; + +  // Collect the edge frequencies of all edges that form the trellis. 
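A concrete instance of the conflict resolution performed by getBestNonConflictingEdges, with invented weights for illustration: suppose S1's incoming edges are A->S1 with weight 30 and B->S1 with weight 20, while S2's are A->S2 with weight 25 and C->S2 with weight 10. The best edge into each successor comes from the same block A, so the two resolutions are scored as best-plus-second-best: 30 + 10 = 40 for keeping A->S1, versus 25 + 20 = 45 for keeping A->S2. The second score wins, so the pair returned is B->S1 together with A->S2.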
+  SmallVector<WeightedEdge, 8> Edges[2]; +  int SuccIndex = 0; +  for (auto Succ : ViableSuccs) { +    for (MachineBasicBlock *SuccPred : Succ->predecessors()) { +      // Skip any placed predecessors that are not BB +      if (SuccPred != BB) +        if ((BlockFilter && !BlockFilter->count(SuccPred)) || +            BlockToChain[SuccPred] == &Chain || +            BlockToChain[SuccPred] == BlockToChain[Succ]) +          continue; +      BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) * +                                MBPI->getEdgeProbability(SuccPred, Succ); +      Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ}); +    } +    ++SuccIndex; +  } + +  // Pick the best combination of 2 edges from all the edges in the trellis. +  WeightedEdge BestA, BestB; +  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); + +  if (BestA.Src != BB) { +    // If we have a trellis, and BB doesn't have the best fallthrough edges, +    // we shouldn't choose any successor. We've already looked and there's a +    // better fallthrough edge for all the successors. +    DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n"); +    return Result; +  } + +  // Did we pick the triangle edge? If tail-duplication is profitable, do +  // that instead. Otherwise merge the triangle edge now while we know it is +  // optimal. +  if (BestA.Dest == BestB.Src) { +    // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2 +    // would be better. +    MachineBasicBlock *Succ1 = BestA.Dest; +    MachineBasicBlock *Succ2 = BestB.Dest; +    // Check to see if tail-duplication would be profitable. +    if (TailDupPlacement && shouldTailDuplicate(Succ2) && +        canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && +        isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), +                              Chain, BlockFilter)) { +      DEBUG(BranchProbability Succ2Prob = getAdjustedProbability( +                MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb); +            dbgs() << "    Selected: " << getBlockName(Succ2) +                   << ", probability: " << Succ2Prob << " (Tail Duplicate)\n"); +      Result.BB = Succ2; +      Result.ShouldTailDup = true; +      return Result; +    } +  } +  // We have already computed the optimal edge for the other side of the +  // trellis. +  ComputedEdges[BestB.Src] = { BestB.Dest, false }; + +  auto TrellisSucc = BestA.Dest; +  DEBUG(BranchProbability SuccProb = getAdjustedProbability( +            MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb); +        dbgs() << "    Selected: " << getBlockName(TrellisSucc) +               << ", probability: " << SuccProb << " (Trellis)\n"); +  Result.BB = TrellisSucc; +  return Result; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. +bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( +    const MachineBasicBlock *BB, MachineBasicBlock *Succ, +    const BlockChain &Chain, const BlockFilterSet *BlockFilter) { +  if (!shouldTailDuplicate(Succ))      return false; + +  // For CFG checking. 
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), +                                                       BB->succ_end()); +  for (MachineBasicBlock *Pred : Succ->predecessors()) { +    // Make sure all unplaced and unfiltered predecessors can be +    // tail-duplicated into. +    // Skip any blocks that are already placed or not in this loop. +    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) +        || BlockToChain[Pred] == &Chain) +      continue; +    if (!TailDup.canTailDuplicate(Succ, Pred)) { +      if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) +        // This will result in a trellis after tail duplication, so we don't +        // need to copy Succ into this predecessor. In the presence +        // of a trellis tail duplication can continue to be profitable. +        // For example: +        // A            A +        // |\           |\ +        // | \          | \ +        // |  C         |  C+BB +        // | /          |  | +        // |/           |  | +        // BB    =>     BB | +        // |\           |\/| +        // | \          |/\| +        // |  D         |  D +        // | /          | / +        // |/           |/ +        // Succ         Succ +        // +        // After BB was duplicated into C, the layout looks like the one on the +        // right. BB and C now have the same successors. When considering +        // whether Succ can be duplicated into all its unplaced predecessors, we +        // ignore C. +        // We can do this because C already has a profitable fallthrough, namely +        // D. TODO(iteratee): ignore sufficiently cold predecessors for +        // duplication and for this test. +        // +        // This allows trellises to be laid out in 2 separate chains +        // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic +        // because it allows the creation of 2 fallthrough paths with links +        // between them, and we correctly identify the best layout for these +        // CFGs. We want to extend trellises that the user created in addition +        // to trellises created by tail-duplication, so we just look for the +        // CFG. +        continue; +      return false; +    } +  } +  return true; +} + +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +///    longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +///    U-shaped distribution. +///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +///    If the branches in a chain are likely to be from the same side of the +///    distribution as their predecessor, but are independent at runtime, this +///    transformation is profitable. (Because the cost of being wrong is a small +///    fixed cost, unlike the standard triangle layout where the cost of being +///    wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. If the probability that a previous +///    branch was taken positively influences whether the next branch will be +///    taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. 
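At the source level, the chains this precomputation targets look like consecutive else-less ifs whose join blocks post-dominate their branches. A hypothetical example (the function and callee names are invented for illustration):

  void sideA(); void sideB(); void sideC();

  // Each guarded call is one triangle: the branch block, the side block with
  // the call, and the join block that post-dominates the branch. The joins
  // fall straight into the next branch, so the three triangles form the kind
  // of chain that precomputeTriangleChains records in ComputedEdges.
  void chainOfTriangles(bool a, bool b, bool c) {
    if (a) sideA();   // triangle 1
    if (b) sideB();   // triangle 2
    if (c) sideC();   // triangle 3
  }

Duplicating each join into the side block that precedes it removes the unconditional jumps back to the joins, which is the benefit the chain-level analysis is arguing for.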
+void MachineBlockPlacement::precomputeTriangleChains() { +  struct TriangleChain { +    std::vector<MachineBasicBlock *> Edges; +    TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst) +        : Edges({src, dst}) {} + +    void append(MachineBasicBlock *dst) { +      assert(getKey()->isSuccessor(dst) && +             "Attempting to append a block that is not a successor."); +      Edges.push_back(dst); +    } + +    unsigned count() const { return Edges.size() - 1; } + +    MachineBasicBlock *getKey() const { +      return Edges.back(); +    } +  }; + +  if (TriangleChainCount == 0) +    return; + +  DEBUG(dbgs() << "Pre-computing triangle chains.\n"); +  // Map from last block to the chain that contains it. This allows us to extend +  // chains as we find new triangles. +  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap; +  for (MachineBasicBlock &BB : *F) { +    // If BB doesn't have 2 successors, it doesn't start a triangle. +    if (BB.succ_size() != 2) +      continue; +    MachineBasicBlock *PDom = nullptr; +    for (MachineBasicBlock *Succ : BB.successors()) { +      if (!MPDT->dominates(Succ, &BB)) +        continue; +      PDom = Succ; +      break; +    } +    // If BB doesn't have a post-dominating successor, it doesn't form a +    // triangle. +    if (PDom == nullptr) +      continue; +    // If PDom has a hint that it is low probability, skip this triangle. +    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) +      continue; +    // If PDom isn't eligible for duplication, this isn't the kind of triangle +    // we're looking for. +    if (!shouldTailDuplicate(PDom)) +      continue; +    bool CanTailDuplicate = true; +    // If PDom can't tail-duplicate into it's non-BB predecessors, then this +    // isn't the kind of triangle we're looking for. +    for (MachineBasicBlock* Pred : PDom->predecessors()) { +      if (Pred == &BB) +        continue; +      if (!TailDup.canTailDuplicate(PDom, Pred)) { +        CanTailDuplicate = false; +        break; +      } +    } +    // If we can't tail-duplicate PDom to its predecessors, then skip this +    // triangle. +    if (!CanTailDuplicate) +      continue; + +    // Now we have an interesting triangle. Insert it if it's not part of an +    // existing chain +    // Note: This cannot be replaced with a call insert() or emplace() because +    // the find key is BB, but the insert/emplace key is PDom. +    auto Found = TriangleChainMap.find(&BB); +    // If it is, remove the chain from the map, grow it, and put it back in the +    // map with the end as the new key. +    if (Found != TriangleChainMap.end()) { +      TriangleChain Chain = std::move(Found->second); +      TriangleChainMap.erase(Found); +      Chain.append(PDom); +      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); +    } else { +      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); +      assert(InsertResult.second && "Block seen twice."); +      (void)InsertResult; +    } +  } + +  // Iterating over a DenseMap is safe here, because the only thing in the body +  // of the loop is inserting into another DenseMap (ComputedEdges). +  // ComputedEdges is never iterated, so this doesn't lead to non-determinism. +  for (auto &ChainPair : TriangleChainMap) { +    TriangleChain &Chain = ChainPair.second; +    // Benchmarking has shown that due to branch correlation duplicating 2 or +    // more triangles is profitable, despite the calculations assuming +    // independence. 
+    if (Chain.count() < TriangleChainCount) +      continue; +    MachineBasicBlock *dst = Chain.Edges.back(); +    Chain.Edges.pop_back(); +    for (MachineBasicBlock *src : reverse(Chain.Edges)) { +      DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << +            getBlockName(dst) << " as pre-computed based on triangles.\n"); + +      auto InsertResult = ComputedEdges.insert({src, {dst, true}}); +      assert(InsertResult.second && "Block seen twice."); +      (void)InsertResult; + +      dst = src; +    } +  }  }  // When profile is not present, return the StaticLikelyProb.  // When profile is available, we need to handle the triangle-shape CFG.  static BranchProbability getLayoutSuccessorProbThreshold( -      MachineBasicBlock *BB) { +      const MachineBasicBlock *BB) {    if (!BB->getParent()->getFunction()->getEntryCount())      return BranchProbability(StaticLikelyProb, 100);    if (BB->succ_size() == 2) { @@ -609,11 +1202,11 @@ static BranchProbability getLayoutSuccessorProbThreshold(      if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {        /* See case 1 below for the cost analysis. For BB->Succ to         * be taken with smaller cost, the following needs to hold: -       *   Prob(BB->Succ) > 2* Prob(BB->Pred) -       *   So the threshold T -       *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, -       * We have  T + T/2 = 1, i.e. T = 2/3. Also adding user specified -       * branch bias, we have +       *   Prob(BB->Succ) > 2 * Prob(BB->Pred) +       *   So the threshold T in the calculation below +       *   (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) +       *   So T / (1 - T) = 2, Yielding T = 2/3 +       * Also adding user specified branch bias, we have         *   T = (2/3)*(ProfileLikelyProb/50)         *     = (2*ProfileLikelyProb)/150)         */ @@ -625,10 +1218,17 @@ static BranchProbability getLayoutSuccessorProbThreshold(  /// Checks to see if the layout candidate block \p Succ has a better layout  /// predecessor than \c BB. If yes, returns true. +/// \p SuccProb: The probability adjusted for only remaining blocks. +///   Only used for logging +/// \p RealSuccProb: The un-adjusted probability. +/// \p Chain: The chain that BB belongs to and Succ is being considered for. +/// \p BlockFilter: if non-null, the set of blocks that make up the loop being +///    considered  bool MachineBlockPlacement::hasBetterLayoutPredecessor( -    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, -    BranchProbability SuccProb, BranchProbability RealSuccProb, -    BlockChain &Chain, const BlockFilterSet *BlockFilter) { +    const MachineBasicBlock *BB, const MachineBasicBlock *Succ, +    const BlockChain &SuccChain, BranchProbability SuccProb, +    BranchProbability RealSuccProb, const BlockChain &Chain, +    const BlockFilterSet *BlockFilter) {    // There isn't a better layout when there are no unscheduled predecessors.    if (SuccChain.UnscheduledPredecessors == 0) @@ -734,11 +1334,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(    //  |  Pred----|                     |  S1----    //  |  |                             |       |    //  --(S1 or S2)                     ---Pred-- +  //                                        | +  //                                       S2    //    // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)    //    + min(freq(Pred->S1), freq(Pred->S2))    // Non-topo-order cost: -  // In the worst case, S2 will not get laid out after Pred.    
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).    // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))    // is 0. Then the non topo layout is better when @@ -756,13 +1357,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(    for (MachineBasicBlock *Pred : Succ->predecessors()) {      if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||          (BlockFilter && !BlockFilter->count(Pred)) || -        BlockToChain[Pred] == &Chain) +        BlockToChain[Pred] == &Chain || +        // This check is redundant except for look ahead. This function is +        // called for lookahead by isProfitableToTailDup when BB hasn't been +        // placed yet. +        (Pred == BB))        continue;      // Do backward checking.      // For all cases above, we need a backward checking to filter out edges that -    // are not 'strongly' biased. With profile data available, the check is -    // mostly redundant for case 2 (when threshold prob is set at 50%) unless S -    // has more than two successors. +    // are not 'strongly' biased.      // BB  Pred      //  \ /      //  Succ @@ -798,14 +1401,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(  /// breaking CFG structure, but cave and break such structures in the case of  /// very hot successor edges.  /// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * -MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, -                                           BlockChain &Chain, -                                           const BlockFilterSet *BlockFilter) { +/// \returns The best successor block found, or null if none are viable, along +/// with a boolean indicating if tail duplication is necessary. +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::selectBestSuccessor( +    const MachineBasicBlock *BB, const BlockChain &Chain, +    const BlockFilterSet *BlockFilter) {    const BranchProbability HotProb(StaticLikelyProb, 100); -  MachineBasicBlock *BestSucc = nullptr; +  BlockAndTailDupResult BestSucc = { nullptr, false };    auto BestProb = BranchProbability::getZero();    SmallVector<MachineBasicBlock *, 4> Successors; @@ -813,22 +1417,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,        collectViableSuccessors(BB, Chain, BlockFilter, Successors);    DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); + +  // if we already precomputed the best successor for BB, return that if still +  // applicable. +  auto FoundEdge = ComputedEdges.find(BB); +  if (FoundEdge != ComputedEdges.end()) { +    MachineBasicBlock *Succ = FoundEdge->second.BB; +    ComputedEdges.erase(FoundEdge); +    BlockChain *SuccChain = BlockToChain[Succ]; +    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && +        SuccChain != &Chain && Succ == *SuccChain->begin()) +      return FoundEdge->second; +  } + +  // if BB is part of a trellis, Use the trellis to determine the optimal +  // fallthrough edges +  if (isTrellis(BB, Successors, Chain, BlockFilter)) +    return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain, +                                   BlockFilter); + +  // For blocks with CFG violations, we may be able to lay them out anyway with +  // tail-duplication. We keep this vector so we can perform the probability +  // calculations the minimum number of times. 
+  SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> +      DupCandidates;    for (MachineBasicBlock *Succ : Successors) {      auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);      BranchProbability SuccProb =          getAdjustedProbability(RealSuccProb, AdjustedSumProb); -    // This heuristic is off by default. -    if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, -                                  HotProb)) -      return Succ; -      BlockChain &SuccChain = *BlockToChain[Succ];      // Skip the edge \c BB->Succ if block \c Succ has a better layout      // predecessor that yields lower global cost.      if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, -                                   Chain, BlockFilter)) +                                   Chain, BlockFilter)) { +      // If tail duplication would make Succ profitable, place it. +      if (TailDupPlacement && shouldTailDuplicate(Succ)) +        DupCandidates.push_back(std::make_tuple(SuccProb, Succ));        continue; +    }      DEBUG(          dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: " @@ -836,17 +1463,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,                 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")                 << "\n"); -    if (BestSucc && BestProb >= SuccProb) { +    if (BestSucc.BB && BestProb >= SuccProb) {        DEBUG(dbgs() << "    Not the best candidate, continuing\n");        continue;      }      DEBUG(dbgs() << "    Setting it as best candidate\n"); -    BestSucc = Succ; +    BestSucc.BB = Succ;      BestProb = SuccProb;    } -  if (BestSucc) -    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc) << "\n"); +  // Handle the tail duplication candidates in order of decreasing probability. +  // Stop at the first one that is profitable. Also stop if they are less +  // profitable than BestSucc. Position is important because we preserve it and +  // prefer first best match. Here we aren't comparing in order, so we capture +  // the position instead. +  if (DupCandidates.size() != 0) { +    auto cmp = +        [](const std::tuple<BranchProbability, MachineBasicBlock *> &a, +           const std::tuple<BranchProbability, MachineBasicBlock *> &b) { +          return std::get<0>(a) > std::get<0>(b); +        }; +    std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp); +  } +  for(auto &Tup : DupCandidates) { +    BranchProbability DupProb; +    MachineBasicBlock *Succ; +    std::tie(DupProb, Succ) = Tup; +    if (DupProb < BestProb) +      break; +    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) +        && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { +      DEBUG( +          dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: " +                 << DupProb +                 << " (Tail Duplicate)\n"); +      BestSucc.BB = Succ; +      BestSucc.ShouldTailDup = true; +      break; +    } +  } + +  if (BestSucc.BB) +    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc.BB) << "\n");    return BestSucc;  } @@ -862,7 +1520,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,  ///  /// \returns The best block found, or null if none are viable.  
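Working through the threshold comment in getLayoutSuccessorProbThreshold above with a concrete value: taking ProfileLikelyProb = 51 as an example setting, T = (2 * 51) / 150 = 102/150 = 68%, so in the triangle case a profiled successor edge must carry roughly 68% of the probability mass before it is preferred over keeping the topological order; with no extra branch bias (ProfileLikelyProb = 50) the threshold is exactly 2/3.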
MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( -    BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { +    const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {    // Once we need to walk the worklist looking for a candidate, cleanup the    // worklist of already placed entries.    // FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -948,7 +1606,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(  }  void MachineBlockPlacement::fillWorkLists( -    MachineBasicBlock *MBB, +    const MachineBasicBlock *MBB,      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,      const BlockFilterSet *BlockFilter = nullptr) {    BlockChain &Chain = *BlockToChain[MBB]; @@ -970,23 +1628,23 @@ void MachineBlockPlacement::fillWorkLists(    if (Chain.UnscheduledPredecessors != 0)      return; -  MBB = *Chain.begin(); -  if (MBB->isEHPad()) -    EHPadWorkList.push_back(MBB); +  MachineBasicBlock *BB = *Chain.begin(); +  if (BB->isEHPad()) +    EHPadWorkList.push_back(BB);    else -    BlockWorkList.push_back(MBB); +    BlockWorkList.push_back(BB);  }  void MachineBlockPlacement::buildChain( -    MachineBasicBlock *BB, BlockChain &Chain, +    const MachineBasicBlock *HeadBB, BlockChain &Chain,      BlockFilterSet *BlockFilter) { -  assert(BB && "BB must not be null.\n"); -  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); +  assert(HeadBB && "BB must not be null.\n"); +  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");    MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); -  MachineBasicBlock *LoopHeaderBB = BB; +  const MachineBasicBlock *LoopHeaderBB = HeadBB;    markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); -  BB = *std::prev(Chain.end()); +  MachineBasicBlock *BB = *std::prev(Chain.end());    for (;;) {      assert(BB && "null block found at end of chain in loop.");      assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); @@ -995,7 +1653,11 @@ void MachineBlockPlacement::buildChain(      // Look for the best viable successor if there is one to place immediately      // after this block. -    MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); +    auto Result = selectBestSuccessor(BB, Chain, BlockFilter); +    MachineBasicBlock* BestSucc = Result.BB; +    bool ShouldTailDup = Result.ShouldTailDup; +    if (TailDupPlacement) +      ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));      // If an immediate successor isn't available, look for the best viable      // block among those we've identified as not violating the loop's CFG at @@ -1016,7 +1678,7 @@ void MachineBlockPlacement::buildChain(      // Placement may have changed tail duplication opportunities.      // Check for that now. -    if (TailDupPlacement && BestSucc) { +    if (TailDupPlacement && BestSucc && ShouldTailDup) {        // If the chosen successor was duplicated into all its predecessors,        // don't bother laying it out, just go round the loop again with BB as        // the chain end. @@ -1052,7 +1714,7 @@ void MachineBlockPlacement::buildChain(  /// unconditional jump (for the backedge) rotating it in front of the loop  /// header is always profitable.  
MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineLoop &L, +MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,                                         const BlockFilterSet &LoopBlockSet) {    // Placing the latch block before the header may introduce an extra branch    // that skips this block the first time the loop is executed, which we want @@ -1116,7 +1778,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,  /// block to layout at the top of the loop. Typically this is done to maximize  /// fallthrough opportunities.  MachineBasicBlock * -MachineBlockPlacement::findBestLoopExit(MachineLoop &L, +MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,                                          const BlockFilterSet &LoopBlockSet) {    // We don't want to layout the loop linearly in all cases. If the loop header    // is just a normal basic block in the loop, we want to look for what block @@ -1235,7 +1897,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L,  /// branches. For example, if the loop has fallthrough into its header and out  /// of its bottom already, don't rotate it.  void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, -                                       MachineBasicBlock *ExitingBB, +                                       const MachineBasicBlock *ExitingBB,                                         const BlockFilterSet &LoopBlockSet) {    if (!ExitingBB)      return; @@ -1285,7 +1947,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,  ///  Therefore, the cost for a given rotation is the sum of costs listed above.  ///  We select the best rotation with the smallest cost.  void MachineBlockPlacement::rotateLoopWithProfile( -    BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { +    BlockChain &LoopChain, const MachineLoop &L, +    const BlockFilterSet &LoopBlockSet) {    auto HeaderBB = L.getHeader();    auto HeaderIter = find(LoopChain, HeaderBB);    auto RotationPos = LoopChain.end(); @@ -1422,7 +2085,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(  /// When profile data is available, exclude cold blocks from the returned set;  /// otherwise, collect all blocks in the loop.  MachineBlockPlacement::BlockFilterSet -MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { +MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {    BlockFilterSet LoopBlockSet;    // Filter cold blocks off from LoopBlockSet when profile data is available. @@ -1459,10 +2122,10 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {  /// as much as possible. We can then stitch the chains together in a way which  /// both preserves the topological structure and minimizes taken conditional  /// branches. -void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { +void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {    // First recurse through any nested loops, building chains for those inner    // loops. 
-  for (MachineLoop *InnerLoop : L) +  for (const MachineLoop *InnerLoop : L)      buildLoopChains(*InnerLoop);    assert(BlockWorkList.empty()); @@ -1499,7 +2162,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {    assert(LoopChain.UnscheduledPredecessors == 0);    UpdatedPreds.insert(&LoopChain); -  for (MachineBasicBlock *LoopBB : LoopBlockSet) +  for (const MachineBasicBlock *LoopBB : LoopBlockSet)      fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);    buildChain(LoopTop, LoopChain, &LoopBlockSet); @@ -1533,7 +2196,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {      if (!LoopBlockSet.empty()) {        BadLoop = true; -      for (MachineBasicBlock *LoopBB : LoopBlockSet) +      for (const MachineBasicBlock *LoopBB : LoopBlockSet)          dbgs() << "Loop contains blocks never placed into a chain!\n"                 << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"                 << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n" @@ -1546,31 +2209,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {    EHPadWorkList.clear();  } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { -  if (OutlineOptionalBranches) { -    // Find the nearest common dominator of all of F's terminators. -    MachineBasicBlock *Terminator = nullptr; -    for (MachineBasicBlock &MBB : *F) { -      if (MBB.succ_size() == 0) { -        if (Terminator == nullptr) -          Terminator = &MBB; -        else -          Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); -      } -    } - -    // MBBs dominating this common dominator are unavoidable. -    UnavoidableBlocks.clear(); -    for (MachineBasicBlock &MBB : *F) { -      if (MDT->dominates(&MBB, Terminator)) { -        UnavoidableBlocks.insert(&MBB); -      } -    } -  } -} -  void MachineBlockPlacement::buildCFGChains() {    // Ensure that every BB in the function has an associated chain to simplify    // the assumptions of the remaining algorithm. @@ -1605,9 +2243,6 @@ void MachineBlockPlacement::buildCFGChains() {      }    } -  // Turned on with OutlineOptionalBranches option -  collectMustExecuteBBs(); -    // Build any loop-based chains.    PreferredLoopExit = nullptr;    for (MachineLoop *L : *MLI) @@ -1839,7 +2474,7 @@ void MachineBlockPlacement::alignBlocks() {  /// @return true if \p BB was removed.  bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(      MachineBasicBlock *BB, MachineBasicBlock *&LPred, -    MachineBasicBlock *LoopHeaderBB, +    const MachineBasicBlock *LoopHeaderBB,      BlockChain &Chain, BlockFilterSet *BlockFilter,      MachineFunction::iterator &PrevUnplacedBlockIt) {    bool Removed, DuplicatedToLPred; @@ -1901,21 +2536,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(  /// \return  - True if the block was duplicated into all preds and removed.  
bool MachineBlockPlacement::maybeTailDuplicateBlock(      MachineBasicBlock *BB, MachineBasicBlock *LPred, -    const BlockChain &Chain, BlockFilterSet *BlockFilter, +    BlockChain &Chain, BlockFilterSet *BlockFilter,      MachineFunction::iterator &PrevUnplacedBlockIt,      bool &DuplicatedToLPred) { -    DuplicatedToLPred = false; +  if (!shouldTailDuplicate(BB)) +    return false; +    DEBUG(dbgs() << "Redoing tail duplication for Succ#"          << BB->getNumber() << "\n"); -  bool IsSimple = TailDup.isSimpleBB(BB); -  // Blocks with single successors don't create additional fallthrough -  // opportunities. Don't duplicate them. TODO: When conditional exits are -  // analyzable, allow them to be duplicated. -  if (!IsSimple && BB->succ_size() == 1) -    return false; -  if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) -    return false; +    // This has to be a callback because none of it can be done after    // BB is deleted.    bool Removed = false; @@ -1967,6 +2597,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(        llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);    SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; +  bool IsSimple = TailDup.isSimpleBB(BB);    TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,                                   &DuplicatedPreds, &RemovalCallbackRef); @@ -2006,21 +2637,24 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {    MLI = &getAnalysis<MachineLoopInfo>();    TII = MF.getSubtarget().getInstrInfo();    TLI = MF.getSubtarget().getTargetLowering(); -  MDT = &getAnalysis<MachineDominatorTree>(); +  MPDT = nullptr;    // Initialize PreferredLoopExit to nullptr here since it may never be set if    // there are no MachineLoops.    PreferredLoopExit = nullptr; +  assert(BlockToChain.empty()); +  assert(ComputedEdges.empty()); +    if (TailDupPlacement) { -    unsigned TailDupSize = TailDuplicatePlacementThreshold; +    MPDT = &getAnalysis<MachinePostDominatorTree>(); +    unsigned TailDupSize = TailDupPlacementThreshold;      if (MF.getFunction()->optForSize())        TailDupSize = 1;      TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); +    precomputeTriangleChains();    } -  assert(BlockToChain.empty()); -    buildCFGChains();    // Changing the layout can create new tail merging opportunities. @@ -2032,7 +2666,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {                           BranchFoldPlacement;    // No tail merging opportunities if the block number is less than four.    if (MF.size() > 3 && EnableTailMerge) { -    unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1; +    unsigned TailMergeSize = TailDupPlacementThreshold + 1;      BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,                      *MBPI, TailMergeSize); @@ -2041,8 +2675,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {                              /*AfterBlockPlacement=*/true)) {        // Redo the layout if tail merging creates/removes/moves blocks.        BlockToChain.clear(); -      // Must redo the dominator tree if blocks were changed. -      MDT->runOnMachineFunction(MF); +      ComputedEdges.clear(); +      // Must redo the post-dominator tree if blocks were changed. 
+      if (MPDT) +        MPDT->runOnMachineFunction(MF);        ChainAllocator.DestroyAll();        buildCFGChains();      } @@ -2052,6 +2688,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {    alignBlocks();    BlockToChain.clear(); +  ComputedEdges.clear();    ChainAllocator.DestroyAll();    if (AlignAllBlock) @@ -2067,6 +2704,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {          MBI->setAlignment(AlignAllNonFallThruBlocks);      }    } +  if (ViewBlockLayoutWithBFI != GVDT_None && +      (ViewBlockFreqFuncName.empty() || +       F->getFunction()->getName().equals(ViewBlockFreqFuncName))) { +    MBFI->view("MBP." + MF.getName(), false); +  } +    // We always return true as we have no way to track whether the final order    // differs from the original order. diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 5beed5f5dd08..50e453e4067c 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -8,7 +8,7 @@  //===----------------------------------------------------------------------===//  //  // The machine combiner pass uses machine trace metrics to ensure the combined -// instructions does not lengthen the critical path or the resource depth. +// instructions do not lengthen the critical path or the resource depth.  //===----------------------------------------------------------------------===//  #define DEBUG_TYPE "machine-combiner" @@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,    // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth    for (auto *InstrPtr : InsInstrs) { // for each Use      unsigned IDepth = 0; -    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";); +    DEBUG(dbgs() << "NEW INSTR "; +          InstrPtr->print(dbgs(), TII); +          dbgs() << "\n";);      for (const MachineOperand &MO : InstrPtr->operands()) {        // Check for virtual register operand.        if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) @@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {    return false;  } +static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, +                                     SmallVector<MachineInstr *, 16> InsInstrs, +                                     SmallVector<MachineInstr *, 16> DelInstrs, +                                     MachineTraceMetrics *Traces) { +  for (auto *InstrPtr : InsInstrs) +    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); +  for (auto *InstrPtr : DelInstrs) +    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); +  ++NumInstCombined; +  Traces->invalidate(MBB); +  Traces->verifyAnalysis(); +} +  /// Substitute a slow code sequence with a faster one by  /// evaluating instruction combining pattern.  /// The prototype of such a pattern is MUl + ADD -> MADD. 
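The MUL + ADD -> MADD prototype mentioned above, seen from the source side; this is an illustrative example, not taken from the patch:

  // A multiply that feeds only the following add. On targets with a fused
  // multiply-add instruction (e.g. AArch64's MADD), the machine combiner can
  // rewrite the MUL + ADD pair into a single MADD, provided the checks below
  // confirm the new sequence does not lengthen the critical path or increase
  // resource pressure.
  long maddExample(long a, long b, long c) {
    return a * b + c;
  }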
Performs instruction @@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;        if (!MinInstr)          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); -      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);        Traces->verifyAnalysis();        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,                                        InstrIdxForVirtReg); @@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {        // fewer instructions OR        // the new sequence neither lengthens the critical path nor increases        // resource pressure. -      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || -          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, -                                   DelInstrs, InstrIdxForVirtReg, P) && -           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { -        for (auto *InstrPtr : InsInstrs) -          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); -        for (auto *InstrPtr : DelInstrs) -          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - -        Changed = true; -        ++NumInstCombined; - -        Traces->invalidate(MBB); -        Traces->verifyAnalysis(); +      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { +        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);          // Eagerly stop after the first pattern fires. +        Changed = true;          break;        } else { +        // Calculating the trace metrics may be expensive, +        // so only do this when necessary. +        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); +        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, +                                    InstrIdxForVirtReg, P) && +            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { +          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); +          // Eagerly stop after the first pattern fires. +          Changed = true; +          break; +        }          // Cleanup instructions of the alternative code sequence. There is no          // use for them.          MachineFunction *MF = MBB->getParent(); diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 5de6dec29fb9..7312dc5e94bd 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {        if (MO.isDef()) {          Defs.push_back(Reg); -      } else { +        continue; +      } else if (MO.readsReg())          ReadRegister(Reg); -      } -      // Treat undef use like defs for copy propagation but not for -      // dead copy. We would need to do a liveness check to be sure the copy -      // is dead for undef uses. -      // The backends are allowed to do whatever they want with undef value -      // and we cannot be sure this register will not be rewritten to break -      // some false dependencies for the hardware for instance. 
-      if (MO.isUndef()) -        Defs.push_back(Reg);      }      // The instruction has a register mask operand which means that it clobbers diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 303a6a9263be..e3a6c51c47ad 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {  bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {    CriticalEdgesToSplit.clear();    NewBBs.clear(); +  DT.reset(new DominatorTreeBase<MachineBasicBlock>(false));    DT->recalculate(F); -    return false;  }  MachineDominatorTree::MachineDominatorTree()      : MachineFunctionPass(ID) {    initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); -  DT = new DominatorTreeBase<MachineBasicBlock>(false); -} - -MachineDominatorTree::~MachineDominatorTree() { -  delete DT;  }  void MachineDominatorTree::releaseMemory() { -  DT->releaseMemory(); +  CriticalEdgesToSplit.clear(); +  DT.reset(nullptr);  }  void MachineDominatorTree::verifyAnalysis() const { -  if (VerifyMachineDomInfo) +  if (DT && VerifyMachineDomInfo)      verifyDomTree();  }  void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { -  DT->print(OS); +  if (DT) +    DT->print(OS);  }  void MachineDominatorTree::applySplitCriticalEdges() const { @@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const {  }  void MachineDominatorTree::verifyDomTree() const { +  if (!DT) +    return;    MachineFunction &F = *getRoot()->getParent(); -  MachineDominatorTree OtherDT; -  OtherDT.DT->recalculate(F); -  if (compare(OtherDT)) { +  DominatorTreeBase<MachineBasicBlock> OtherDT(false); +  OtherDT.recalculate(F); +  if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || +      DT->compare(OtherDT)) {      errs() << "MachineDominatorTree is not up to date!\nComputed:\n"; -    print(errs(), nullptr); +    DT->print(errs());      errs() << "\nActual:\n"; -    OtherDT.print(errs(), nullptr); +    OtherDT.print(errs());      abort();    }  } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index c1d5ea96cd17..c9767a25e908 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -169,6 +169,7 @@ void MachineFunction::clear() {    InstructionRecycler.clear(Allocator);    OperandRecycler.clear(Allocator);    BasicBlockRecycler.clear(Allocator); +  VariableDbgInfos.clear();    if (RegInfo) {      RegInfo->~MachineRegisterInfo();      Allocator.Deallocate(RegInfo); @@ -859,7 +860,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {    if (!isCalleeSavedInfoValid())      return BV; -  for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) +  const MachineRegisterInfo &MRI = MF.getRegInfo(); +  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; +       ++CSR)      BV.set(*CSR);    // Saved CSRs are not pristine. 
@@ -956,7 +959,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MachineFrameInfo::dump(const MachineFunction &MF) const { +LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const {    print(MF, dbgs());  }  #endif diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 2f2e3b3d8e9f..c0a8b95ed8a0 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -262,8 +262,21 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {      return getBlockAddress() == Other.getBlockAddress() &&             getOffset() == Other.getOffset();    case MachineOperand::MO_RegisterMask: -  case MachineOperand::MO_RegisterLiveOut: -    return getRegMask() == Other.getRegMask(); +  case MachineOperand::MO_RegisterLiveOut: { +    // Shallow compare of the two RegMasks +    const uint32_t *RegMask = getRegMask(); +    const uint32_t *OtherRegMask = Other.getRegMask(); +    if (RegMask == OtherRegMask) +      return true; + +    // Calculate the size of the RegMask +    const MachineFunction *MF = getParent()->getParent()->getParent(); +    const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); +    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + +    // Deep compare of the two RegMasks +    return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask); +  }    case MachineOperand::MO_MCSymbol:      return getMCSymbol() == Other.getMCSymbol();    case MachineOperand::MO_CFIIndex: @@ -274,6 +287,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {      return getIntrinsicID() == Other.getIntrinsicID();    case MachineOperand::MO_Predicate:      return getPredicate() == Other.getPredicate(); +  case MachineOperand::MO_Placeholder: +    return true;    }    llvm_unreachable("Invalid machine operand type");  } @@ -322,6 +337,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {      return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());    case MachineOperand::MO_Predicate:      return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate()); +  case MachineOperand::MO_Placeholder: +    return hash_combine();    }    llvm_unreachable("Invalid machine operand type");  } @@ -403,6 +420,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,        bool Unused;        APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);        OS << "half " << APF.convertToFloat(); +    } else if (getFPImm()->getType()->isFP128Ty()) { +      APFloat APF = getFPImm()->getValueAPF(); +      SmallString<16> Str; +      getFPImm()->getValueAPF().toString(Str); +      OS << "quad " << Str;      } else {        OS << getFPImm()->getValueAPF().convertToDouble();      } @@ -491,7 +513,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,      auto Pred = static_cast<CmpInst::Predicate>(getPredicate());      OS << '<' << (CmpInst::isIntPredicate(Pred) ? 
"intpred" : "floatpred")         << CmpInst::getPredicateName(Pred) << '>'; +    break;    } +  case MachineOperand::MO_Placeholder: +    OS << "<placeholder>"; +    break;    }    if (unsigned TF = getTargetFlags())      OS << "[TF=" << TF << ']'; @@ -1571,6 +1597,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {    return true;  } +bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, +                            bool UseTBAA) { +  const MachineFunction *MF = getParent()->getParent(); +  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + +  // If neither instruction stores to memory, they can't alias in any +  // meaningful way, even if they read from the same address. +  if (!mayStore() && !Other.mayStore()) +    return false; + +  // Let the target decide if memory accesses cannot possibly overlap. +  if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA)) +    return false; + +  if (!AA) +    return true; + +  // FIXME: Need to handle multiple memory operands to support all targets. +  if (!hasOneMemOperand() || !Other.hasOneMemOperand()) +    return true; + +  MachineMemOperand *MMOa = *memoperands_begin(); +  MachineMemOperand *MMOb = *Other.memoperands_begin(); + +  if (!MMOa->getValue() || !MMOb->getValue()) +    return true; + +  // The following interface to AA is fashioned after DAGCombiner::isAlias +  // and operates with MachineMemOperand offset with some important +  // assumptions: +  //   - LLVM fundamentally assumes flat address spaces. +  //   - MachineOperand offset can *only* result from legalization and +  //     cannot affect queries other than the trivial case of overlap +  //     checking. +  //   - These offsets never wrap and never step outside +  //     of allocated objects. +  //   - There should never be any negative offsets here. +  // +  // FIXME: Modify API to hide this math from "user" +  // FIXME: Even before we go to AA we can reason locally about some +  // memory objects. It can save compile time, and possibly catch some +  // corner cases not currently covered. + +  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); +  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + +  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); +  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; +  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; + +  AliasResult AAResult = +      AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, +                               UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), +                MemoryLocation(MMOb->getValue(), Overlapb, +                               UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); + +  return (AAResult != NoAlias); +} +  /// hasOrderedMemoryRef - Return true if this instruction may have an ordered  /// or volatile memory reference, or if the information describing the memory  /// reference is not available. 
Return false if it is known to have no ordered @@ -1692,14 +1777,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF,    }  } -LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const {  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineInstr::dump() const {    dbgs() << "  "; -  print(dbgs(), false /* SkipOpers */, TII); -#endif +  print(dbgs());  } +#endif -void MachineInstr::print(raw_ostream &OS, bool SkipOpers, +void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,                           const TargetInstrInfo *TII) const {    const Module *M = nullptr;    if (const MachineBasicBlock *MBB = getParent()) @@ -1707,11 +1792,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers,        M = MF->getFunction()->getParent();    ModuleSlotTracker MST(M); -  print(OS, MST, SkipOpers, TII); +  print(OS, MST, SkipOpers, SkipDebugLoc, TII);  }  void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, -                         bool SkipOpers, const TargetInstrInfo *TII) const { +                         bool SkipOpers, bool SkipDebugLoc, +                         const TargetInstrInfo *TII) const {    // We can be a bit tidier if we know the MachineFunction.    const MachineFunction *MF = nullptr;    const TargetRegisterInfo *TRI = nullptr; @@ -1987,6 +2073,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,      }      if (isIndirectDebugValue())        OS << " indirect"; +  } else if (SkipDebugLoc) { +    return;    } else if (debugLoc && MF) {      if (!HaveSemi)        OS << ";"; diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp index fdeaf7b71161..a9aa1d954e70 100644 --- a/lib/CodeGen/MachineLoopInfo.cpp +++ b/lib/CodeGen/MachineLoopInfo.cpp @@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() {    return nullptr;  } +DebugLoc MachineLoop::getStartLoc() const { +  // Try the pre-header first. +  if (MachineBasicBlock *PHeadMBB = getLoopPreheader()) +    if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock()) +      if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc()) +        return DL; + +  // If we have no pre-header or there are no instructions with debug +  // info in it, try the header. +  if (MachineBasicBlock *HeadMBB = getHeader()) +    if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock()) +      return HeadBB->getTerminator()->getDebugLoc(); + +  return DebugLoc(); +} +  MachineBasicBlock *  MachineLoopInfo::findLoopPreheader(MachineLoop *L,                                     bool SpeculativePreheader) const { diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 6618857477ed..2f0f4297ef5c 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -306,6 +306,10 @@ public:      MMI.deleteMachineFunctionFor(F);      return true;    } +   +  StringRef getPassName() const override { +    return "Free MachineFunction"; +  }   };  char FreeMachineFunction::ID;  } // end anonymous namespace diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp index 22d519e5d88f..4c81fd91cb82 100644 --- a/lib/CodeGen/MachineModuleInfoImpls.cpp +++ b/lib/CodeGen/MachineModuleInfoImpls.cpp @@ -23,6 +23,7 @@ using namespace llvm;  // Out of line virtual method.  
void MachineModuleInfoMachO::anchor() {}  void MachineModuleInfoELF::anchor() {} +void MachineModuleInfoWasm::anchor() {}  static int SortSymbolPair(const void *LHS, const void *RHS) {    typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy; diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp new file mode 100644 index 000000000000..6b6b5f2814a9 --- /dev/null +++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -0,0 +1,100 @@ +///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===// +/// +///                     The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// Optimization diagnostic interfaces for machine passes.  It's packaged as an +/// analysis pass so that by using this service passes become dependent on MBFI +/// as well.  MBFI is used to compute the "hotness" of the diagnostic message. +/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( +    StringRef MKey, const MachineInstr &MI) +    : Argument() { +  Key = MKey; + +  raw_string_ostream OS(Val); +  MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true); +} + +Optional<uint64_t> +MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) { +  if (!MBFI) +    return None; + +  return MBFI->getBlockProfileCount(&MBB); +} + +void MachineOptimizationRemarkEmitter::computeHotness( +    DiagnosticInfoMIROptimization &Remark) { +  const MachineBasicBlock *MBB = Remark.getBlock(); +  if (MBB) +    Remark.setHotness(computeHotness(*MBB)); +} + +void MachineOptimizationRemarkEmitter::emit( +    DiagnosticInfoOptimizationBase &OptDiagCommon) { +  auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon); +  computeHotness(OptDiag); + +  LLVMContext &Ctx = MF.getFunction()->getContext(); +  yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); +  if (Out) { +    auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); +    *Out << P; +  } +  // FIXME: now that IsVerbose is part of DI, filtering for this will be moved +  // from here to clang. 
+  if (!OptDiag.isVerbose() || shouldEmitVerbose()) +    Ctx.diagnose(OptDiag); +} + +MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass() +    : MachineFunctionPass(ID) { +  initializeMachineOptimizationRemarkEmitterPassPass( +      *PassRegistry::getPassRegistry()); +} + +bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( +    MachineFunction &MF) { +  MachineBlockFrequencyInfo *MBFI; + +  if (MF.getFunction()->getContext().getDiagnosticHotnessRequested()) +    MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI(); +  else +    MBFI = nullptr; + +  ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); +  return false; +} + +void MachineOptimizationRemarkEmitterPass::getAnalysisUsage( +    AnalysisUsage &AU) const { +  AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); +  AU.setPreservesAll(); +  MachineFunctionPass::getAnalysisUsage(AU); +} + +char MachineOptimizationRemarkEmitterPass::ID = 0; +static const char ore_name[] = "Machine Optimization Remark Emitter"; +#define ORE_NAME "machine-opt-remark-emitter" + +INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, +                      false, true) +INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass) +INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, +                    false, true) diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp new file mode 100644 index 000000000000..581a8ad81149 --- /dev/null +++ b/lib/CodeGen/MachineOutliner.cpp @@ -0,0 +1,1251 @@ +//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Replaces repeated sequences of instructions with function calls. +/// +/// This works by placing every instruction from every basic block in a +/// suffix tree, and repeatedly querying that tree for repeated sequences of +/// instructions. If a sequence of instructions appears often, then it ought +/// to be beneficial to pull out into a function. +/// +/// This was originally presented at the 2016 LLVM Developers' Meeting in the +/// talk "Reducing Code Size Using Outlining". For a high-level overview of +/// how this pass works, the talk is available on YouTube at +/// +/// https://www.youtube.com/watch?v=yorld-WSOeU +/// +/// The slides for the talk are available at +/// +/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf +/// +/// The talk provides an overview of how the outliner finds candidates and +/// ultimately outlines them. It describes how the main data structure for this +/// pass, the suffix tree, is queried and purged for candidates. It also gives +/// a simplified suffix tree construction algorithm for suffix trees based off +/// of the algorithm actually used here, Ukkonen's algorithm. 
+/// +/// For the original RFC for this pass, please see +/// +/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html +/// +/// For more information on the suffix tree data structure, please see +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +/// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <functional> +#include <map> +#include <sstream> +#include <tuple> +#include <vector> + +#define DEBUG_TYPE "machine-outliner" + +using namespace llvm; + +STATISTIC(NumOutlined, "Number of candidates outlined"); +STATISTIC(FunctionsCreated, "Number of functions created"); + +namespace { + +/// \brief An individual sequence of instructions to be replaced with a call to +/// an outlined function. +struct Candidate { + +  /// Set to false if the candidate overlapped with another candidate. +  bool InCandidateList = true; + +  /// The start index of this \p Candidate. +  size_t StartIdx; + +  /// The number of instructions in this \p Candidate. +  size_t Len; + +  /// The index of this \p Candidate's \p OutlinedFunction in the list of +  /// \p OutlinedFunctions. +  size_t FunctionIdx; + +  /// \brief The number of instructions that would be saved by outlining every +  /// candidate of this type. +  /// +  /// This is a fixed value which is not updated during the candidate pruning +  /// process. It is only used for deciding which candidate to keep if two +  /// candidates overlap. The true benefit is stored in the OutlinedFunction +  /// for some given candidate. +  unsigned Benefit = 0; + +  Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx) +      : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {} + +  Candidate() {} + +  /// \brief Used to ensure that \p Candidates are outlined in an order that +  /// preserves the start and end indices of other \p Candidates. +  bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; } +}; + +/// \brief The information necessary to create an outlined function for some +/// class of candidate. +struct OutlinedFunction { + +  /// The actual outlined function created. +  /// This is initialized after we go through and create the actual function. +  MachineFunction *MF = nullptr; + +  /// A number assigned to this function which appears at the end of its name. +  size_t Name; + +  /// The number of candidates for this OutlinedFunction. +  size_t OccurrenceCount = 0; + +  /// \brief The sequence of integers corresponding to the instructions in this +  /// function. +  std::vector<unsigned> Sequence; + +  /// The number of instructions this function would save. +  unsigned Benefit = 0; + +  /// \brief Set to true if candidates for this outlined function should be +  /// replaced with tail calls to this OutlinedFunction. 
+  bool IsTailCall = false; + +  OutlinedFunction(size_t Name, size_t OccurrenceCount, +                   const std::vector<unsigned> &Sequence, +                   unsigned Benefit, bool IsTailCall) +      : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence), +        Benefit(Benefit), IsTailCall(IsTailCall) +        {} +}; + +/// Represents an undefined index in the suffix tree. +const size_t EmptyIdx = -1; + +/// A node in a suffix tree which represents a substring or suffix. +/// +/// Each node has either no children or at least two children, with the root +/// being a exception in the empty tree. +/// +/// Children are represented as a map between unsigned integers and nodes. If +/// a node N has a child M on unsigned integer k, then the mapping represented +/// by N is a proper prefix of the mapping represented by M. Note that this, +/// although similar to a trie is somewhat different: each node stores a full +/// substring of the full mapping rather than a single character state. +/// +/// Each internal node contains a pointer to the internal node representing +/// the same string, but with the first character chopped off. This is stored +/// in \p Link. Each leaf node stores the start index of its respective +/// suffix in \p SuffixIdx. +struct SuffixTreeNode { + +  /// The children of this node. +  /// +  /// A child existing on an unsigned integer implies that from the mapping +  /// represented by the current node, there is a way to reach another +  /// mapping by tacking that character on the end of the current string. +  DenseMap<unsigned, SuffixTreeNode *> Children; + +  /// A flag set to false if the node has been pruned from the tree. +  bool IsInTree = true; + +  /// The start index of this node's substring in the main string. +  size_t StartIdx = EmptyIdx; + +  /// The end index of this node's substring in the main string. +  /// +  /// Every leaf node must have its \p EndIdx incremented at the end of every +  /// step in the construction algorithm. To avoid having to update O(N) +  /// nodes individually at the end of every step, the end index is stored +  /// as a pointer. +  size_t *EndIdx = nullptr; + +  /// For leaves, the start index of the suffix represented by this node. +  /// +  /// For all other nodes, this is ignored. +  size_t SuffixIdx = EmptyIdx; + +  /// \brief For internal nodes, a pointer to the internal node representing +  /// the same sequence with the first character chopped off. +  /// +  /// This has two major purposes in the suffix tree. The first is as a +  /// shortcut in Ukkonen's construction algorithm. One of the things that +  /// Ukkonen's algorithm does to achieve linear-time construction is +  /// keep track of which node the next insert should be at. This makes each +  /// insert O(1), and there are a total of O(N) inserts. The suffix link +  /// helps with inserting children of internal nodes. +  /// +  /// Say we add a child to an internal node with associated mapping S. The  +  /// next insertion must be at the node representing S - its first character. +  /// This is given by the way that we iteratively build the tree in Ukkonen's +  /// algorithm. The main idea is to look at the suffixes of each prefix in the +  /// string, starting with the longest suffix of the prefix, and ending with +  /// the shortest. Therefore, if we keep pointers between such nodes, we can +  /// move to the next insertion point in O(1) time. If we don't, then we'd +  /// have to query from the root, which takes O(N) time. 
This would make the +  /// construction algorithm O(N^2) rather than O(N). +  /// +  /// The suffix link is also used during the tree pruning process to let us +  /// quickly throw out a bunch of potential overlaps. Say we have a sequence +  /// S we want to outline. Then each of its suffixes contribute to at least +  /// one overlapping case. Therefore, we can follow the suffix links +  /// starting at the node associated with S to the root and "delete" those +  /// nodes, save for the root. For each candidate, this removes +  /// O(|candidate|) overlaps from the search space. We don't actually +  /// completely invalidate these nodes though; doing that is far too +  /// aggressive. Consider the following pathological string: +  /// +  /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 +  /// +  /// If we, for the sake of example, outlined 1 2 3, then we would throw +  /// out all instances of 2 3. This isn't desirable. To get around this, +  /// when we visit a link node, we decrement its occurrence count by the +  /// number of sequences we outlined in the current step. In the pathological +  /// example, the 2 3 node would have an occurrence count of 8, while the +  /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node +  /// would survive to the next round allowing us to outline the extra +  /// instances of 2 3. +  SuffixTreeNode *Link = nullptr; + +  /// The parent of this node. Every node except for the root has a parent. +  SuffixTreeNode *Parent = nullptr; + +  /// The number of times this node's string appears in the tree. +  /// +  /// This is equal to the number of leaf children of the string. It represents +  /// the number of suffixes that the node's string is a prefix of. +  size_t OccurrenceCount = 0; + +  /// The length of the string formed by concatenating the edge labels from the +  /// root to this node. +  size_t ConcatLen = 0; + +  /// Returns true if this node is a leaf. +  bool isLeaf() const { return SuffixIdx != EmptyIdx; } + +  /// Returns true if this node is the root of its owning \p SuffixTree. +  bool isRoot() const { return StartIdx == EmptyIdx; } + +  /// Return the number of elements in the substring associated with this node. +  size_t size() const { + +    // Is it the root? If so, it's the empty string so return 0. +    if (isRoot()) +      return 0; + +    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + +    // Size = the number of elements in the string. +    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. +    return *EndIdx - StartIdx + 1; +  } + +  SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link, +                 SuffixTreeNode *Parent) +      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} + +  SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. 
+/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. +/// +/// The suffix tree is implemented using Ukkonen's algorithm for linear-time +/// suffix tree construction. Ukkonen's algorithm is explained in more detail +/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +/// paper is available at +/// +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +class SuffixTree { +private: +  /// Each element is an integer representing an instruction in the module. +  ArrayRef<unsigned> Str; + +  /// Maintains each node in the tree. +  SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; + +  /// The root of the suffix tree. +  /// +  /// The root represents the empty string. It is maintained by the +  /// \p NodeAllocator like every other node in the tree. +  SuffixTreeNode *Root = nullptr; + +  /// Stores each leaf node in the tree. +  /// +  /// This is used for finding outlining candidates. +  std::vector<SuffixTreeNode *> LeafVector; + +  /// Maintains the end indices of the internal nodes in the tree. +  /// +  /// Each internal node is guaranteed to never have its end index change +  /// during the construction algorithm; however, leaves must be updated at +  /// every step. Therefore, we need to store leaf end indices by reference +  /// to avoid updating O(N) leaves at every step of construction. Thus, +  /// every internal node must be allocated its own end index. +  BumpPtrAllocator InternalEndIdxAllocator; + +  /// The end index of each leaf in the tree. +  size_t LeafEndIdx = -1; + +  /// \brief Helper struct which keeps track of the next insertion point in +  /// Ukkonen's algorithm. +  struct ActiveState { +    /// The next node to insert at. +    SuffixTreeNode *Node; + +    /// The index of the first character in the substring currently being added. +    size_t Idx = EmptyIdx; + +    /// The length of the substring we have to add at the current step. +    size_t Len = 0; +  }; + +  /// \brief The point the next insertion will take place at in the +  /// construction algorithm. +  ActiveState Active; + +  /// Allocate a leaf node and add it to the tree. +  /// +  /// \param Parent The parent of this node. +  /// \param StartIdx The start index of this node's associated string. +  /// \param Edge The label on the edge leaving \p Parent to this node. +  /// +  /// \returns A pointer to the allocated leaf node. +  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx, +                             unsigned Edge) { + +    assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + +    SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,  +                                                                   &LeafEndIdx, +                                                                       nullptr, +                                                                      &Parent); +    Parent.Children[Edge] = N; + +    return N; +  } + +  /// Allocate an internal node and add it to the tree. +  /// +  /// \param Parent The parent of this node. Only null when allocating the root. +  /// \param StartIdx The start index of this node's associated string. +  /// \param EndIdx The end index of this node's associated string. +  /// \param Edge The label on the edge leaving \p Parent to this node. +  /// +  /// \returns A pointer to the allocated internal node. 
+  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx, +                                     size_t EndIdx, unsigned Edge) { + +    assert(StartIdx <= EndIdx && "String can't start after it ends!"); +    assert(!(!Parent && StartIdx != EmptyIdx) && +    "Non-root internal nodes must have parents!"); + +    size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx); +    SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, +                                                                      E, +                                                                      Root, +                                                                      Parent); +    if (Parent) +      Parent->Children[Edge] = N; + +    return N; +  } + +  /// \brief Set the suffix indices of the leaves to the start indices of their +  /// respective suffixes. Also stores each leaf in \p LeafVector at its +  /// respective suffix index. +  /// +  /// \param[in] CurrNode The node currently being visited. +  /// \param CurrIdx The current index of the string being visited. +  void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) { + +    bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot(); + +    // Store the length of the concatenation of all strings from the root to +    // this node. +    if (!CurrNode.isRoot()) { +      if (CurrNode.ConcatLen == 0) +        CurrNode.ConcatLen = CurrNode.size(); + +      if (CurrNode.Parent) +       CurrNode.ConcatLen += CurrNode.Parent->ConcatLen; +    } + +    // Traverse the tree depth-first. +    for (auto &ChildPair : CurrNode.Children) { +      assert(ChildPair.second && "Node had a null child!"); +      setSuffixIndices(*ChildPair.second, +                       CurrIdx + ChildPair.second->size()); +    } + +    // Is this node a leaf? +    if (IsLeaf) { +      // If yes, give it a suffix index and bump its parent's occurrence count. +      CurrNode.SuffixIdx = Str.size() - CurrIdx; +      assert(CurrNode.Parent && "CurrNode had no parent!"); +      CurrNode.Parent->OccurrenceCount++; + +      // Store the leaf in the leaf vector for pruning later. +      LeafVector[CurrNode.SuffixIdx] = &CurrNode; +    } +  } + +  /// \brief Construct the suffix tree for the prefix of the input ending at +  /// \p EndIdx. +  /// +  /// Used to construct the full suffix tree iteratively. At the end of each +  /// step, the constructed suffix tree is either a valid suffix tree, or a +  /// suffix tree with implicit suffixes. At the end of the final step, the +  /// suffix tree is a valid tree. +  /// +  /// \param EndIdx The end index of the current prefix in the main string. +  /// \param SuffixesToAdd The number of suffixes that must be added +  /// to complete the suffix tree at the current phase. +  /// +  /// \returns The number of suffixes that have not been added at the end of +  /// this step. +  unsigned extend(size_t EndIdx, size_t SuffixesToAdd) { +    SuffixTreeNode *NeedsLink = nullptr; + +    while (SuffixesToAdd > 0) { +     +      // Are we waiting to add anything other than just the last character? +      if (Active.Len == 0) { +        // If not, then say the active index is the end index. +        Active.Idx = EndIdx; +      } + +      assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + +      // The first character in the current substring we're looking at. +      unsigned FirstChar = Str[Active.Idx]; + +      // Have we inserted anything starting with FirstChar at the current node? 
+      if (Active.Node->Children.count(FirstChar) == 0) { +        // If not, then we can just insert a leaf and move too the next step. +        insertLeaf(*Active.Node, EndIdx, FirstChar); + +        // The active node is an internal node, and we visited it, so it must +        // need a link if it doesn't have one. +        if (NeedsLink) { +          NeedsLink->Link = Active.Node; +          NeedsLink = nullptr; +        } +      } else { +        // There's a match with FirstChar, so look for the point in the tree to +        // insert a new node. +        SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; + +        size_t SubstringLen = NextNode->size(); + +        // Is the current suffix we're trying to insert longer than the size of +        // the child we want to move to? +        if (Active.Len >= SubstringLen) { +          // If yes, then consume the characters we've seen and move to the next +          // node. +          Active.Idx += SubstringLen; +          Active.Len -= SubstringLen; +          Active.Node = NextNode; +          continue; +        } + +        // Otherwise, the suffix we're trying to insert must be contained in the +        // next node we want to move to. +        unsigned LastChar = Str[EndIdx]; + +        // Is the string we're trying to insert a substring of the next node? +        if (Str[NextNode->StartIdx + Active.Len] == LastChar) { +          // If yes, then we're done for this step. Remember our insertion point +          // and move to the next end index. At this point, we have an implicit +          // suffix tree. +          if (NeedsLink && !Active.Node->isRoot()) { +            NeedsLink->Link = Active.Node; +            NeedsLink = nullptr; +          } + +          Active.Len++; +          break; +        } + +        // The string we're trying to insert isn't a substring of the next node, +        // but matches up to a point. Split the node. +        // +        // For example, say we ended our search at a node n and we're trying to +        // insert ABD. Then we'll create a new node s for AB, reduce n to just +        // representing C, and insert a new leaf node l to represent d. This +        // allows us to ensure that if n was a leaf, it remains a leaf. +        // +        //   | ABC  ---split--->  | AB +        //   n                    s +        //                     C / \ D +        //                      n   l + +        // The node s from the diagram +        SuffixTreeNode *SplitNode = +            insertInternalNode(Active.Node, +                               NextNode->StartIdx, +                               NextNode->StartIdx + Active.Len - 1, +                               FirstChar); + +        // Insert the new node representing the new substring into the tree as +        // a child of the split node. This is the node l from the diagram. +        insertLeaf(*SplitNode, EndIdx, LastChar); + +        // Make the old node a child of the split node and update its start +        // index. This is the node n from the diagram. +        NextNode->StartIdx += Active.Len; +        NextNode->Parent = SplitNode; +        SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; + +        // SplitNode is an internal node, update the suffix link. +        if (NeedsLink) +          NeedsLink->Link = SplitNode; + +        NeedsLink = SplitNode; +      } + +      // We've added something new to the tree, so there's one less suffix to +      // add. 
+      SuffixesToAdd--; + +      if (Active.Node->isRoot()) { +        if (Active.Len > 0) { +          Active.Len--; +          Active.Idx = EndIdx - SuffixesToAdd + 1; +        } +      } else { +        // Start the next phase at the next smallest suffix. +        Active.Node = Active.Node->Link; +      } +    } + +    return SuffixesToAdd; +  } + +public: + +  /// Find all repeated substrings that satisfy \p BenefitFn. +  /// +  /// If a substring appears at least twice, then it must be represented by +  /// an internal node which appears in at least two suffixes. Each suffix is +  /// represented by a leaf node. To do this, we visit each internal node in +  /// the tree, using the leaf children of each internal node. If an internal +  /// node represents a beneficial substring, then we use each of its leaf +  /// children to find the locations of its substring. +  /// +  /// \param[out] CandidateList Filled with candidates representing each +  /// beneficial substring. +  /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions each +  /// type of candidate. +  /// \param BenefitFn The function to satisfy. +  /// +  /// \returns The length of the longest candidate found. +  size_t findCandidates(std::vector<Candidate> &CandidateList, +  std::vector<OutlinedFunction> &FunctionList, +  const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)> +  &BenefitFn) { + +    CandidateList.clear(); +    FunctionList.clear(); +    size_t FnIdx = 0; +    size_t MaxLen = 0; + +    for (SuffixTreeNode* Leaf : LeafVector) { +      assert(Leaf && "Leaves in LeafVector cannot be null!"); +      if (!Leaf->IsInTree) +        continue; + +      assert(Leaf->Parent && "All leaves must have parents!"); +      SuffixTreeNode &Parent = *(Leaf->Parent); + +      // If it doesn't appear enough, or we already outlined from it, skip it. +      if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree) +        continue; + +      size_t StringLen = Leaf->ConcatLen - Leaf->size(); + +      // How many instructions would outlining this string save? +      unsigned Benefit = BenefitFn(Parent, +        StringLen, Str[Leaf->SuffixIdx + StringLen - 1]); + +      // If it's not beneficial, skip it. +      if (Benefit < 1) +        continue; + +      if (StringLen > MaxLen) +        MaxLen = StringLen; + +      unsigned OccurrenceCount = 0; +      for (auto &ChildPair : Parent.Children) { +        SuffixTreeNode *M = ChildPair.second; + +        // Is it a leaf? If so, we have an occurrence of this candidate. +        if (M && M->IsInTree && M->isLeaf()) { +          OccurrenceCount++; +          CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx); +          CandidateList.back().Benefit = Benefit; +          M->IsInTree = false; +        } +      } + +      // Save the function for the new candidate sequence. +      std::vector<unsigned> CandidateSequence; +      for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++) +        CandidateSequence.push_back(Str[i]); + +      FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence, +                                Benefit, false); + +      // Move to the next function. +      FnIdx++; +      Parent.IsInTree = false; +    } + +    return MaxLen; +  } +  +  /// Construct a suffix tree from a sequence of unsigned integers. +  /// +  /// \param Str The string to construct the suffix tree for. 
+  SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+    Root->IsInTree = true;
+    Active.Node = Root;
+    LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+    // Keep track of the number of suffixes we have to add of the current
+    // prefix.
+    size_t SuffixesToAdd = 0;
+    Active.Node = Root;
+
+    // Construct the suffix tree iteratively on each prefix of the string.
+    // PfxEndIdx is the end index of the current prefix.
+    // End is one past the last element in the string.
+    for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+      SuffixesToAdd++;
+      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+    }
+
+    // Set the suffix indices of each leaf.
+    assert(Root && "Root node can't be nullptr!");
+    setSuffixIndices(*Root, 0);
+  }
+};
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+  /// \brief The next available integer to assign to a \p MachineInstr that
+  /// cannot be outlined.
+  ///
+  /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+  unsigned IllegalInstrNumber = -3;
+
+  /// \brief The next available integer to assign to a \p MachineInstr that can
+  /// be outlined.
+  unsigned LegalInstrNumber = 0;
+
+  /// Correspondence from \p MachineInstrs to unsigned integers.
+  DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+      InstructionIntegerMap;
+
+  /// Correspondence from unsigned integers to \p MachineInstrs.
+  /// Inverse of \p InstructionIntegerMap.
+  DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+
+  /// The vector of unsigned integers that the module is mapped to.
+  std::vector<unsigned> UnsignedVec;
+
+  /// \brief Stores the location of the instruction associated with the integer
+  /// at index i in \p UnsignedVec for each index i.
+  std::vector<MachineBasicBlock::iterator> InstrList;
+
+  /// \brief Maps \p *It to a legal integer.
+  ///
+  /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
+  /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+  ///
+  /// \returns The integer that \p *It was mapped to.
+  unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+
+    // Get the integer for this instruction or give it the current
+    // LegalInstrNumber.
+    InstrList.push_back(It);
+    MachineInstr &MI = *It;
+    bool WasInserted;
+    DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+    ResultIt;
+    std::tie(ResultIt, WasInserted) =
+    InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+    unsigned MINumber = ResultIt->second;
+
+    // There was an insertion.
+    if (WasInserted) {
+      LegalInstrNumber++;
+      IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
+    }
+
+    UnsignedVec.push_back(MINumber);
+
+    // Make sure we don't overflow or use any integers reserved by the DenseMap.
+    if (LegalInstrNumber >= IllegalInstrNumber) +      report_fatal_error("Instruction mapping overflow!"); + +    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() +          && "Tried to assign DenseMap tombstone or empty key to instruction."); +    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() +          && "Tried to assign DenseMap tombstone or empty key to instruction."); + +    return MINumber; +  } + +  /// Maps \p *It to an illegal integer. +  /// +  /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber. +  /// +  /// \returns The integer that \p *It was mapped to. +  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) { +    unsigned MINumber = IllegalInstrNumber; + +    InstrList.push_back(It); +    UnsignedVec.push_back(IllegalInstrNumber); +    IllegalInstrNumber--; + +    assert(LegalInstrNumber < IllegalInstrNumber && +           "Instruction mapping overflow!"); + +    assert(IllegalInstrNumber != +      DenseMapInfo<unsigned>::getEmptyKey() && +      "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + +    assert(IllegalInstrNumber != +      DenseMapInfo<unsigned>::getTombstoneKey() && +      "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + +    return MINumber; +  } + +  /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds +  /// and appends it to \p UnsignedVec and \p InstrList. +  /// +  /// Two instructions are assigned the same integer if they are identical. +  /// If an instruction is deemed unsafe to outline, then it will be assigned an +  /// unique integer. The resulting mapping is placed into a suffix tree and +  /// queried for candidates. +  /// +  /// \param MBB The \p MachineBasicBlock to be translated into integers. +  /// \param TRI \p TargetRegisterInfo for the module. +  /// \param TII \p TargetInstrInfo for the module. +  void convertToUnsignedVec(MachineBasicBlock &MBB, +                            const TargetRegisterInfo &TRI, +                            const TargetInstrInfo &TII) { +    for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et; +         It++) { + +      // Keep track of where this instruction is in the module. +      switch(TII.getOutliningType(*It)) { +        case TargetInstrInfo::MachineOutlinerInstrType::Illegal: +          mapToIllegalUnsigned(It); +          break; + +        case TargetInstrInfo::MachineOutlinerInstrType::Legal: +          mapToLegalUnsigned(It); +          break; + +        case TargetInstrInfo::MachineOutlinerInstrType::Invisible: +          break; +      } +    } + +    // After we're done every insertion, uniquely terminate this part of the +    // "string". This makes sure we won't match across basic block or function +    // boundaries since the "end" is encoded uniquely and thus appears in no +    // repeated substring. +    InstrList.push_back(MBB.end()); +    UnsignedVec.push_back(IllegalInstrNumber); +    IllegalInstrNumber--; +  } + +  InstructionMapper() { +    // Make sure that the implementation of DenseMapInfo<unsigned> hasn't +    // changed. 
+    assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && +                "DenseMapInfo<unsigned>'s empty key isn't -1!"); +    assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && +                "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); +  } +}; + +/// \brief An interprocedural pass which finds repeated sequences of +/// instructions and replaces them with calls to functions. +/// +/// Each instruction is mapped to an unsigned integer and placed in a string. +/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree +/// is then repeatedly queried for repeated sequences of instructions. Each +/// non-overlapping repeated sequence is then placed in its own +/// \p MachineFunction and each instance is then replaced with a call to that +/// function. +struct MachineOutliner : public ModulePass { + +  static char ID; + +  StringRef getPassName() const override { return "Machine Outliner"; } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<MachineModuleInfo>(); +    AU.addPreserved<MachineModuleInfo>(); +    AU.setPreservesAll(); +    ModulePass::getAnalysisUsage(AU); +  } + +  MachineOutliner() : ModulePass(ID) { +    initializeMachineOutlinerPass(*PassRegistry::getPassRegistry()); +  } + +  /// \brief Replace the sequences of instructions represented by the +  /// \p Candidates in \p CandidateList with calls to \p MachineFunctions +  /// described in \p FunctionList. +  /// +  /// \param M The module we are outlining from. +  /// \param CandidateList A list of candidates to be outlined. +  /// \param FunctionList A list of functions to be inserted into the module. +  /// \param Mapper Contains the instruction mappings for the module. +  bool outline(Module &M, const ArrayRef<Candidate> &CandidateList, +               std::vector<OutlinedFunction> &FunctionList, +               InstructionMapper &Mapper); + +  /// Creates a function for \p OF and inserts it into the module. +  MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF, +                                          InstructionMapper &Mapper); + +  /// Find potential outlining candidates and store them in \p CandidateList. +  /// +  /// For each type of potential candidate, also build an \p OutlinedFunction +  /// struct containing the information to build the function for that +  /// candidate. +  /// +  /// \param[out] CandidateList Filled with outlining candidates for the module. +  /// \param[out] FunctionList Filled with functions corresponding to each type +  /// of \p Candidate. +  /// \param ST The suffix tree for the module. +  /// \param TII TargetInstrInfo for the module. +  /// +  /// \returns The length of the longest candidate found. 0 if there are none. +  unsigned buildCandidateList(std::vector<Candidate> &CandidateList, +                              std::vector<OutlinedFunction> &FunctionList, +                              SuffixTree &ST, +                              InstructionMapper &Mapper, +                              const TargetInstrInfo &TII); + +  /// \brief Remove any overlapping candidates that weren't handled by the +  /// suffix tree's pruning method. +  /// +  /// Pruning from the suffix tree doesn't necessarily remove all overlaps. +  /// If a short candidate is chosen for outlining, then a longer candidate +  /// which has that short candidate as a suffix is chosen, the tree's pruning +  /// method will not find it. Thus, we need to prune before outlining as well. 
+  /// +  /// \param[in,out] CandidateList A list of outlining candidates. +  /// \param[in,out] FunctionList A list of functions to be outlined. +  /// \param MaxCandidateLen The length of the longest candidate. +  /// \param TII TargetInstrInfo for the module. +  void pruneOverlaps(std::vector<Candidate> &CandidateList, +                     std::vector<OutlinedFunction> &FunctionList, +                     unsigned MaxCandidateLen, +                     const TargetInstrInfo &TII); + +  /// Construct a suffix tree on the instructions in \p M and outline repeated +  /// strings from that tree. +  bool runOnModule(Module &M) override; +}; + +} // Anonymous namespace. + +char MachineOutliner::ID = 0; + +namespace llvm { +ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); } +} + +INITIALIZE_PASS(MachineOutliner, "machine-outliner", +                "Machine Function Outliner", false, false) + +void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList, +                                    std::vector<OutlinedFunction> &FunctionList, +                                    unsigned MaxCandidateLen, +                                    const TargetInstrInfo &TII) { +  // TODO: Experiment with interval trees or other interval-checking structures +  // to lower the time complexity of this function. +  // TODO: Can we do better than the simple greedy choice? +  // Check for overlaps in the range. +  // This is O(MaxCandidateLen * CandidateList.size()). +  for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et; +       It++) { +    Candidate &C1 = *It; +    OutlinedFunction &F1 = FunctionList[C1.FunctionIdx]; + +    // If we removed this candidate, skip it. +    if (!C1.InCandidateList) +      continue; + +    // Is it still worth it to outline C1? +    if (F1.Benefit < 1 || F1.OccurrenceCount < 2) { +      assert(F1.OccurrenceCount > 0 && +               "Can't remove OutlinedFunction with no occurrences!"); +      F1.OccurrenceCount--; +      C1.InCandidateList = false; +      continue; +    } + +    // The minimum start index of any candidate that could overlap with this +    // one. +    unsigned FarthestPossibleIdx = 0; + +    // Either the index is 0, or it's at most MaxCandidateLen indices away. +    if (C1.StartIdx > MaxCandidateLen) +      FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen; + +    // Compare against the candidates in the list that start at at most +    // FarthestPossibleIdx indices away from C1. There are at most +    // MaxCandidateLen of these. +    for (auto Sit = It + 1; Sit != Et; Sit++) { +      Candidate &C2 = *Sit; +      OutlinedFunction &F2 = FunctionList[C2.FunctionIdx]; + +      // Is this candidate too far away to overlap? +      if (C2.StartIdx < FarthestPossibleIdx) +        break; + +      // Did we already remove this candidate in a previous step? +      if (!C2.InCandidateList) +        continue; + +      // Is the function beneficial to outline? +      if (F2.OccurrenceCount < 2 || F2.Benefit < 1) { +        // If not, remove this candidate and move to the next one. +        assert(F2.OccurrenceCount > 0 && +               "Can't remove OutlinedFunction with no occurrences!"); +        F2.OccurrenceCount--; +        C2.InCandidateList = false; +        continue; +      } + +      size_t C2End = C2.StartIdx + C2.Len - 1; + +      // Do C1 and C2 overlap? +      // +      // Not overlapping: +      // High indices... [C1End ... C1Start][C2End ... 
C2Start] ...Low indices
+      //
+      // We sorted our candidate list so C2Start <= C1Start. We know that
+      // C2End > C2Start since each candidate has length >= 2. Therefore, all we
+      // have to check is whether C2End < C1Start to see that they don't overlap.
+      if (C2End < C1.StartIdx)
+        continue;
+
+      // C1 and C2 overlap.
+      // We need to choose the better of the two.
+      //
+      // Approximate this by picking the one which would have saved us the
+      // most instructions before any pruning.
+      if (C1.Benefit >= C2.Benefit) {
+
+        // C1 is better, so remove C2 and update C2's OutlinedFunction to
+        // reflect the removal.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+                                             F2.OccurrenceCount,
+                                             F2.IsTailCall
+                                             );
+
+        C2.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C2. \n";
+          dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+          dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+        );
+
+      } else {
+        // C2 is better, so remove C1 and update C1's OutlinedFunction to
+        // reflect the removal.
+        assert(F1.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F1.OccurrenceCount--;
+        F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+                                             F1.OccurrenceCount,
+                                             F1.IsTailCall
+                                             );
+        C1.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C1. \n";
+          dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+          dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+        );
+
+        // C1 is out, so we don't have to compare it against anyone else.
+        break;
+      }
+    }
+  }
+}
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    SuffixTree &ST,
+                                    InstructionMapper &Mapper,
+                                    const TargetInstrInfo &TII) {
+
+  std::vector<unsigned> CandidateSequence; // Current outlining candidate.
+  size_t MaxCandidateLen = 0; // Length of the longest candidate.
+
+  // Function for maximizing query in the suffix tree.
+  // This allows us to define more fine-grained types of things to outline in
+  // the target without putting target-specific info in the suffix tree.
+  auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+                                   size_t StringLen, unsigned EndVal) {
+
+    // The root represents the empty string.
+    if (Curr.isRoot())
+      return 0u;
+
+    // Is this long enough to outline?
+    // TODO: Let the target decide how "long" a string is in terms of the sizes
+    // of the instructions in the string. For example, if a call instruction
+    // is smaller than a one-instruction string, we should outline that string.
+    if (StringLen < 2) +      return 0u; + +    size_t Occurrences = Curr.OccurrenceCount; + +    // Anything we want to outline has to appear at least twice. +    if (Occurrences < 2) +      return 0u; + +    // Check if the last instruction in the sequence is a return. +    MachineInstr *LastInstr = +    Mapper.IntegerInstructionMap[EndVal]; +    assert(LastInstr && "Last instruction in sequence was unmapped!"); + +    // The only way a terminator could be mapped as legal is if it was safe to +    // tail call. +    bool IsTailCall = LastInstr->isTerminator(); +    return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall); +  }; + +  MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn); + +  for (auto &OF : FunctionList) +    OF.IsTailCall = Mapper. +                    IntegerInstructionMap[OF.Sequence.back()]->isTerminator(); + +  // Sort the candidates in decending order. This will simplify the outlining +  // process when we have to remove the candidates from the mapping by +  // allowing us to cut them out without keeping track of an offset. +  std::stable_sort(CandidateList.begin(), CandidateList.end()); + +  return MaxCandidateLen; +} + +MachineFunction * +MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, +  InstructionMapper &Mapper) { + +  // Create the function name. This should be unique. For now, just hash the +  // module name and include it in the function name plus the number of this +  // function. +  std::ostringstream NameStream; +  NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name; + +  // Create the function using an IR-level function. +  LLVMContext &C = M.getContext(); +  Function *F = dyn_cast<Function>( +      M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C))); +  assert(F && "Function was null!"); + +  // NOTE: If this is linkonceodr, then we can take advantage of linker deduping +  // which gives us better results when we outline from linkonceodr functions. +  F->setLinkage(GlobalValue::PrivateLinkage); +  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + +  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); +  IRBuilder<> Builder(EntryBB); +  Builder.CreateRetVoid(); + +  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); +  MachineFunction &MF = MMI.getMachineFunction(*F); +  MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); +  const TargetSubtargetInfo &STI = MF.getSubtarget(); +  const TargetInstrInfo &TII = *STI.getInstrInfo(); + +  // Insert the new function into the module. +  MF.insert(MF.begin(), &MBB); + +  TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall); + +  // Copy over the instructions for the function using the integer mappings in +  // its sequence. +  for (unsigned Str : OF.Sequence) { +    MachineInstr *NewMI = +        MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second); +    NewMI->dropMemRefs(); + +    // Don't keep debug information for outlined instructions. +    // FIXME: This means outlined functions are currently undebuggable. 
+    NewMI->setDebugLoc(DebugLoc()); +    MBB.insert(MBB.end(), NewMI); +  } + +  TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall); + +  return &MF; +} + +bool MachineOutliner::outline(Module &M, +                              const ArrayRef<Candidate> &CandidateList, +                              std::vector<OutlinedFunction> &FunctionList, +                              InstructionMapper &Mapper) { + +  bool OutlinedSomething = false; + +  // Replace the candidates with calls to their respective outlined functions. +  for (const Candidate &C : CandidateList) { + +    // Was the candidate removed during pruneOverlaps? +    if (!C.InCandidateList) +      continue; + +    // If not, then look at its OutlinedFunction. +    OutlinedFunction &OF = FunctionList[C.FunctionIdx]; + +    // Was its OutlinedFunction made unbeneficial during pruneOverlaps? +    if (OF.OccurrenceCount < 2 || OF.Benefit < 1) +      continue; + +    // If not, then outline it. +    assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); +    MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent(); +    MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx]; +    unsigned EndIdx = C.StartIdx + C.Len - 1; + +    assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); +    MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; +    assert(EndIt != MBB->end() && "EndIt out of bounds!"); + +    EndIt++; // Erase needs one past the end index. + +    // Does this candidate have a function yet? +    if (!OF.MF) { +      OF.MF = createOutlinedFunction(M, OF, Mapper); +      FunctionsCreated++; +    } + +    MachineFunction *MF = OF.MF; +    const TargetSubtargetInfo &STI = MF->getSubtarget(); +    const TargetInstrInfo &TII = *STI.getInstrInfo(); + +    // Insert a call to the new function and erase the old sequence. +    TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall); +    StartIt = Mapper.InstrList[C.StartIdx]; +    MBB->erase(StartIt, EndIt); + +    OutlinedSomething = true; + +    // Statistics. +    NumOutlined++; +  } + +  DEBUG ( +    dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n"; +  ); + +  return OutlinedSomething; +} + +bool MachineOutliner::runOnModule(Module &M) { + +  // Is there anything in the module at all? +  if (M.empty()) +    return false; + +  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); +  const TargetSubtargetInfo &STI = MMI.getMachineFunction(*M.begin()) +                                      .getSubtarget(); +  const TargetRegisterInfo *TRI = STI.getRegisterInfo(); +  const TargetInstrInfo *TII = STI.getInstrInfo(); + +  InstructionMapper Mapper; + +  // Build instruction mappings for each function in the module. +  for (Function &F : M) { +    MachineFunction &MF = MMI.getMachineFunction(F); + +    // Is the function empty? Safe to outline from? +    if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF)) +      continue; + +    // If it is, look at each MachineBasicBlock in the function. +    for (MachineBasicBlock &MBB : MF) { + +      // Is there anything in MBB? +      if (MBB.empty()) +        continue; + +      // If yes, map it. +      Mapper.convertToUnsignedVec(MBB, *TRI, *TII); +    } +  } + +  // Construct a suffix tree, use it to find candidates, and then outline them. +  SuffixTree ST(Mapper.UnsignedVec); +  std::vector<Candidate> CandidateList; +  std::vector<OutlinedFunction> FunctionList; + +  // Find all of the outlining candidates. 
+  unsigned MaxCandidateLen = +      buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII); + +  // Remove candidates that overlap with other candidates. +  pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII); + +  // Outline each of the candidates and return true if something was outlined. +  return outline(M, CandidateList, FunctionList, Mapper); +} diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 43a18099d39a..d06c38cf4ed8 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -552,7 +552,9 @@ public:      os << "\n";    } -  void dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +  LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif  };  /// This class repesents the scheduled code.  The main data structure is a @@ -593,7 +595,7 @@ private:    /// Virtual register information.    MachineRegisterInfo &MRI; -  DFAPacketizer *Resources; +  std::unique_ptr<DFAPacketizer> Resources;  public:    SMSchedule(MachineFunction *mf) @@ -604,13 +606,6 @@ public:      InitiationInterval = 0;    } -  ~SMSchedule() { -    ScheduledInstrs.clear(); -    InstrToCycle.clear(); -    RegToStageDiff.clear(); -    delete Resources; -  } -    void reset() {      ScheduledInstrs.clear();      InstrToCycle.clear(); @@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {      return false;    if (mf.getFunction()->getAttributes().hasAttribute( -          AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && +          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&        !EnableSWPOptSize.getPosition())      return false; @@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,    for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)      if (Phi.getOperand(i + 1).getMBB() != Loop)        InitVal = Phi.getOperand(i).getReg(); -    else if (Phi.getOperand(i + 1).getMBB() == Loop) +    else        LoopVal = Phi.getOperand(i).getReg();    assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); @@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis(      MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,      InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,      bool IsLast) { -  // Compute the stage number for the inital value of the Phi, which +  // Compute the stage number for the initial value of the Phi, which    // comes from the prolog. The prolog to use depends on to which kernel/    // epilog that we're adding the Phi.    unsigned PrologStage = 0; @@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,    // increment value to determine if the accesses may be loop carried.    if (OffsetS >= OffsetD)      return OffsetS + AccessSizeS > DeltaS; -  else if (OffsetS < OffsetD) +  else      return OffsetD + AccessSizeD > DeltaD;    return true; @@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const {    }  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  /// Utility function used for debugging to print the schedule. 
-void SMSchedule::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); } +#endif diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp index fc32183c7f63..71ad4e6aa7f5 100644 --- a/lib/CodeGen/MachineRegionInfo.cpp +++ b/lib/CodeGen/MachineRegionInfo.cpp @@ -1,10 +1,9 @@ -  #include "llvm/CodeGen/MachineRegionInfo.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/RegionInfoImpl.h"  #include "llvm/CodeGen/MachinePostDominators.h" -#define DEBUG_TYPE "region" +#define DEBUG_TYPE "machine-region-info"  using namespace llvm; @@ -86,6 +85,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {    auto DF = &getAnalysis<MachineDominanceFrontier>();    RI.recalculate(F, DT, PDT, DF); + +  DEBUG(RI.dump()); +    return false;  } @@ -103,9 +105,10 @@ void MachineRegionInfoPass::verifyAnalysis() const {  void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {    AU.setPreservesAll(); -  AU.addRequiredTransitive<DominatorTreeWrapperPass>(); -  AU.addRequired<PostDominatorTreeWrapperPass>(); -  AU.addRequired<DominanceFrontierWrapperPass>(); +  AU.addRequired<MachineDominatorTree>(); +  AU.addRequired<MachinePostDominatorTree>(); +  AU.addRequired<MachineDominanceFrontier>(); +  MachineFunctionPass::getAnalysisUsage(AU);  }  void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const { @@ -119,14 +122,15 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {  #endif  char MachineRegionInfoPass::ID = 0; +char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; -INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions", -                "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, +                      "Detect single entry single exit regions", true, true)  INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)  INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)  INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) -INITIALIZE_PASS_END(MachineRegionInfoPass, "regions", -                "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, +                    "Detect single entry single exit regions", true, true)  // Create methods available outside of this file, to use them  // "include/llvm/LinkAllPasses.h". 
Otherwise the pass would be deleted by diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 242cb0b80953..128910f8eb2a 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===// +//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -11,13 +11,27 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DebugLoc.h"  #include "llvm/IR/Function.h" -#include "llvm/Support/raw_os_ostream.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert>  using namespace llvm; @@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden,  void MachineRegisterInfo::Delegate::anchor() {}  MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) -    : MF(MF), TheDelegate(nullptr), -      TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && -                           EnableSubRegLiveness) { +    : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && +                                   EnableSubRegLiveness), +      IsUpdatedCSRsInitialized(false) {    unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();    VRegInfo.reserve(256);    RegAllocHints.reserve(256); @@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {    return TRC.getLaneMask();  } -#ifndef NDEBUG -void MachineRegisterInfo::dumpUses(unsigned Reg) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const {    for (MachineInstr &I : use_instructions(Reg))      I.dump();  } @@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {    }    return false;  } + +void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) { + +  const TargetRegisterInfo *TRI = getTargetRegisterInfo(); +  assert(Reg && (Reg < TRI->getNumRegs()) && +         "Trying to disable an invalid register"); + +  if (!IsUpdatedCSRsInitialized) { +    const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); +    for (const MCPhysReg *I = CSR; *I; ++I) +      UpdatedCSRs.push_back(*I); + +    // Zero value represents the end of the register list +    // (no more registers should be pushed). +    UpdatedCSRs.push_back(0); + +    IsUpdatedCSRsInitialized = true; +  } + +  // Remove the register (and its aliases from the list). 
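// UpdatedCSRs keeps the classic zero-terminated layout so existing callers
// that walk the list with "for (unsigned i = 0; CSR[i] != 0; ++i)" keep
// working, and the lines that follow prune it with the erase/remove idiom.
// A standalone sketch of both steps (register numbers and alias handling are
// simplified away; this is not the MachineRegisterInfo API itself):
#include <algorithm>
#include <cstdint>
#include <vector>

using PhysReg = std::uint16_t; // stand-in for MCPhysReg

// Copy a zero-terminated callee-saved register list into a vector, keeping
// the terminator so pointer-walking callers still stop at the first zero.
std::vector<PhysReg> copyCSRList(const PhysReg *CSR) {
  std::vector<PhysReg> Updated;
  for (const PhysReg *I = CSR; *I; ++I)
    Updated.push_back(*I);
  Updated.push_back(0);
  return Updated;
}

// Drop one register from the list; the terminator survives because 0 never
// matches a valid register number.
void disableCSR(std::vector<PhysReg> &Updated, PhysReg Reg) {
  Updated.erase(std::remove(Updated.begin(), Updated.end(), Reg),
                Updated.end());
}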
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) +    UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI), +                      UpdatedCSRs.end()); +} + +const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const { +  if (IsUpdatedCSRsInitialized) +    return UpdatedCSRs.data(); + +  return getTargetRegisterInfo()->getCalleeSavedRegs(MF); +} + +void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { +  if (IsUpdatedCSRsInitialized) +    UpdatedCSRs.clear(); + +  for (MCPhysReg Reg : CSRs) +    UpdatedCSRs.push_back(Reg); + +  // Zero value represents the end of the register list +  // (no more registers should be pushed). +  UpdatedCSRs.push_back(0); +  IsUpdatedCSRsInitialized = true; +} diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index e06bc517fa91..41e161f71e53 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -12,30 +12,67 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator_range.h"  #include "llvm/ADT/PriorityQueue.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/RegisterPressure.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MachineValueType.h"  #include "llvm/CodeGen/Passes.h"  #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h"  #include "llvm/CodeGen/ScheduleDFS.h"  #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SlotIndexes.h"  #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Pass.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/GraphWriter.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <string> +#include <tuple> +#include <utility> +#include <vector>  using namespace llvm;  #define DEBUG_TYPE "misched"  namespace llvm { +  cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,                             cl::desc("Force top-down list scheduling"));  cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, @@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,  cl::opt<bool>  DumpCriticalPathLength("misched-dcpl", cl::Hidden,                         cl::desc("Print 
critical path length to stdout")); -} + +} // end namespace llvm  #ifndef NDEBUG  static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, @@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,                                          cl::desc("Enable memop clustering."),                                          cl::init(true)); -// Experimental heuristics -static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, -  cl::desc("Enable scheduling for macro fusion."), cl::init(true)); -  static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,    cl::desc("Verify machine instrs before and after machine scheduling")); @@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8;  // Pin the vtables to this file.  void MachineSchedStrategy::anchor() {} +  void ScheduleDAGMutation::anchor() {}  //===----------------------------------------------------------------------===//  // Machine Instruction Scheduling Pass and Registry  //===----------------------------------------------------------------------===// -MachineSchedContext::MachineSchedContext(): -    MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) { +MachineSchedContext::MachineSchedContext() {    RegClassInfo = new RegisterClassInfo();  } @@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() {  }  namespace { +  /// Base class for a machine scheduler class that can run at any point.  class MachineSchedulerBase : public MachineSchedContext,                               public MachineFunctionPass { @@ -149,7 +184,8 @@ public:  protected:    ScheduleDAGInstrs *createPostMachineScheduler();  }; -} // namespace + +} // end anonymous namespace  char MachineScheduler::ID = 0; @@ -158,6 +194,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID;  INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",                        "Machine Instruction Scheduler", false, false)  INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)  INITIALIZE_PASS_DEPENDENCY(SlotIndexes)  INITIALIZE_PASS_DEPENDENCY(LiveIntervals)  INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", @@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {  /// MachineSchedOpt allows command line selection of the scheduler.  static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, -               RegisterPassParser<MachineSchedRegistry> > +               RegisterPassParser<MachineSchedRegistry>>  MachineSchedOpt("misched",                  cl::init(&useDefaultMachineSched), cl::Hidden,                  cl::desc("Machine instruction scheduler to use")); @@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,        // instruction stream until we find the nearest boundary.        
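// The loop below walks backwards from RegionEnd until it reaches a scheduling
// boundary, carving the block into regions that are scheduled independently.
// A standalone approximation of that carving over a plain index range, with a
// caller-supplied predicate standing in for isSchedBoundary (details such as
// debug instructions and per-region limits are ignored here):
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Returns [First, End) index pairs, produced bottom-up to match the backwards
// walk; regions with fewer than two instructions are skipped.
std::vector<std::pair<std::size_t, std::size_t>>
splitIntoRegions(std::size_t NumInstrs,
                 const std::function<bool(std::size_t)> &IsBoundary) {
  std::vector<std::pair<std::size_t, std::size_t>> Regions;
  std::size_t RegionEnd = NumInstrs;
  while (RegionEnd > 0) {
    std::size_t I = RegionEnd;
    // Scan towards the top of the block until the nearest boundary.
    while (I > 0 && !IsBoundary(I - 1))
      --I;
    if (RegionEnd - I > 1)
      Regions.push_back({I, RegionEnd});
    // Boundary instructions themselves are never scheduled; step over them.
    RegionEnd = (I > 0) ? I - 1 : 0;
  }
  return Regions;
}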
unsigned NumRegionInstrs = 0;        MachineBasicBlock::iterator I = RegionEnd; -      for (;I != MBB->begin(); --I) { +      for (; I != MBB->begin(); --I) {          MachineInstr &MI = *std::prev(I);          if (isSchedBoundary(&MI, &*MBB, MF, TII))            break; @@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {    // unimplemented  } -LLVM_DUMP_METHOD -void ReadyQueue::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ReadyQueue::dump() {    dbgs() << "Queue " << Name << ": ";    for (unsigned i = 0, e = Queue.size(); i < e; ++i)      dbgs() << Queue[i]->NodeNum << " ";    dbgs() << "\n";  } +#endif  //===----------------------------------------------------------------------===//  // ScheduleDAGMI - Basic machine instruction scheduling. This is @@ -519,8 +557,7 @@ void ReadyQueue::dump() {  // ===----------------------------------------------------------------------===/  // Provide a vtable anchor. -ScheduleDAGMI::~ScheduleDAGMI() { -} +ScheduleDAGMI::~ScheduleDAGMI() = default;  bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {    return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); @@ -825,7 +862,7 @@ void ScheduleDAGMI::placeDebugValues() {      RegionBegin = FirstDbgValue;    } -  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator +  for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator           DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {      std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);      MachineInstr *DbgValue = P.first; @@ -841,7 +878,7 @@ void ScheduleDAGMI::placeDebugValues() {  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void ScheduleDAGMI::dumpSchedule() const { +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {    for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {      if (SUnit *SU = getSUnit(&(*MI)))        SU->dump(this); @@ -1012,7 +1049,7 @@ updateScheduledPressure(const SUnit *SU,        ++CritIdx;      if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {        if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() -          && NewMaxPressure[ID] <= INT16_MAX) +          && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())          RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);      }      unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); @@ -1136,6 +1173,12 @@ void ScheduleDAGMILive::schedule() {          dbgs() << "  Pressure Diff      : ";          getPressureDiff(&SU).dump(*TRI);        } +      dbgs() << "  Single Issue       : "; +      if (SchedModel.mustBeginGroup(SU.getInstr()) && +         SchedModel.mustEndGroup(SU.getInstr())) +        dbgs() << "true;"; +      else +        dbgs() << "false;";        dbgs() << '\n';      }      if (ExitSU.getInstr() != nullptr) @@ -1396,6 +1439,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {  //===----------------------------------------------------------------------===//  namespace { +  /// \brief Post-process the DAG to create cluster edges between neighboring  /// loads or between neighboring stores.  
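// The mutation declared below groups loads or stores that share a base
// register and walks them in offset order, clustering neighbours so the
// scheduler tries to keep them adjacent. A standalone sketch of just that
// grouping and sorting step, with a plain struct standing in for the
// SUnit/MemOpInfo pair and an illustrative chain-length cap of 4:
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

struct MemOpLike {
  unsigned NodeNum; // stands in for the DAG node
  unsigned BaseReg; // base address register
  int64_t Offset;   // constant offset from the base
};

// Returns groups of node numbers that a scheduler could try to keep together.
std::vector<std::vector<unsigned>>
clusterByBaseAndOffset(std::vector<MemOpLike> Ops) {
  std::map<unsigned, std::vector<MemOpLike>> ByBase;
  for (const MemOpLike &Op : Ops)
    ByBase[Op.BaseReg].push_back(Op);

  std::vector<std::vector<unsigned>> Clusters;
  for (auto &Entry : ByBase) {
    std::vector<MemOpLike> &Group = Entry.second;
    std::sort(Group.begin(), Group.end(),
              [](const MemOpLike &A, const MemOpLike &B) {
                return A.Offset < B.Offset;
              });
    std::vector<unsigned> Chain;
    for (const MemOpLike &Op : Group) {
      if (Chain.size() == 4) { // keep chains from growing without bound
        Clusters.push_back(Chain);
        Chain.clear();
      }
      Chain.push_back(Op.NodeNum);
    }
    if (Chain.size() > 1)
      Clusters.push_back(Chain);
  }
  return Clusters;
}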
class BaseMemOpClusterMutation : public ScheduleDAGMutation { @@ -1403,6 +1447,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {      SUnit *SU;      unsigned BaseReg;      int64_t Offset; +      MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)          : SU(su), BaseReg(reg), Offset(ofs) {} @@ -1439,25 +1484,26 @@ public:    LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)        : BaseMemOpClusterMutation(tii, tri, true) {}  }; -} // anonymous + +} // end anonymous namespace  namespace llvm {  std::unique_ptr<ScheduleDAGMutation>  createLoadClusterDAGMutation(const TargetInstrInfo *TII,                               const TargetRegisterInfo *TRI) { -  return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI) +  return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI)                              : nullptr;  }  std::unique_ptr<ScheduleDAGMutation>  createStoreClusterDAGMutation(const TargetInstrInfo *TII,                                const TargetRegisterInfo *TRI) { -  return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI) +  return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI)                              : nullptr;  } -} // namespace llvm +} // end namespace llvm  void BaseMemOpClusterMutation::clusterNeighboringMemOps(      ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { @@ -1543,80 +1589,11 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {  }  //===----------------------------------------------------------------------===// -// MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - -namespace { -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class MacroFusion : public ScheduleDAGMutation { -  const TargetInstrInfo &TII; -public: -  MacroFusion(const TargetInstrInfo &TII) -    : TII(TII) {} - -  void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; -} // anonymous - -namespace llvm { - -std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII) { -  return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr; -} - -} // namespace llvm - -/// \brief Callback from DAG postProcessing to create cluster edges to encourage -/// fused operations. -void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { -  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - -  // For now, assume targets can only fuse with the branch. -  SUnit &ExitSU = DAG->ExitSU; -  MachineInstr *Branch = ExitSU.getInstr(); -  if (!Branch) -    return; - -  for (SDep &PredDep : ExitSU.Preds) { -    if (PredDep.isWeak()) -      continue; -    SUnit &SU = *PredDep.getSUnit(); -    MachineInstr &Pred = *SU.getInstr(); -    if (!TII.shouldScheduleAdjacent(Pred, *Branch)) -      continue; - -    // Create a single weak edge from SU to ExitSU. The only effect is to cause -    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no -    // need to copy predecessor edges from ExitSU to SU, since top-down -    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling -    // of SU, we could create an artificial edge from the deepest root, but it -    // hasn't been needed yet. 
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); -    (void)Success; -    assert(Success && "No DAG nodes should be reachable from ExitSU"); - -    // Adjust latency of data deps between the nodes. -    for (SDep &PredDep : ExitSU.Preds) { -      if (PredDep.getSUnit() == &SU) -        PredDep.setLatency(0); -    } -    for (SDep &SuccDep : SU.Succs) { -      if (SuccDep.getSUnit() == &ExitSU) -        SuccDep.setLatency(0); -    } - -    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); -    break; -  } -} - -//===----------------------------------------------------------------------===//  // CopyConstrain - DAG post-processing to encourage copy elimination.  //===----------------------------------------------------------------------===//  namespace { +  /// \brief Post-process the DAG to create weak edges from all uses of a copy to  /// the one use that defines the copy's source vreg, most likely an induction  /// variable increment. @@ -1626,6 +1603,7 @@ class CopyConstrain : public ScheduleDAGMutation {    // RegionEndIdx is the slot index of the last non-debug instruction in the    // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.    SlotIndex RegionEndIdx; +  public:    CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} @@ -1634,17 +1612,18 @@ public:  protected:    void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);  }; -} // anonymous + +} // end anonymous namespace  namespace llvm {  std::unique_ptr<ScheduleDAGMutation>  createCopyConstrainDAGMutation(const TargetInstrInfo *TII, -                             const TargetRegisterInfo *TRI) { -  return make_unique<CopyConstrain>(TII, TRI); +                               const TargetRegisterInfo *TRI) { +  return llvm::make_unique<CopyConstrain>(TII, TRI);  } -} // namespace llvm +} // end namespace llvm  /// constrainLocalCopy handles two possibilities:  /// 1) Local src: @@ -1836,7 +1815,7 @@ void SchedBoundary::reset() {    CheckPending = false;    CurrCycle = 0;    CurrMOps = 0; -  MinReadyCycle = UINT_MAX; +  MinReadyCycle = std::numeric_limits<unsigned>::max();    ExpectedLatency = 0;    DependentLatency = 0;    RetiredMOps = 0; @@ -1937,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) {        && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {      return true;    } +    unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());    if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {      DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="            << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');      return true;    } + +  if (CurrMOps > 0 && +      ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || +       (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { +    DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must " +                 << (isTop()? "begin" : "end") << " group\n"); +    return true; +  } +    if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {      const MCSchedClassDesc *SC = DAG->getSchedClass(SU);      for (TargetSchedModel::ProcResIter @@ -2039,7 +2028,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {  /// Move the boundary of scheduled code by one cycle.  
void SchedBoundary::bumpCycle(unsigned NextCycle) {    if (SchedModel->getMicroOpBufferSize() == 0) { -    assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); +    assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && +           "MinReadyCycle uninitialized");      if (MinReadyCycle > NextCycle)        NextCycle = MinReadyCycle;    } @@ -2237,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) {    // one cycle.  Since we commonly reach the max MOps here, opportunistically    // bump the cycle to avoid uselessly checking everything in the readyQ.    CurrMOps += IncMOps; + +  // Bump the cycle count for issue group constraints. +  // This must be done after NextCycle has been adjust for all other stalls. +  // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set +  // currCycle to X. +  if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) || +      (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { +    DEBUG(dbgs() << "  Bump cycle to " +                 << (isTop() ? "end" : "begin") << " group\n"); +    bumpCycle(++NextCycle); +  } +    while (CurrMOps >= SchedModel->getIssueWidth()) {      DEBUG(dbgs() << "  *** Max MOps " << CurrMOps            << " at cycle " << CurrCycle << '\n'); @@ -2250,7 +2252,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {  void SchedBoundary::releasePending() {    // If the available queue is empty, it is safe to reset MinReadyCycle.    if (Available.empty()) -    MinReadyCycle = UINT_MAX; +    MinReadyCycle = std::numeric_limits<unsigned>::max();    // Check to see if any of the pending instructions are ready to issue.  If    // so, add them to the available queue. @@ -2323,10 +2325,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {    return nullptr;  } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  // This is useful information to dump after bumpNode.  // Note that the Queue contents are more useful before pickNodeFromQueue. -void SchedBoundary::dumpScheduledState() { +LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() {    unsigned ResFactor;    unsigned ResCount;    if (ZoneCritResIdx) { @@ -2666,11 +2668,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,  }  void GenericScheduler::dumpPolicy() { +  // Cannot completely remove virtual function even in release mode. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)    dbgs() << "GenericScheduler RegionPolicy: "           << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure           << " OnlyTopDown=" << RegionPolicy.OnlyTopDown           << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp           << "\n"; +#endif  }  /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic @@ -2724,7 +2729,7 @@ void GenericScheduler::registerRoots() {      errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";    } -  if (EnableCyclicPath) { +  if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {      Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();      checkAcyclicLatency();    } @@ -3106,7 +3111,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {  }  void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { -    MachineBasicBlock::iterator InsertPos = SU->getInstr();    if (!isTop)      ++InsertPos; @@ -3154,7 +3158,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {  /// Create the standard converging machine scheduler. This will be used as the  /// default scheduler if the target does not set a default.  
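// The checkHazard and bumpNode changes above teach the scheduling zone about
// instructions that must begin or end an issue group. A stripped-down model
// of that bookkeeping for the top-down direction only (issue width and the
// group flags are plain parameters here; the real scheduler reads them from
// the machine model via mustBeginGroup/mustEndGroup):
struct IssueZoneModel {
  unsigned IssueWidth = 4;
  unsigned CurrCycle = 0;
  unsigned CurrMOps = 0; // micro-ops already issued in the current cycle

  void bumpCycle() {
    ++CurrCycle;
    CurrMOps = 0;
  }

  // Issue one instruction carrying MOps micro-ops.
  void issue(unsigned MOps, bool MustBeginGroup, bool MustEndGroup) {
    // Hazard: a "begin group" instruction cannot share a partially filled
    // cycle, so abandon that cycle and start fresh.
    if (CurrMOps > 0 && MustBeginGroup)
      bumpCycle();
    CurrMOps += MOps;
    // Ending a group, or filling the issue width, closes the current cycle.
    if (MustEndGroup || CurrMOps >= IssueWidth)
      bumpCycle();
  }
};

// With IssueWidth 4, a 1-uop instruction followed by a must-begin-group
// instruction ends up in cycle 1 instead of being packed into cycle 0.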
ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { -  ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)); +  ScheduleDAGMILive *DAG = +      new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));    // Register DAG post-processors.    //    // FIXME: extend the mutation API to allow earlier mutations to instantiate @@ -3195,7 +3200,6 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {    }  } -  void PostGenericScheduler::registerRoots() {    Rem.CriticalPath = DAG->ExitSU.getDepth(); @@ -3302,7 +3306,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {  }  ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { -  return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), +  return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C),                             /*RemoveKillFlags=*/true);  } @@ -3311,14 +3315,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {  //===----------------------------------------------------------------------===//  namespace { +  /// \brief Order nodes by the ILP metric.  struct ILPOrder { -  const SchedDFSResult *DFSResult; -  const BitVector *ScheduledTrees; +  const SchedDFSResult *DFSResult = nullptr; +  const BitVector *ScheduledTrees = nullptr;    bool MaximizeILP; -  ILPOrder(bool MaxILP) -    : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {} +  ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}    /// \brief Apply a less-than relation on node priority.    /// @@ -3347,12 +3351,13 @@ struct ILPOrder {  /// \brief Schedule based on the ILP metric.  class ILPScheduler : public MachineSchedStrategy { -  ScheduleDAGMILive *DAG; +  ScheduleDAGMILive *DAG = nullptr;    ILPOrder Cmp;    std::vector<SUnit*> ReadyQ; +  public: -  ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {} +  ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}    void initialize(ScheduleDAGMI *dag) override {      assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); @@ -3405,14 +3410,16 @@ public:      std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);    }  }; -} // namespace + +} // end anonymous namespace  static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { -  return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true)); +  return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true));  }  static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { -  return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false)); +  return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false));  } +  static MachineSchedRegistry ILPMaxRegistry(    "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);  static MachineSchedRegistry ILPMinRegistry( @@ -3424,6 +3431,7 @@ static MachineSchedRegistry ILPMinRegistry(  #ifndef NDEBUG  namespace { +  /// Apply a less-than relation on the node order, which corresponds to the  /// instruction order prior to scheduling. IsReverse implements greater-than.  template<bool IsReverse> @@ -3444,11 +3452,12 @@ class InstructionShuffler : public MachineSchedStrategy {    // Using a less-than relation (SUnitOrder<false>) for the TopQ priority    // gives nodes with a higher number higher priority causing the latest    // instructions to be scheduled first. 
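// The comment above leans on how a priority queue interacts with its
// comparator: with a plain less-than on node numbers the largest number sits
// at the top, so the latest instruction pops first when scheduling top-down.
// A standalone demonstration with unsigned values standing in for SUnit node
// numbers:
#include <cstdio>
#include <queue>
#include <vector>

template <bool IsReverse> struct NodeNumOrder {
  bool operator()(unsigned A, unsigned B) const {
    return IsReverse ? A > B : A < B;
  }
};

int main() {
  // Top-down queue: less-than comparator, highest node number pops first.
  std::priority_queue<unsigned, std::vector<unsigned>, NodeNumOrder<false>> TopQ;
  // Bottom-up queue: greater-than comparator, lowest node number pops first.
  std::priority_queue<unsigned, std::vector<unsigned>, NodeNumOrder<true>> BottomQ;
  for (unsigned N : {3u, 1u, 2u}) {
    TopQ.push(N);
    BottomQ.push(N);
  }
  std::printf("TopQ pops %u, BottomQ pops %u\n", TopQ.top(), BottomQ.top());
  // Prints: TopQ pops 3, BottomQ pops 1
  return 0;
}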
-  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > +  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>      TopQ;    // When scheduling bottom-up, use greater-than as the queue priority. -  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > +  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>      BottomQ; +  public:    InstructionShuffler(bool alternate, bool topdown)      : IsAlternating(alternate), IsTopDown(topdown) {} @@ -3492,15 +3501,18 @@ public:      BottomQ.push(SU);    }  }; -} // namespace + +} // end anonymous namespace  static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {    bool Alternate = !ForceTopDown && !ForceBottomUp;    bool TopDown = !ForceBottomUp;    assert((TopDown || !ForceTopDown) &&           "-misched-topdown incompatible with -misched-bottomup"); -  return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown)); +  return new ScheduleDAGMILive( +      C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown));  } +  static MachineSchedRegistry ShufflerRegistry(    "shuffle", "Shuffle machine instructions alternating directions",    createInstructionShuffler); @@ -3518,8 +3530,7 @@ template<> struct GraphTraits<  template<>  struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { - -  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} +  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}    static std::string getGraphName(const ScheduleDAG *G) {      return G->MF.getName(); @@ -3576,7 +3587,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {      return Str;    }  }; -} // namespace llvm + +} // end namespace llvm  #endif // NDEBUG  /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index ef7e525e8165..998a9645e68b 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -1,4 +1,4 @@ -//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===// +//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -7,21 +7,35 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h"  #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/SparseSet.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/Format.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include 
<tuple> +#include <utility>  using namespace llvm; @@ -37,9 +51,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)  INITIALIZE_PASS_END(MachineTraceMetrics,                    "machine-trace-metrics", "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() -  : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr), -    MRI(nullptr), Loops(nullptr) { +MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {    std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);  } @@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {    return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);  } -  //===----------------------------------------------------------------------===//  //                         Ensemble utility functions  //===----------------------------------------------------------------------===// @@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)  }  // Virtual destructor serves as an anchor. -MachineTraceMetrics::Ensemble::~Ensemble() {} +MachineTraceMetrics::Ensemble::~Ensemble() = default;  const MachineLoop*  MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { @@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {  // MinInstrCountEnsemble - Pick the trace that executes the least number of  // instructions.  namespace { +  class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {    const char *getName() const override { return "MinInstr"; }    const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override; @@ -306,7 +318,8 @@ public:    MinInstrCountEnsemble(MachineTraceMetrics *mtm)      : MachineTraceMetrics::Ensemble(mtm) {}  }; -} + +} // end anonymous namespace  // Select the preferred predecessor for MBB.  const MachineBasicBlock* @@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const {  // revisit blocks.  namespace { +  struct LoopBounds {    MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;    SmallPtrSet<const MachineBasicBlock*, 8> Visited;    const MachineLoopInfo *Loops; -  bool Downward; +  bool Downward = false; +    LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks, -             const MachineLoopInfo *loops) -    : Blocks(blocks), Loops(loops), Downward(false) {} +             const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}  }; -} + +} // end anonymous namespace  // Specialize po_iterator_storage in order to prune the post-order traversal so  // it is limited to the current loop and doesn't traverse the loop back edges.  namespace llvm { +  template<>  class po_iterator_storage<LoopBounds, true> {    LoopBounds &LB; +  public:    po_iterator_storage(LoopBounds &lb) : LB(lb) {} +    void finishPostorder(const MachineBasicBlock*) {}    bool insertEdge(Optional<const MachineBasicBlock *> From, @@ -452,7 +470,8 @@ public:      return LB.Visited.insert(To).second;    }  }; -} + +} // end namespace llvm  /// Compute the trace through MBB.  void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { @@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const {  // A data dependency is represented as a defining MI and operand numbers on the  // defining and using MI.  
namespace { +  struct DataDep {    const MachineInstr *DefMI;    unsigned DefOp; @@ -622,7 +642,8 @@ struct DataDep {      assert((++DefI).atEnd() && "Register has multiple defs");    }  }; -} + +} // end anonymous namespace  // Get the input data dependencies that must be ready before UseMI can issue.  // Return true if UseMI has any physreg operands. @@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI,  // direction instructions are scanned, it could be the operand that defined the  // regunit, or the highest operand to read the regunit.  namespace { +  struct LiveRegUnit {    unsigned RegUnit; -  unsigned Cycle; -  const MachineInstr *MI; -  unsigned Op; +  unsigned Cycle = 0; +  const MachineInstr *MI = nullptr; +  unsigned Op = 0;    unsigned getSparseSetIndex() const { return RegUnit; } -  LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {} +  LiveRegUnit(unsigned RU) : RegUnit(RU) {}  }; -} + +} // end anonymous namespace  // Identify physreg dependencies for UseMI, and update the live regunit  // tracking set when scanning instructions downwards. @@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,    return Height;  } -  typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;  // Push the height of DefMI upwards if required to match UseMI. diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index a98139f9e5af..d392c044bd71 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -260,8 +260,8 @@ namespace {      static char ID; // Pass ID, replacement for typeid      const std::string Banner; -    MachineVerifierPass(const std::string &banner = nullptr) -      : MachineFunctionPass(ID), Banner(banner) { +    MachineVerifierPass(std::string banner = std::string()) +      : MachineFunctionPass(ID), Banner(std::move(banner)) {          initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());        } @@ -528,7 +528,8 @@ void MachineVerifier::visitMachineFunctionBefore() {    lastIndex = SlotIndex();    regsReserved = MRI->getReservedRegs(); -  markReachable(&MF->front()); +  if (!MF->empty()) +    markReachable(&MF->front());    // Build a set of the basic blocks in the function.    FunctionBlocks.clear(); @@ -548,7 +549,8 @@ void MachineVerifier::visitMachineFunctionBefore() {    // Check that the register use lists are sane.    MRI->verifyUseLists(); -  verifyStackFrame(); +  if (!MF->empty()) +    verifyStackFrame();  }  // Does iterator point to a and b as the first two elements? @@ -572,7 +574,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {      for (const auto &LI : MBB->liveins()) {        if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&            MBB->getIterator() != MBB->getParent()->begin()) { -        report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); +        report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB);        }      }    } @@ -908,6 +910,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {      }    } +  // Generic loads and stores must have a single MachineMemOperand +  // describing that access. 
+  if ((MI->getOpcode() == TargetOpcode::G_LOAD || +       MI->getOpcode() == TargetOpcode::G_STORE) && +      !MI->hasOneMemOperand()) +    report("Generic instruction accessing memory must have one mem operand", +           MI); +    StringRef ErrorInfo;    if (!TII->verifyInstruction(*MI, ErrorInfo))      report(ErrorInfo.data(), MI); @@ -2047,23 +2057,14 @@ void MachineVerifier::verifyStackFrame() {      // Update stack state by checking contents of MBB.      for (const auto &I : *MBB) {        if (I.getOpcode() == FrameSetupOpcode) { -        // The first operand of a FrameOpcode should be i32. -        int Size = I.getOperand(0).getImm(); -        assert(Size >= 0 && -          "Value should be non-negative in FrameSetup and FrameDestroy.\n"); -          if (BBState.ExitIsSetup)            report("FrameSetup is after another FrameSetup", &I); -        BBState.ExitValue -= Size; +        BBState.ExitValue -= TII->getFrameSize(I);          BBState.ExitIsSetup = true;        }        if (I.getOpcode() == FrameDestroyOpcode) { -        // The first operand of a FrameOpcode should be i32. -        int Size = I.getOperand(0).getImm(); -        assert(Size >= 0 && -          "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - +        int Size = TII->getFrameSize(I);          if (!BBState.ExitIsSetup)            report("FrameDestroy is not after a FrameSetup", &I);          int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue : diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp index ad9166f1ed23..00e72971a01e 100644 --- a/lib/CodeGen/PatchableFunction.cpp +++ b/lib/CodeGen/PatchableFunction.cpp @@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {                   .addImm(FirstActualI->getOpcode());    for (auto &MO : FirstActualI->operands()) -    MIB.addOperand(MO); +    MIB.add(MO);    FirstActualI->eraseFromParent();    MF.ensureAlignment(4); diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 6081916a6a82..61dccdde8f1d 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() {  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  /// dumpSchedule - dump the scheduled Sequence. 
-void SchedulePostRATDList::dumpSchedule() const { +LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {    for (unsigned i = 0, e = Sequence.size(); i != e; i++) {      if (SUnit *SU = Sequence[i])        SU->dump(this); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 5fca7fa5536b..1354009794cb 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -265,11 +265,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {    std::vector<MachineBasicBlock::iterator> FrameSDOps;    for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)      for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) -      if (I->getOpcode() == FrameSetupOpcode || -          I->getOpcode() == FrameDestroyOpcode) { -        assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" -               " instructions should have a single immediate argument!"); -        unsigned Size = I->getOperand(0).getImm(); +      if (TII.isFrameInstr(*I)) { +        unsigned Size = TII.getFrameSize(*I);          if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;          AdjustsStack = true;          FrameSDOps.push_back(I); @@ -336,7 +333,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,      return;    const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); -  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); +  const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();    std::vector<CalleeSavedInfo> CSI;    for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1049,8 +1046,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,    const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();    const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();    const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); -  unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); -  unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();    if (RS && FrameIndexEliminationScavenging)      RS->enterBasicBlock(*BB); @@ -1059,11 +1054,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { -    if (I->getOpcode() == FrameSetupOpcode || -        I->getOpcode() == FrameDestroyOpcode) { -      InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); +    if (TII.isFrameInstr(*I)) { +      InsideCallSequence = TII.isFrameSetup(*I);        SPAdj += TII.getSPAdjust(*I); -        I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);        continue;      } @@ -1237,4 +1230,6 @@ doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {          ++I;      }    } + +  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);  } diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp index 804a4c3dad66..b29e62bf1aa3 100644 --- a/lib/CodeGen/PseudoSourceValue.cpp +++ b/lib/CodeGen/PseudoSourceValue.cpp @@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {}  PseudoSourceValue::~PseudoSourceValue() {}  void PseudoSourceValue::printCustom(raw_ostream &O) const { -  O << PSVNames[Kind]; +  if (Kind < TargetCustom) +    O << PSVNames[Kind]; +  else +    O << "TargetCustom" << Kind;  }  bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 
a558e371ad4c..a87fed3a687e 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -176,8 +176,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {      LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);      Q.collectInterferingVRegs(); -    if (Q.seenUnspillableVReg()) -      return false;      for (unsigned i = Q.interferingVRegs().size(); i; --i) {        LiveInterval *Intf = Q.interferingVRegs()[i - 1];        if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index c47cfb1b986f..06500289c971 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -29,8 +29,10 @@  #include "llvm/CodeGen/LiveStackAnalysis.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/Passes.h"  #include "llvm/CodeGen/RegAllocRegistry.h" @@ -125,6 +127,7 @@ class RAGreedy : public MachineFunctionPass,    MachineBlockFrequencyInfo *MBFI;    MachineDominatorTree *DomTree;    MachineLoopInfo *Loops; +  MachineOptimizationRemarkEmitter *ORE;    EdgeBundles *Bundles;    SpillPlacement *SpillPlacer;    LiveDebugVariables *DebugVars; @@ -419,6 +422,20 @@ private:    void collectHintInfo(unsigned, HintsInfo &);    bool isUnusedCalleeSavedReg(unsigned PhysReg) const; + +  /// Compute and report the number of spills and reloads for a loop. +  void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, +                                    unsigned &FoldedReloads, unsigned &Spills, +                                    unsigned &FoldedSpills); + +  /// Report the number of spills and reloads for each loop. +  void reportNumberOfSplillsReloads() { +    for (MachineLoop *L : *Loops) { +      unsigned Reloads, FoldedReloads, Spills, FoldedSpills; +      reportNumberOfSplillsReloads(L, Reloads, FoldedReloads, Spills, +                                   FoldedSpills); +    } +  }  };  } // end anonymous namespace @@ -439,6 +456,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)  INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)  INITIALIZE_PASS_DEPENDENCY(EdgeBundles)  INITIALIZE_PASS_DEPENDENCY(SpillPlacement) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)  INITIALIZE_PASS_END(RAGreedy, "greedy",                  "Greedy Register Allocator", false, false) @@ -490,6 +508,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {    AU.addPreserved<LiveRegMatrix>();    AU.addRequired<EdgeBundles>();    AU.addRequired<SpillPlacement>(); +  AU.addRequired<MachineOptimizationRemarkEmitterPass>();    MachineFunctionPass::getAnalysisUsage(AU);  } @@ -679,7 +698,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {      MCRegUnitIterator Units(PhysReg, TRI);      for (; Units.isValid(); ++Units) {        // Instantiate a "subquery", not to be confused with the Queries array. 
-      LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]); +      LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);        if (subQ.checkInterference())          break;      } @@ -830,7 +849,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,    SmallVector<LiveInterval*, 8> Intfs;    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {      LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); -    assert(Q.seenAllInterferences() && "Didn't check all interfererences."); +    // We usually have the interfering VRegs cached so collectInterferingVRegs() +    // should be fast, we may need to recalculate if when different physregs +    // overlap the same register unit so we had different SubRanges queried +    // against it. +    Q.collectInterferingVRegs();      ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();      Intfs.append(IVR.begin(), IVR.end());    } @@ -2611,6 +2634,69 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,    return 0;  } +void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, +                                            unsigned &FoldedReloads, +                                            unsigned &Spills, +                                            unsigned &FoldedSpills) { +  Reloads = 0; +  FoldedReloads = 0; +  Spills = 0; +  FoldedSpills = 0; + +  // Sum up the spill and reloads in subloops. +  for (MachineLoop *SubLoop : *L) { +    unsigned SubReloads; +    unsigned SubFoldedReloads; +    unsigned SubSpills; +    unsigned SubFoldedSpills; + +    reportNumberOfSplillsReloads(SubLoop, SubReloads, SubFoldedReloads, +                                 SubSpills, SubFoldedSpills); +    Reloads += SubReloads; +    FoldedReloads += SubFoldedReloads; +    Spills += SubSpills; +    FoldedSpills += SubFoldedSpills; +  } + +  const MachineFrameInfo &MFI = MF->getFrameInfo(); +  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); +  int FI; + +  for (MachineBasicBlock *MBB : L->getBlocks()) +    // Handle blocks that were not included in subloops. 
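// The helper being defined here recurses into each subloop first, folds those
// counts in, and then (on the lines that follow) scans only the blocks whose
// innermost loop is the current one, so nothing is counted twice. The same
// accumulation pattern over a toy loop tree; the types and per-block counters
// are invented for the sketch and carry none of the stack-slot checks:
#include <vector>

struct ToyBlock {
  unsigned Reloads = 0;
  unsigned Spills = 0;
};

struct ToyLoop {
  std::vector<ToyLoop> SubLoops;
  std::vector<ToyBlock> OwnBlocks; // blocks not contained in any subloop
};

// Accumulates counts for L and everything nested inside it.
void countSpillsReloads(const ToyLoop &L, unsigned &Reloads, unsigned &Spills) {
  Reloads = 0;
  Spills = 0;
  // Sum up the subloops first...
  for (const ToyLoop &Sub : L.SubLoops) {
    unsigned SubReloads, SubSpills;
    countSpillsReloads(Sub, SubReloads, SubSpills);
    Reloads += SubReloads;
    Spills += SubSpills;
  }
  // ...then add the blocks owned directly by this loop.
  for (const ToyBlock &B : L.OwnBlocks) {
    Reloads += B.Reloads;
    Spills += B.Spills;
  }
}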
+    if (Loops->getLoopFor(MBB) == L) +      for (MachineInstr &MI : *MBB) { +        const MachineMemOperand *MMO; + +        if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) +          ++Reloads; +        else if (TII->hasLoadFromStackSlot(MI, MMO, FI) && +                 MFI.isSpillSlotObjectIndex(FI)) +          ++FoldedReloads; +        else if (TII->isStoreToStackSlot(MI, FI) && +                 MFI.isSpillSlotObjectIndex(FI)) +          ++Spills; +        else if (TII->hasStoreToStackSlot(MI, MMO, FI) && +                 MFI.isSpillSlotObjectIndex(FI)) +          ++FoldedSpills; +      } + +  if (Reloads || FoldedReloads || Spills || FoldedSpills) { +    using namespace ore; +    MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload", +                                      L->getStartLoc(), L->getHeader()); +    if (Spills) +      R << NV("NumSpills", Spills) << " spills "; +    if (FoldedSpills) +      R << NV("NumFoldedSpills", FoldedSpills) << " folded spills "; +    if (Reloads) +      R << NV("NumReloads", Reloads) << " reloads "; +    if (FoldedReloads) +      R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads "; +    ORE->emit(R << "generated in loop"); +  } +} +  bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {    DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"                 << "********** Function: " << mf.getName() << '\n'); @@ -2633,6 +2719,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {    Indexes = &getAnalysis<SlotIndexes>();    MBFI = &getAnalysis<MachineBlockFrequencyInfo>();    DomTree = &getAnalysis<MachineDominatorTree>(); +  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();    SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));    Loops = &getAnalysis<MachineLoopInfo>();    Bundles = &getAnalysis<EdgeBundles>(); @@ -2658,6 +2745,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {    allocatePhysRegs();    tryHintsRecoloring();    postOptimization(); +  reportNumberOfSplillsReloads();    releaseMemory();    return true; diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 101b30bf3b65..3b5964eef55e 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -1,4 +1,4 @@ -//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===// +//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -29,34 +29,61 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegAllocPBQP.h"  #include "RegisterCoalescer.h"  #include "Spiller.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervalAnalysis.h"  #include "llvm/CodeGen/LiveRangeEdit.h"  #include "llvm/CodeGen/LiveStackAnalysis.h"  #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"  #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineFunctionPass.h"  #include "llvm/CodeGen/MachineLoopInfo.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" 
+#include "llvm/CodeGen/PBQP/Graph.h" +#include "llvm/CodeGen/PBQP/Solution.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" +#include "llvm/CodeGen/RegAllocPBQP.h"  #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/SlotIndexes.h"  #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h"  #include "llvm/IR/Module.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/FileSystem.h"  #include "llvm/Support/Printable.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef>  #include <limits> +#include <map>  #include <memory>  #include <queue>  #include <set>  #include <sstream> +#include <string> +#include <system_error> +#include <tuple>  #include <vector> +#include <utility>  using namespace llvm; @@ -86,7 +113,6 @@ namespace {  /// Programming problems.  class RegAllocPBQP : public MachineFunctionPass {  public: -    static char ID;    /// Construct a PBQP register allocator. @@ -113,7 +139,6 @@ public:    }  private: -    typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;    typedef std::vector<const LiveInterval*> Node2LIMap;    typedef std::vector<unsigned> AllowedSet; @@ -187,7 +212,6 @@ public:  /// @brief Add interference edges between overlapping vregs.  class Interference : public PBQPRAConstraint {  private: -    typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;    typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;    typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache; @@ -276,7 +300,6 @@ private:    }  public: -    void apply(PBQPRAGraph &G) override {      // The following is loosely based on the linear scan algorithm introduced in      // "Linear Scan Register Allocation" by Poletto and Sarkar. This version @@ -363,7 +386,6 @@ public:    }  private: -    // Create an Interference edge and add it to the graph, unless it is    // a null matrix, meaning the nodes' allowed registers do not have any    // interference. This case occurs frequently between integer and floating @@ -372,7 +394,6 @@ private:    bool createInterferenceEdge(PBQPRAGraph &G,                                PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,                                IMatrixCache &C) { -      const TargetRegisterInfo &TRI =          *G.getMetadata().MF.getSubtarget().getRegisterInfo();      const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs(); @@ -409,7 +430,6 @@ private:    }  }; -  class Coalescing : public PBQPRAConstraint {  public:    void apply(PBQPRAGraph &G) override { @@ -421,7 +441,6 @@ public:      // gives the Ok.      for (const auto &MBB : MF) {        for (const auto &MI : MBB) { -          // Skip not-coalescable or already coalesced copies.          if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())            continue; @@ -479,7 +498,6 @@ public:    }  private: -    void addVirtRegCoalesce(                      PBQPRAGraph::RawMatrix &CostMat,                      const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1, @@ -496,14 +514,15 @@ private:        }      }    } -  }; -} // End anonymous namespace. +} // end anonymous namespace  // Out-of-line destructor/anchor for PBQPRAConstraint. 
-PBQPRAConstraint::~PBQPRAConstraint() {} +PBQPRAConstraint::~PBQPRAConstraint() = default; +  void PBQPRAConstraint::anchor() {} +  void PBQPRAConstraintList::anchor() {}  void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { @@ -554,7 +573,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,  static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,                                     const MachineFunction &MF) { -  const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); +  const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();    for (unsigned i = 0; CSR[i] != 0; ++i)      if (TRI.regsOverlap(reg, CSR[i]))        return true; @@ -777,7 +796,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {    // If there are non-empty intervals allocate them using pbqp.    if (!VRegsToAlloc.empty()) { -      const TargetSubtargetInfo &Subtarget = MF.getSubtarget();      std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =        llvm::make_unique<PBQPRAConstraintList>(); @@ -840,7 +858,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,    });  } -void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {    for (auto NId : nodeIds()) {      const Vector &Costs = getNodeCosts(NId);      assert(Costs.getLength() != 0 && "Empty vector in graph."); @@ -861,7 +880,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {    }  } -LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); } +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { +  dump(dbgs()); +} +#endif  void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {    OS << "graph {\n"; diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp index ece44c28e9ed..855aa37ff3c3 100644 --- a/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/lib/CodeGen/RegUsageInfoCollector.cpp @@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {    DEBUG(dbgs() << "Clobbered Registers: "); -  for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) -    if (MRI->isPhysRegModified(PReg, true)) -      RegMask[PReg / 32] &= ~(1u << PReg % 32); +  const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask(); +  auto SetRegAsDefined = [&RegMask] (unsigned Reg) { +    RegMask[Reg / 32] &= ~(1u << Reg % 32); +  }; +  // Scan all the physical registers. When a register is defined in the current +  // function set it and all the aliasing registers as defined in the regmask. +  for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) { +    // If a register is in the UsedPhysRegsMask set then mark it as defined. +    // All it's aliases will also be in the set, so we can skip setting +    // as defined all the aliases here. +    if (UsedPhysRegsMask.test(PReg)) { +      SetRegAsDefined(PReg); +      continue; +    } +    // If a register is defined by an instruction mark it as defined together +    // with all it's aliases. 
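// A note on the mask convention used by SetRegAsDefined above: call-preserved
// register masks store one bit per physical register, and a set bit means the
// register is preserved (not clobbered). Register Reg lives in word Reg / 32
// at bit Reg % 32, so clearing that bit with
//   RegMask[Reg / 32] &= ~(1u << Reg % 32);
// records Reg as clobbered by this function; readers of the mask, such as
// MachineOperand::clobbersPhysReg, test exactly that bit.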
+    if (!MRI->def_empty(PReg)) { +      for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) +        SetRegAsDefined(*AI); +    } +  }    if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {      const uint32_t *CallPreservedMask = diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 178fa18ac5a6..82a3bd9a0bd1 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -1,4 +1,4 @@ -//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===// +//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,12 +14,22 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/MC/MCRegisterInfo.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint>  using namespace llvm; @@ -29,8 +39,7 @@ static cl::opt<unsigned>  StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),           cl::desc("Limit all regclasses to N registers")); -RegisterClassInfo::RegisterClassInfo() -  : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {} +RegisterClassInfo::RegisterClassInfo() = default;  void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {    bool Update = false; @@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {    // Does this MF have different CSRs?    assert(TRI && "no register info set"); -  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); -  if (Update || CSR != CalleeSaved) { -    // Build a CSRNum map. Every CSR alias gets an entry pointing to the last + +  // Get the callee saved registers. +  const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs(); +  if (Update || CSR != CalleeSavedRegs) { +    // Build a CSRAlias map. Every CSR alias saves the last      // overlapping CSR. -    CSRNum.clear(); -    CSRNum.resize(TRI->getNumRegs(), 0); -    for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) -      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) -        CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... +    CalleeSavedAliases.resize(TRI->getNumRegs(), 0); +    for (const MCPhysReg *I = CSR; *I; ++I) +      for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) +        CalleeSavedAliases[*AI] = *I; +      Update = true;    } -  CalleeSaved = CSR; +  CalleeSavedRegs = CSR;    // Different reserved registers?    const BitVector &RR = MF->getRegInfo().getReservedRegs(); @@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {      unsigned Cost = TRI->getCostPerUse(PhysReg);      MinCost = std::min(MinCost, Cost); -    if (CSRNum[PhysReg]) +    if (CalleeSavedAliases[PhysReg])        // PhysReg aliases a CSR, save it for later.        
CSRAlias.push_back(PhysReg);      else { @@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {      }    }    RCI.NumRegs = N + CSRAlias.size(); -  assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); +  assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");    // CSR aliases go after the volatile registers, preserve the target's order.    for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { @@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {  unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {    const TargetRegisterClass *RC = nullptr;    unsigned NumRCUnits = 0; -  for (TargetRegisterInfo::regclass_iterator -         RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) { -    const int *PSetID = TRI->getRegClassPressureSets(*RI); +  for (const TargetRegisterClass *C : TRI->regclasses()) { +    const int *PSetID = TRI->getRegClassPressureSets(C);      for (; *PSetID != -1; ++PSetID) {        if ((unsigned)*PSetID == Idx)          break; @@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {      // Found a register class that counts against this pressure set.      // For efficiency, only compute the set order for the largest set. -    unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit; +    unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;      if (!RC || NUnits > NumRCUnits) { -      RC = *RI; +      RC = C;        NumRCUnits = NUnits;      }    } diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 4bb3c229afc5..bf44ee8453b6 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -22,6 +22,7 @@  #include "llvm/CodeGen/LiveRangeEdit.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h"  #include "llvm/CodeGen/MachineLoopInfo.h"  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/Passes.h" @@ -189,6 +190,9 @@ namespace {      /// This returns true if an interval was modified.      bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); +    /// We found a copy which can be moved to its less frequent predecessor. +    bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI); +      /// If the source of a copy is defined by a      /// trivial computation, replace the copy by rematerialize the definition.      bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, @@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,        VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);        assert(ASubValNo != nullptr); -      LaneBitmask AMask = SA.LaneMask; -      for (LiveInterval::SubRange &SB : IntB.subranges()) { -        LaneBitmask BMask = SB.LaneMask; -        LaneBitmask Common = BMask & AMask; -        if (Common.none()) -          continue; - -        DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask) -                      << " into " << PrintLaneMask(Common) << '\n'); -        LaneBitmask BRest = BMask & ~AMask; -        LiveInterval::SubRange *CommonRange; -        if (BRest.any()) { -          SB.LaneMask = BRest; -          DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) -                       << '\n'); -          // Duplicate SubRange for newly merged common stuff. 
-          CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); -        } else { -          // We van reuse the L SubRange. -          SB.LaneMask = Common; -          CommonRange = &SB; -        } -        LiveRange RangeCopy(SB, Allocator); - -        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx); -        assert(BSubValNo->def == CopyIdx); -        BSubValNo->def = ASubValNo->def; -        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo); -        AMask &= ~BMask; -      } -      if (AMask.any()) { -        DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n'); -        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask); -        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator); -        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo); -      } +      IntB.refineSubRanges(Allocator, SA.LaneMask, +          [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) { +        VNInfo *BSubValNo = SR.empty() +          ? SR.getNextValue(CopyIdx, Allocator) +          : SR.getVNInfoAt(CopyIdx); +        assert(BSubValNo != nullptr); +        addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo); +      });      }    } @@ -861,6 +837,184 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,    return true;  } +/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a +/// predecessor of BB2, and if B is not redefined on the way from A = B +/// in BB0 to B = A in BB2, then B = A in BB2 is partially redundant if the +/// execution goes through the path from BB0 to BB2. We may move B = A +/// to the predecessor that does not have such a reversed copy. +/// So we will transform the program from: +///   BB0:         BB1: +///      A = B;        ... +///       ...          ... +///     /     \      / +///             BB2: +///               ... +///               B = A; +/// +/// to: +/// +///   BB0:         BB1: +///      A = B;        ... +///       ...          B = A; +///     /     \       / +///             BB2: +///               ... +/// +/// A special case is when BB0 and BB2 are the same BB, which is the only +/// BB in a loop: +///   BB1: +///        ... +///   BB0/BB2:  ---- +///        B = A;   | +///        ...      | +///        A = B;   | +///          |------- +///          | +/// We may hoist B = A from BB0/BB2 to BB1. +/// +/// The major preconditions for correctly removing such a partial +/// redundancy are: +/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of +///    the PHI is defined by the reversed copy A = B in BB0. +/// 2. B is not referenced from the start of BB2 to B = A. +/// 3. B is not redefined from A = B to the end of BB0. +/// 4. BB1 has only one successor. +/// +/// 2 and 4 implicitly ensure B is not live at the end of BB1. +/// 4 guarantees BB2 is hotter than BB1, so we only ever move a copy to a +/// colder place, which not only prevents an endless loop but also makes +/// sure that moving the copy is beneficial. +bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, +                                                MachineInstr &CopyMI) { +  assert(!CP.isPhys()); +  if (!CopyMI.isFullCopy()) +    return false; + +  MachineBasicBlock &MBB = *CopyMI.getParent(); +  if (MBB.isEHPad()) +    return false; + +  if (MBB.pred_size() != 2) +    return false; + +  LiveInterval &IntA = +      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); +  LiveInterval &IntB = +      LIS->getInterval(CP.isFlipped() ?
CP.getSrcReg() : CP.getDstReg()); + +  // A is defined by PHI at the entry of MBB. +  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); +  VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx); +  assert(AValNo && !AValNo->isUnused() && "COPY source not live"); +  if (!AValNo->isPHIDef()) +    return false; + +  // No B is referenced before CopyMI in MBB. +  if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx)) +    return false; + +  // MBB has two predecessors: one contains A = B so no copy will be inserted +  // for it. The other one will have a copy moved from MBB. +  bool FoundReverseCopy = false; +  MachineBasicBlock *CopyLeftBB = nullptr; +  for (MachineBasicBlock *Pred : MBB.predecessors()) { +    VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred)); +    MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def); +    if (!DefMI || !DefMI->isFullCopy()) { +      CopyLeftBB = Pred; +      continue; +    } +    // Check DefMI is a reverse copy and it is in BB Pred. +    if (DefMI->getOperand(0).getReg() != IntA.reg || +        DefMI->getOperand(1).getReg() != IntB.reg || +        DefMI->getParent() != Pred) { +      CopyLeftBB = Pred; +      continue; +    } +    // If there is any other def of B after DefMI and before the end of Pred, +    // we need to keep the copy of B = A at the end of Pred if we remove +    // B = A from MBB. +    bool ValB_Changed = false; +    for (auto VNI : IntB.valnos) { +      if (VNI->isUnused()) +        continue; +      if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) { +        ValB_Changed = true; +        break; +      } +    } +    if (ValB_Changed) { +      CopyLeftBB = Pred; +      continue; +    } +    FoundReverseCopy = true; +  } + +  // If no reverse copy is found in predecessors, nothing to do. +  if (!FoundReverseCopy) +    return false; + +  // If CopyLeftBB is nullptr, it means every predecessor of MBB contains +  // reverse copy, CopyMI can be removed trivially if only IntA/IntB is updated. +  // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and +  // update IntA/IntB. +  // +  // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so +  // MBB is hotter than CopyLeftBB. +  if (CopyLeftBB && CopyLeftBB->succ_size() > 1) +    return false; + +  // Now ok to move copy. +  if (CopyLeftBB) { +    DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#" +                 << CopyLeftBB->getNumber() << '\t' << CopyMI); + +    // Insert new copy to CopyLeftBB. +    auto InsPos = CopyLeftBB->getFirstTerminator(); +    MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(), +                                      TII->get(TargetOpcode::COPY), IntB.reg) +                                  .addReg(IntA.reg); +    SlotIndex NewCopyIdx = +        LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot(); +    IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); +    for (LiveInterval::SubRange &SR : IntB.subranges()) +      SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); +  } else { +    DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#" +                 << MBB.getNumber() << '\t' << CopyMI); +  } + +  // Remove CopyMI. +  // Note: This is fine to remove the copy before updating the live-ranges. +  // While updating the live-ranges, we only look at slot indices and +  // never go back to the instruction. +  LIS->RemoveMachineInstrFromMaps(CopyMI); +  CopyMI.eraseFromParent(); + +  // Update the liveness. 
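// (What the liveness update below does: prune IntB's value that was defined
// at CopyIdx, collecting the slot indices where that value used to end, mark
// the value unused, and then extend IntB, and each of its subranges, back to
// those end points so the interval now describes the value flowing in from
// the predecessors rather than the deleted copy. Finally IntA is shrunk to
// its remaining uses.)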
+  SmallVector<SlotIndex, 8> EndPoints; +  VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead(); +  LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(), +                  &EndPoints); +  BValNo->markUnused(); +  // Extend IntB to the EndPoints of its original live interval. +  LIS->extendToIndices(IntB, EndPoints); + +  // Now, do the same for its subranges. +  for (LiveInterval::SubRange &SR : IntB.subranges()) { +    EndPoints.clear(); +    VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead(); +    assert(BValNo && "All sublanes should be live"); +    LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints); +    BValNo->markUnused(); +    LIS->extendToIndices(SR, EndPoints); +  } + +  // Finally, update the live-range of IntA. +  shrinkToUses(&IntA); +  return true; +} +  /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just  /// defining a subregister.  static bool definesFullReg(const MachineInstr &MI, unsigned Reg) { @@ -1290,7 +1444,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,      // If SrcReg wasn't read, it may still be the case that DstReg is live-in      // because SrcReg is a sub-register. -    if (DstInt && !Reads && SubIdx) +    if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue())        Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));      // Replace SrcReg with DstReg in all UseMI operands. @@ -1486,6 +1640,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {        }      } +    // Try and see if we can partially eliminate the copy by moving the copy to +    // its predecessor. +    if (!CP.isPartial() && !CP.isPhys()) +      if (removePartialRedundancy(CP, *CopyMI)) +        return true; +      // Otherwise, we are unable to join the intervals.      DEBUG(dbgs() << "\tInterference!\n");      Again = true;  // May be possible to coalesce later. @@ -1583,6 +1743,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {          return false;        }      } + +    // We must also check for overlaps with regmask clobbers. +    BitVector RegMaskUsable; +    if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) && +        !RegMaskUsable.test(DstReg)) { +      DEBUG(dbgs() << "\t\tRegMask interference\n"); +      return false; +    }    }    // Skip any value computations, we are not adding new values to the @@ -1636,14 +1804,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {            DEBUG(dbgs() << "\t\tInterference (read): " << *MI);            return false;          } - -        // We must also check for clobbers caused by regmasks. -        for (const auto &MO : MI->operands()) { -          if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) { -            DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI); -            return false; -          } -        }        }      } @@ -2738,39 +2898,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,                                            LaneBitmask LaneMask,                                            CoalescerPair &CP) {    BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); -  for (LiveInterval::SubRange &R : LI.subranges()) { -    LaneBitmask RMask = R.LaneMask; -    // LaneMask of subregisters common to subrange R and ToMerge. -    LaneBitmask Common = RMask & LaneMask; -    // There is nothing to do without common subregs. 
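The structural conditions that removePartialRedundancy (above) checks before touching any live ranges can be summarized in a small predicate. The helper below is purely illustrative, not part of the patch; the pass performs these tests inline, interleaved with the PHI and reverse-copy analysis described in its comment:

// Illustrative only: the CFG-shape part of the check.
static bool hasHoistableCopyShape(const MachineBasicBlock &MBB,
                                  const MachineBasicBlock *CopyLeftBB) {
  // The block holding B = A must be an ordinary join with exactly two
  // predecessors; EH landing pads are not touched.
  if (MBB.isEHPad() || MBB.pred_size() != 2)
    return false;
  // If one predecessor lacks the reverse copy, it may only have MBB as its
  // single successor; otherwise it could be hotter than MBB, and hoisting the
  // copy into it would be a pessimization (and could repeat forever).
  return !CopyLeftBB || CopyLeftBB->succ_size() == 1;
}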
-    if (Common.none()) -      continue; - -    DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into " -                 << PrintLaneMask(Common) << '\n'); -    // LaneMask of subregisters contained in the R range but not in ToMerge, -    // they have to split into their own subrange. -    LaneBitmask LRest = RMask & ~LaneMask; -    LiveInterval::SubRange *CommonRange; -    if (LRest.any()) { -      R.LaneMask = LRest; -      DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n'); -      // Duplicate SubRange for newly merged common stuff. -      CommonRange = LI.createSubRangeFrom(Allocator, Common, R); +  LI.refineSubRanges(Allocator, LaneMask, +      [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) { +    if (SR.empty()) { +      SR.assign(ToMerge, Allocator);      } else { -      // Reuse the existing range. -      R.LaneMask = Common; -      CommonRange = &R; +      // joinSubRegRange() destroys the merged range, so we need a copy. +      LiveRange RangeCopy(ToMerge, Allocator); +      joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);      } -    LiveRange RangeCopy(ToMerge, Allocator); -    joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); -    LaneMask &= ~RMask; -  } - -  if (LaneMask.any()) { -    DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n'); -    LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); -  } +  });  }  bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index fc84aebb14d7..c726edc88b41 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -1,4 +1,4 @@ -//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===// +//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -12,13 +12,37 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/CodeGen/LiveInterval.h"  #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <iterator> +#include <limits> +#include <utility> +#include <vector>  using namespace llvm; @@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,    }  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD  void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,                                const TargetRegisterInfo *TRI) { @@ -97,6 +122,7 @@ void RegPressureTracker::dump() const {    P.dump(TRI);  } +LLVM_DUMP_METHOD  
void PressureDiff::dump(const TargetRegisterInfo &TRI) const {    const char *sep = "";    for (const PressureChange &Change : *this) { @@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {    }    dbgs() << '\n';  } +#endif  void RegPressureTracker::increaseRegPressure(unsigned RegUnit,                                               LaneBitmask PreviousMask, @@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const {            MachineBasicBlock::const_iterator());  } -  SlotIndex RegPressureTracker::getCurrSlot() const {    MachineBasicBlock::const_iterator IdxPos =      skipDebugInstructionsForward(CurrPos, MBB->end()); @@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {  static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,                                 unsigned RegUnit) { -  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { +  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {      return Other.RegUnit == RegUnit;    });    if (I == RegUnits.end()) @@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,                          RegisterMaskPair Pair) {    unsigned RegUnit = Pair.RegUnit;    assert(Pair.LaneMask.any()); -  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { +  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {      return Other.RegUnit == RegUnit;    });    if (I == RegUnits.end()) { @@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,  static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,                         unsigned RegUnit) { -  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { +  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {      return Other.RegUnit == RegUnit;    });    if (I == RegUnits.end()) { @@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,                             RegisterMaskPair Pair) {    unsigned RegUnit = Pair.RegUnit;    assert(Pair.LaneMask.any()); -  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { +  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {      return Other.RegUnit == RegUnit;    });    if (I != RegUnits.end()) { @@ -423,6 +449,8 @@ namespace {  ///  /// FIXME: always ignore tied opers  class RegisterOperandsCollector { +  friend class llvm::RegisterOperands; +    RegisterOperands &RegOpers;    const TargetRegisterInfo &TRI;    const MachineRegisterInfo &MRI; @@ -517,11 +545,9 @@ class RegisterOperandsCollector {          addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll()));      }    } - -  friend class llvm::RegisterOperands;  }; -} // namespace +} // end anonymous namespace  void RegisterOperands::collect(const MachineInstr &MI,                                 const TargetRegisterInfo &TRI, @@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,    assert(Pair.LaneMask.any());    unsigned RegUnit = Pair.RegUnit; -  auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { +  auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {      return Other.RegUnit == RegUnit;    });    LaneBitmask PrevMask; @@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,          if (!TrackLaneMasks) {            addRegLanes(*LiveUses, 
RegisterMaskPair(Reg, NewMask));          } else { -          auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { -            return Other.RegUnit == Reg; -          }); +          auto I = +              llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { +                return Other.RegUnit == Reg; +              });            bool IsRedef = I != LiveUses->end();            if (IsRedef) {              // ignore re-defs here... @@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,        if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {          int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); -        if (CritInc > 0 && CritInc <= INT16_MAX) { +        if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {            Delta.CriticalMax = PressureChange(PSetID);            Delta.CriticalMax.setUnitInc(CritInc);          } diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index fdf741fd58f7..6392136fa290 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -1,4 +1,4 @@ -//===-- RegisterScavenging.cpp - Machine register scavenging --------------===// +//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -15,28 +15,32 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/CodeGen/RegisterScavenging.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCRegisterInfo.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <iterator> +#include <limits> +#include <string> +  using namespace llvm;  #define DEBUG_TYPE "reg-scavenging"  void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { -  for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { -    LaneBitmask UnitMask = (*RUI).second; -    if (UnitMask.none() || (LaneMask & UnitMask).any()) -      RegUnitsAvailable.reset((*RUI).first); -  } +  LiveUnits.addRegMasked(Reg, LaneMask);  }  void RegScavenger::init(MachineBasicBlock &MBB) { @@ -44,6 +48,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) {    TII = MF.getSubtarget().getInstrInfo();    TRI = MF.getSubtarget().getRegisterInfo();    MRI = &MF.getRegInfo(); +  LiveUnits.init(*TRI);    assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&           "Target changed?"); @@ -51,7 +56,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) {    // Self-initialize.    if (!this->MBB) {      NumRegUnits = TRI->getNumRegUnits(); -    RegUnitsAvailable.resize(NumRegUnits);      KillRegUnits.resize(NumRegUnits);      DefRegUnits.resize(NumRegUnits);      TmpRegUnits.resize(NumRegUnits); @@ -64,32 +68,17 @@ void RegScavenger::init(MachineBasicBlock &MBB) {      I->Restore = nullptr;    } -  // All register units start out unused. -  RegUnitsAvailable.set(); - -  // Pristine CSRs are not available. 
-  BitVector PR = MF.getFrameInfo().getPristineRegs(MF); -  for (int I = PR.find_first(); I>0; I = PR.find_next(I)) -    setRegUsed(I); -    Tracking = false;  } -void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) { -  for (const auto &LI : MBB.liveins()) -    setRegUsed(LI.PhysReg, LI.LaneMask); -} -  void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {    init(MBB); -  setLiveInsUsed(MBB); +  LiveUnits.addLiveIns(MBB);  }  void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {    init(MBB); -  // Merge live-ins of successors to get live-outs. -  for (const MachineBasicBlock *Succ : MBB.successors()) -    setLiveInsUsed(*Succ); +  LiveUnits.addLiveOuts(MBB);    // Move internal iterator at the last instruction of the block.    if (MBB.begin() != MBB.end()) { @@ -263,36 +252,7 @@ void RegScavenger::backward() {    assert(Tracking && "Must be tracking to determine kills and defs");    const MachineInstr &MI = *MBBI; -  // Defined or clobbered registers are available now. -  for (const MachineOperand &MO : MI.operands()) { -    if (MO.isRegMask()) { -      for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; -           ++RU) { -        for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) { -          if (MO.clobbersPhysReg(*RURI)) { -            RegUnitsAvailable.set(RU); -            break; -          } -        } -      } -    } else if (MO.isReg() && MO.isDef()) { -      unsigned Reg = MO.getReg(); -      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || -          isReserved(Reg)) -        continue; -      addRegUnits(RegUnitsAvailable, Reg); -    } -  } -  // Mark read registers as unavailable. -  for (const MachineOperand &MO : MI.uses()) { -    if (MO.isReg() && MO.readsReg()) { -      unsigned Reg = MO.getReg(); -      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || -          isReserved(Reg)) -        continue; -      removeRegUnits(RegUnitsAvailable, Reg); -    } -  } +  LiveUnits.stepBackward(MI);    if (MBBI == MBB->begin()) {      MBBI = MachineBasicBlock::iterator(nullptr); @@ -302,12 +262,9 @@ void RegScavenger::backward() {  }  bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const { -  if (includeReserved && isReserved(Reg)) -    return true; -  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) -    if (!RegUnitsAvailable.test(*RUI)) -      return true; -  return false; +  if (isReserved(Reg)) +    return includeReserved; +  return !LiveUnits.available(Reg);  }  unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { @@ -441,7 +398,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,    unsigned NeedSize = RC->getSize();    unsigned NeedAlign = RC->getAlignment(); -  unsigned SI = Scavenged.size(), Diff = UINT_MAX; +  unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();    int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();    for (unsigned I = 0; I < Scavenged.size(); ++I) {      if (Scavenged[I].Reg != 0) diff --git a/lib/CodeGen/ResetMachineFunctionPass.cpp b/lib/CodeGen/ResetMachineFunctionPass.cpp index 451964199ba5..3e259927ac5c 100644 --- a/lib/CodeGen/ResetMachineFunctionPass.cpp +++ b/lib/CodeGen/ResetMachineFunctionPass.cpp @@ -30,17 +30,23 @@ namespace {      /// Tells whether or not this pass should emit a fallback      /// diagnostic when it resets a function.      bool EmitFallbackDiag; +    /// Whether we should abort immediately instead of resetting the function. 
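The register scavenger hunks above replace the hand-maintained RegUnitsAvailable bitvector with a register-unit liveness tracker (the LiveUnits member) that is primed from block live-ins or live-outs and stepped across instructions. A minimal sketch of the backward pattern, assuming the tracker is LiveRegUnits with the init/addLiveOuts/stepBackward/available interface used above (the member's type is not visible in this hunk):

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"      // assumed header for the tracker
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetRegisterInfo.h"

// Hypothetical helper: is PhysReg free (no live register unit) at any point
// of MBB when scanning bottom-up, the way the scavenger now walks blocks?
static bool isFreeSomewhere(const llvm::MachineBasicBlock &MBB,
                            const llvm::TargetRegisterInfo &TRI,
                            unsigned PhysReg) {
  llvm::LiveRegUnits Units;
  Units.init(TRI);                      // size the set to the target's units
  Units.addLiveOuts(MBB);               // start from what is live out of MBB
  for (const llvm::MachineInstr &MI : llvm::reverse(MBB)) {
    Units.stepBackward(MI);             // defs die, uses become live again
    if (Units.available(PhysReg))       // no unit of PhysReg is live here
      return true;
  }
  return false;
}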
+    bool AbortOnFailedISel;    public:      static char ID; // Pass identification, replacement for typeid -    ResetMachineFunction(bool EmitFallbackDiag = false) -        : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {} +    ResetMachineFunction(bool EmitFallbackDiag = false, +                         bool AbortOnFailedISel = false) +        : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag), +          AbortOnFailedISel(AbortOnFailedISel) {}      StringRef getPassName() const override { return "ResetMachineFunction"; }      bool runOnMachineFunction(MachineFunction &MF) override {        if (MF.getProperties().hasProperty(                MachineFunctionProperties::Property::FailedISel)) { +        if (AbortOnFailedISel) +          report_fatal_error("Instruction selection failed");          DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n');          ++NumFunctionsReset;          MF.reset(); @@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,                  "reset machine function if ISel failed", false, false)  MachineFunctionPass * -llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) { -  return new ResetMachineFunction(EmitFallbackDiag); +llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false, +                                     bool AbortOnFailedISel = false) { +  return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel);  } diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 2b82df293c14..fa68411284e7 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -451,7 +451,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,    IRBuilder<> IRBFail(CheckTerm);    // FIXME: respect -fsanitize-trap / -ftrap-function here?    Constant *StackChkFail = F.getParent()->getOrInsertFunction( -      "__stack_chk_fail", IRB.getVoidTy(), nullptr); +      "__stack_chk_fail", IRB.getVoidTy());    IRBFail.CreateCall(StackChkFail, {});  } diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp index 7fbeaddb38e8..09289f947dc9 100644 --- a/lib/CodeGen/SafeStackColoring.cpp +++ b/lib/CodeGen/SafeStackColoring.cpp @@ -236,6 +236,7 @@ void StackColoring::calculateLiveIntervals() {    }  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {    dbgs() << "Allocas:\n";    for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) @@ -262,6 +263,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {      dbgs() << "  " << AllocaNo << ": " << Range << "\n";    }  } +#endif  void StackColoring::run() {    DEBUG(dumpAllocas()); diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index 427d95268c74..dc72ac073258 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -1,4 +1,4 @@ -//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===// +//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -7,22 +7,32 @@  //  //===----------------------------------------------------------------------===//  // -// This implements the ScheduleDAG class, which is a base class used by -// scheduling implementation classes. +/// \file Implements the ScheduleDAG class, which is a base class used by +/// scheduling implementation classes.  
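The same guard appears around the dump helpers in SafeStackColoring above and in RegisterPressure, RegAllocPBQP and ScheduleDAG elsewhere in this change: the bodies are only compiled in asserts builds or when LLVM_ENABLE_DUMP is defined. A generic sketch with an illustrative class name:

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// LLVM_DUMP_METHOD keeps dump() out of line and referenced, so it remains
// callable from a debugger even though nothing in the program calls it.
LLVM_DUMP_METHOD void Widget::dump() const { print(dbgs()); }
#endif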
//  //===----------------------------------------------------------------------===// +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/ScheduleDAG.h"  #include "llvm/CodeGen/ScheduleHazardRecognizer.h"  #include "llvm/CodeGen/SelectionDAGNodes.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> +  using namespace llvm;  #define DEBUG_TYPE "pre-RA-sched" @@ -33,58 +43,52 @@ static cl::opt<bool> StressSchedOpt(    cl::desc("Stress test instruction scheduling"));  #endif -void SchedulingPriorityQueue::anchor() { } +void SchedulingPriorityQueue::anchor() {}  ScheduleDAG::ScheduleDAG(MachineFunction &mf)      : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),        TRI(mf.getSubtarget().getRegisterInfo()), MF(mf), -      MRI(mf.getRegInfo()), EntrySU(), ExitSU() { +      MRI(mf.getRegInfo()) {  #ifndef NDEBUG    StressSched = StressSchedOpt;  #endif  } -ScheduleDAG::~ScheduleDAG() {} +ScheduleDAG::~ScheduleDAG() = default; -/// Clear the DAG state (e.g. between scheduling regions).  void ScheduleDAG::clearDAG() {    SUnits.clear();    EntrySU = SUnit();    ExitSU = SUnit();  } -/// getInstrDesc helper to handle SDNodes.  const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {    if (!Node || !Node->isMachineOpcode()) return nullptr;    return &TII->get(Node->getMachineOpcode());  } -/// addPred - This adds the specified edge as a pred of the current node if -/// not already.  It also adds the current node as a successor of the -/// specified node.  bool SUnit::addPred(const SDep &D, bool Required) {    // If this node already has this dependence, don't add a redundant one. -  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); -         I != E; ++I) { +  for (SDep &PredDep : Preds) {      // Zero-latency weak edges may be added purely for heuristic ordering. Don't      // add them if another kind of edge already exists. -    if (!Required && I->getSUnit() == D.getSUnit()) +    if (!Required && PredDep.getSUnit() == D.getSUnit())        return false; -    if (I->overlaps(D)) { -      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D). -      if (I->getLatency() < D.getLatency()) { -        SUnit *PredSU = I->getSUnit(); +    if (PredDep.overlaps(D)) { +      // Extend the latency if needed. Equivalent to +      // removePred(PredDep) + addPred(D). +      if (PredDep.getLatency() < D.getLatency()) { +        SUnit *PredSU = PredDep.getSUnit();          // Find the corresponding successor in N. 
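// (Each dependence edge is stored twice: once in this node's Preds list and
// once, with the SUnit pointer flipped, in the predecessor's Succs list. The
// search below locates that mirrored SDep so the increased latency is written
// to both copies.)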
-        SDep ForwardD = *I; +        SDep ForwardD = PredDep;          ForwardD.setSUnit(this); -        for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(), -               EE = PredSU->Succs.end(); II != EE; ++II) { -          if (*II == ForwardD) { -            II->setLatency(D.getLatency()); +        for (SDep &SuccDep : PredSU->Succs) { +          if (SuccDep == ForwardD) { +            SuccDep.setLatency(D.getLatency());              break;            }          } -        I->setLatency(D.getLatency()); +        PredDep.setLatency(D.getLatency());        }        return false;      } @@ -95,8 +99,10 @@ bool SUnit::addPred(const SDep &D, bool Required) {    SUnit *N = D.getSUnit();    // Update the bookkeeping.    if (D.getKind() == SDep::Data) { -    assert(NumPreds < UINT_MAX && "NumPreds will overflow!"); -    assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!"); +    assert(NumPreds < std::numeric_limits<unsigned>::max() && +           "NumPreds will overflow!"); +    assert(N->NumSuccs < std::numeric_limits<unsigned>::max() && +           "NumSuccs will overflow!");      ++NumPreds;      ++N->NumSuccs;    } @@ -105,7 +111,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {        ++WeakPredsLeft;      }      else { -      assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!"); +      assert(NumPredsLeft < std::numeric_limits<unsigned>::max() && +             "NumPredsLeft will overflow!");        ++NumPredsLeft;      }    } @@ -114,7 +121,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {        ++N->WeakSuccsLeft;      }      else { -      assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!"); +      assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() && +             "NumSuccsLeft will overflow!");        ++N->NumSuccsLeft;      }    } @@ -127,51 +135,46 @@ bool SUnit::addPred(const SDep &D, bool Required) {    return true;  } -/// removePred - This removes the specified edge as a pred of the current -/// node if it exists.  It also removes the current node as a successor of -/// the specified node.  void SUnit::removePred(const SDep &D) {    // Find the matching predecessor. -  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); -         I != E; ++I) -    if (*I == D) { -      // Find the corresponding successor in N. -      SDep P = D; -      P.setSUnit(this); -      SUnit *N = D.getSUnit(); -      SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P); -      assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); -      N->Succs.erase(Succ); -      Preds.erase(I); -      // Update the bookkeeping. 
-      if (P.getKind() == SDep::Data) { -        assert(NumPreds > 0 && "NumPreds will underflow!"); -        assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); -        --NumPreds; -        --N->NumSuccs; -      } -      if (!N->isScheduled) { -        if (D.isWeak()) -          --WeakPredsLeft; -        else { -          assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); -          --NumPredsLeft; -        } -      } -      if (!isScheduled) { -        if (D.isWeak()) -          --N->WeakSuccsLeft; -        else { -          assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); -          --N->NumSuccsLeft; -        } -      } -      if (P.getLatency() != 0) { -        this->setDepthDirty(); -        N->setHeightDirty(); -      } -      return; +  SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D); +  if (I == Preds.end()) +    return; +  // Find the corresponding successor in N. +  SDep P = D; +  P.setSUnit(this); +  SUnit *N = D.getSUnit(); +  SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P); +  assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); +  N->Succs.erase(Succ); +  Preds.erase(I); +  // Update the bookkeeping. +  if (P.getKind() == SDep::Data) { +    assert(NumPreds > 0 && "NumPreds will underflow!"); +    assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); +    --NumPreds; +    --N->NumSuccs; +  } +  if (!N->isScheduled) { +    if (D.isWeak()) +      --WeakPredsLeft; +    else { +      assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); +      --NumPredsLeft;      } +  } +  if (!isScheduled) { +    if (D.isWeak()) +      --N->WeakSuccsLeft; +    else { +      assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); +      --N->NumSuccsLeft; +    } +  } +  if (P.getLatency() != 0) { +    this->setDepthDirty(); +    N->setHeightDirty(); +  }  }  void SUnit::setDepthDirty() { @@ -181,9 +184,8 @@ void SUnit::setDepthDirty() {    do {      SUnit *SU = WorkList.pop_back_val();      SU->isDepthCurrent = false; -    for (SUnit::const_succ_iterator I = SU->Succs.begin(), -         E = SU->Succs.end(); I != E; ++I) { -      SUnit *SuccSU = I->getSUnit(); +    for (SDep &SuccDep : SU->Succs) { +      SUnit *SuccSU = SuccDep.getSUnit();        if (SuccSU->isDepthCurrent)          WorkList.push_back(SuccSU);      } @@ -197,18 +199,14 @@ void SUnit::setHeightDirty() {    do {      SUnit *SU = WorkList.pop_back_val();      SU->isHeightCurrent = false; -    for (SUnit::const_pred_iterator I = SU->Preds.begin(), -         E = SU->Preds.end(); I != E; ++I) { -      SUnit *PredSU = I->getSUnit(); +    for (SDep &PredDep : SU->Preds) { +      SUnit *PredSU = PredDep.getSUnit();        if (PredSU->isHeightCurrent)          WorkList.push_back(PredSU);      }    } while (!WorkList.empty());  } -/// setDepthToAtLeast - Update this node's successors to reflect the -/// fact that this node's depth just increased. -///  void SUnit::setDepthToAtLeast(unsigned NewDepth) {    if (NewDepth <= getDepth())      return; @@ -217,9 +215,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) {    isDepthCurrent = true;  } -/// setHeightToAtLeast - Update this node's predecessors to reflect the -/// fact that this node's height just increased. -///  void SUnit::setHeightToAtLeast(unsigned NewHeight) {    if (NewHeight <= getHeight())      return; @@ -228,8 +223,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) {    isHeightCurrent = true;  } -/// ComputeDepth - Calculate the maximal path from the node to the exit. 
-/// +/// Calculates the maximal path from the node to the exit.  void SUnit::ComputeDepth() {    SmallVector<SUnit*, 8> WorkList;    WorkList.push_back(this); @@ -238,12 +232,11 @@ void SUnit::ComputeDepth() {      bool Done = true;      unsigned MaxPredDepth = 0; -    for (SUnit::const_pred_iterator I = Cur->Preds.begin(), -         E = Cur->Preds.end(); I != E; ++I) { -      SUnit *PredSU = I->getSUnit(); +    for (const SDep &PredDep : Cur->Preds) { +      SUnit *PredSU = PredDep.getSUnit();        if (PredSU->isDepthCurrent)          MaxPredDepth = std::max(MaxPredDepth, -                                PredSU->Depth + I->getLatency()); +                                PredSU->Depth + PredDep.getLatency());        else {          Done = false;          WorkList.push_back(PredSU); @@ -261,8 +254,7 @@ void SUnit::ComputeDepth() {    } while (!WorkList.empty());  } -/// ComputeHeight - Calculate the maximal path from the node to the entry. -/// +/// Calculates the maximal path from the node to the entry.  void SUnit::ComputeHeight() {    SmallVector<SUnit*, 8> WorkList;    WorkList.push_back(this); @@ -271,12 +263,11 @@ void SUnit::ComputeHeight() {      bool Done = true;      unsigned MaxSuccHeight = 0; -    for (SUnit::const_succ_iterator I = Cur->Succs.begin(), -         E = Cur->Succs.end(); I != E; ++I) { -      SUnit *SuccSU = I->getSUnit(); +    for (const SDep &SuccDep : Cur->Succs) { +      SUnit *SuccSU = SuccDep.getSUnit();        if (SuccSU->isHeightCurrent)          MaxSuccHeight = std::max(MaxSuccHeight, -                                 SuccSU->Height + I->getLatency()); +                                 SuccSU->Height + SuccDep.getLatency());        else {          Done = false;          WorkList.push_back(SuccSU); @@ -310,6 +301,7 @@ void SUnit::biasCriticalPath() {  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {    if (this == &DAG->ExitSU)      OS << "ExitSU"; @@ -319,15 +311,13 @@ void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {      OS << "SU(" << NodeNum << ")";  } -/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or -/// a group of nodes flagged together. 
-void SUnit::dump(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const {    print(dbgs(), G);    dbgs() << ": ";    G->dumpNode(this);  } -void SUnit::dumpAll(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const {    dump(G);    dbgs() << "  # preds left       : " << NumPredsLeft << "\n"; @@ -343,41 +333,39 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {    if (Preds.size() != 0) {      dbgs() << "  Predecessors:\n"; -    for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end(); -         I != E; ++I) { +    for (const SDep &SuccDep : Preds) {        dbgs() << "   "; -      switch (I->getKind()) { +      switch (SuccDep.getKind()) {        case SDep::Data:   dbgs() << "data "; break;        case SDep::Anti:   dbgs() << "anti "; break;        case SDep::Output: dbgs() << "out  "; break;        case SDep::Order:  dbgs() << "ord  "; break;        } -      I->getSUnit()->print(dbgs(), G); -      if (I->isArtificial()) +      SuccDep.getSUnit()->print(dbgs(), G); +      if (SuccDep.isArtificial())          dbgs() << " *"; -      dbgs() << ": Latency=" << I->getLatency(); -      if (I->isAssignedRegDep()) -        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); +      dbgs() << ": Latency=" << SuccDep.getLatency(); +      if (SuccDep.isAssignedRegDep()) +        dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);        dbgs() << "\n";      }    }    if (Succs.size() != 0) {      dbgs() << "  Successors:\n"; -    for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end(); -         I != E; ++I) { +    for (const SDep &SuccDep : Succs) {        dbgs() << "   "; -      switch (I->getKind()) { +      switch (SuccDep.getKind()) {        case SDep::Data:   dbgs() << "data "; break;        case SDep::Anti:   dbgs() << "anti "; break;        case SDep::Output: dbgs() << "out  "; break;        case SDep::Order:  dbgs() << "ord  "; break;        } -      I->getSUnit()->print(dbgs(), G); -      if (I->isArtificial()) +      SuccDep.getSUnit()->print(dbgs(), G); +      if (SuccDep.isArtificial())          dbgs() << " *"; -      dbgs() << ": Latency=" << I->getLatency(); -      if (I->isAssignedRegDep()) -        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); +      dbgs() << ": Latency=" << SuccDep.getLatency(); +      if (SuccDep.isAssignedRegDep()) +        dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);        dbgs() << "\n";      }    } @@ -385,47 +373,44 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {  #endif  #ifndef NDEBUG -/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that -/// their state is consistent. Return the number of scheduled nodes. -///  unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {    bool AnyNotSched = false;    unsigned DeadNodes = 0; -  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { -    if (!SUnits[i].isScheduled) { -      if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) { +  for (const SUnit &SUnit : SUnits) { +    if (!SUnit.isScheduled) { +      if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) {          ++DeadNodes;          continue;        }        if (!AnyNotSched)          dbgs() << "*** Scheduling failed! ***\n"; -      SUnits[i].dump(this); +      SUnit.dump(this);        dbgs() << "has not been scheduled!\n";        AnyNotSched = true;      } -    if (SUnits[i].isScheduled && -        (isBottomUp ? 
SUnits[i].getHeight() : SUnits[i].getDepth()) > -          unsigned(INT_MAX)) { +    if (SUnit.isScheduled && +        (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) > +          unsigned(std::numeric_limits<int>::max())) {        if (!AnyNotSched)          dbgs() << "*** Scheduling failed! ***\n"; -      SUnits[i].dump(this); +      SUnit.dump(this);        dbgs() << "has an unexpected "             << (isBottomUp ? "Height" : "Depth") << " value!\n";        AnyNotSched = true;      }      if (isBottomUp) { -      if (SUnits[i].NumSuccsLeft != 0) { +      if (SUnit.NumSuccsLeft != 0) {          if (!AnyNotSched)            dbgs() << "*** Scheduling failed! ***\n"; -        SUnits[i].dump(this); +        SUnit.dump(this);          dbgs() << "has successors left!\n";          AnyNotSched = true;        }      } else { -      if (SUnits[i].NumPredsLeft != 0) { +      if (SUnit.NumPredsLeft != 0) {          if (!AnyNotSched)            dbgs() << "*** Scheduling failed! ***\n"; -        SUnits[i].dump(this); +        SUnit.dump(this);          dbgs() << "has predecessors left!\n";          AnyNotSched = true;        } @@ -436,36 +421,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {  }  #endif -/// InitDAGTopologicalSorting - create the initial topological -/// ordering from the DAG to be scheduled. -/// -/// The idea of the algorithm is taken from -/// "Online algorithms for managing the topological order of -/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in -/// "Maintaining a topological order under edge insertions". -/// -/// Short description of the algorithm: -/// -/// Topological ordering, ord, of a DAG maps each node to a topological -/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). -/// -/// This means that if there is a path from the node X to the node Z, -/// then ord(X) < ord(Z). -/// -/// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would -/// create a cycle. -/// -/// The algorithm first computes a topological ordering for the DAG by -/// initializing the Index2Node and Node2Index arrays and then tries to keep -/// the ordering up-to-date after edge insertions by reordering the DAG. -/// -/// On insertion of the edge X->Y, the algorithm first marks by calling DFS -/// the nodes reachable from Y, and then shifts them using Shift to lie -/// immediately after X in Index2Node.  void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { +  // The idea of the algorithm is taken from +  // "Online algorithms for managing the topological order of +  // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly +  // This is the MNR algorithm, which was first introduced by +  // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in +  // "Maintaining a topological order under edge insertions". +  // +  // Short description of the algorithm: +  // +  // Topological ordering, ord, of a DAG maps each node to a topological +  // index so that for all edges X->Y it is the case that ord(X) < ord(Y). +  // +  // This means that if there is a path from the node X to the node Z, +  // then ord(X) < ord(Z). +  // +  // This property can be used to check for reachability of nodes: +  // if Z is reachable from X, then an insertion of the edge Z->X would +  // create a cycle. 
+  // +  // The algorithm first computes a topological ordering for the DAG by +  // initializing the Index2Node and Node2Index arrays and then tries to keep +  // the ordering up-to-date after edge insertions by reordering the DAG. +  // +  // On insertion of the edge X->Y, the algorithm first marks by calling DFS +  // the nodes reachable from Y, and then shifts them using Shift to lie +  // immediately after X in Index2Node.    unsigned DAGSize = SUnits.size();    std::vector<SUnit*> WorkList;    WorkList.reserve(DAGSize); @@ -476,18 +458,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {    // Initialize the data structures.    if (ExitSU)      WorkList.push_back(ExitSU); -  for (unsigned i = 0, e = DAGSize; i != e; ++i) { -    SUnit *SU = &SUnits[i]; -    int NodeNum = SU->NodeNum; -    unsigned Degree = SU->Succs.size(); +  for (SUnit &SU : SUnits) { +    int NodeNum = SU.NodeNum; +    unsigned Degree = SU.Succs.size();      // Temporarily use the Node2Index array as scratch space for degree counts.      Node2Index[NodeNum] = Degree;      // Is it a node without dependencies?      if (Degree == 0) { -      assert(SU->Succs.empty() && "SUnit should have no successors"); +      assert(SU.Succs.empty() && "SUnit should have no successors");        // Collect leaf nodes. -      WorkList.push_back(SU); +      WorkList.push_back(&SU);      }    } @@ -497,9 +478,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {      WorkList.pop_back();      if (SU->NodeNum < DAGSize)        Allocate(SU->NodeNum, --Id); -    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); -         I != E; ++I) { -      SUnit *SU = I->getSUnit(); +    for (const SDep &PredDep : SU->Preds) { +      SUnit *SU = PredDep.getSUnit();        if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])          // If all dependencies of the node are processed already,          // then the node can be computed now. @@ -511,19 +491,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {  #ifndef NDEBUG    // Check correctness of the ordering -  for (unsigned i = 0, e = DAGSize; i != e; ++i) { -    SUnit *SU = &SUnits[i]; -    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); -         I != E; ++I) { -      assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && +  for (SUnit &SU : SUnits)  { +    for (const SDep &PD : SU.Preds) { +      assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] &&        "Wrong topological sorting");      }    }  #endif  } -/// AddPred - Updates the topological ordering to accommodate an edge -/// to be added from SUnit X to SUnit Y.  void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {    int UpperBound, LowerBound;    LowerBound = Node2Index[Y->NodeNum]; @@ -540,16 +516,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {    }  } -/// RemovePred - Updates the topological ordering to accommodate an -/// an edge to be removed from the specified node N from the predecessors -/// of the current node M.  void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {    // InitDAGTopologicalSorting();  } -/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark -/// all nodes affected by the edge insertion. These nodes will later get new -/// topological indexes by means of the Shift method.  
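The comment block moved into InitDAGTopologicalSorting() describes the online topological-ordering scheme (Pearce/Kelly, building on the MNR algorithm): keep an index ord such that ord(X) < ord(Y) for every edge X->Y, so inserting an edge X->Y can only create a cycle when ord(Y) < ord(X), and only nodes whose index lies between those bounds have to be re-examined. A minimal standalone sketch of that bounded reachability test (illustrative names, not the ScheduleDAG API):

    #include <vector>

    // Succs[n] lists the successors of node n; Ord[n] is a topological index
    // with Ord[u] < Ord[v] for every existing edge u -> v. Adding X -> Y closes
    // a cycle iff X is reachable from Y, and the ordering invariant lets the
    // search ignore every node whose index is already greater than Ord[X].
    static bool wouldCreateCycle(const std::vector<std::vector<int>> &Succs,
                                 const std::vector<int> &Ord, int X, int Y) {
      if (Ord[Y] > Ord[X])
        return false; // Y is ordered after X, so it cannot reach X.
      std::vector<int> Work{Y};
      std::vector<bool> Seen(Succs.size(), false);
      while (!Work.empty()) {
        int N = Work.back();
        Work.pop_back();
        if (N == X)
          return true; // Y reaches X; the new edge would close a cycle.
        if (Seen[N])
          continue;
        Seen[N] = true;
        for (int S : Succs[N])
          if (Ord[S] <= Ord[X]) // Only the affected index range needs a visit.
            Work.push_back(S);
      }
      return false;
    }

This mirrors what DFS()/IsReachable() below do with Node2Index and an UpperBound; the real code additionally renumbers the visited region with Shift() so the invariant survives the insertion.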
void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,                                       bool &HasLoop) {    std::vector<const SUnit*> WorkList; @@ -560,8 +530,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,      SU = WorkList.back();      WorkList.pop_back();      Visited.set(SU->NodeNum); -    for (int I = SU->Succs.size()-1; I >= 0; --I) { -      unsigned s = SU->Succs[I].getSUnit()->NodeNum; +    for (const SDep &SuccDep +         : make_range(SU->Succs.rbegin(), SU->Succs.rend())) { +      unsigned s = SuccDep.getSUnit()->NodeNum;        // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).        if (s >= Node2Index.size())          continue; @@ -571,14 +542,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,        }        // Visit successors if not already and in affected region.        if (!Visited.test(s) && Node2Index[s] < UpperBound) { -        WorkList.push_back(SU->Succs[I].getSUnit()); +        WorkList.push_back(SuccDep.getSUnit());        }      }    } while (!WorkList.empty());  } -/// Shift - Renumber the nodes so that the topological ordering is -/// preserved. +std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, +                                                         const SUnit &TargetSU, +                                                         bool &Success) { +  std::vector<const SUnit*> WorkList; +  int LowerBound = Node2Index[StartSU.NodeNum]; +  int UpperBound = Node2Index[TargetSU.NodeNum]; +  bool Found = false; +  BitVector VisitedBack; +  std::vector<int> Nodes; + +  if (LowerBound > UpperBound) { +    Success = false; +    return Nodes; +  } + +  WorkList.reserve(SUnits.size()); +  Visited.reset(); + +  // Starting from StartSU, visit all successors up +  // to UpperBound. +  WorkList.push_back(&StartSU); +  do { +    const SUnit *SU = WorkList.back(); +    WorkList.pop_back(); +    for (int I = SU->Succs.size()-1; I >= 0; --I) { +      const SUnit *Succ = SU->Succs[I].getSUnit(); +      unsigned s = Succ->NodeNum; +      // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). +      if (Succ->isBoundaryNode()) +        continue; +      if (Node2Index[s] == UpperBound) { +        Found = true; +        continue; +      } +      // Visit successors if not already and in affected region. +      if (!Visited.test(s) && Node2Index[s] < UpperBound) { +        Visited.set(s); +        WorkList.push_back(Succ); +      } +    } +  } while (!WorkList.empty()); + +  if (!Found) { +    Success = false; +    return Nodes; +  } + +  WorkList.clear(); +  VisitedBack.resize(SUnits.size()); +  Found = false; + +  // Starting from TargetSU, visit all predecessors up +  // to LowerBound. SUs that are visited by the two +  // passes are added to Nodes. +  WorkList.push_back(&TargetSU); +  do { +    const SUnit *SU = WorkList.back(); +    WorkList.pop_back(); +    for (int I = SU->Preds.size()-1; I >= 0; --I) { +      const SUnit *Pred = SU->Preds[I].getSUnit(); +      unsigned s = Pred->NodeNum; +      // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). 
+      if (Pred->isBoundaryNode()) +        continue; +      if (Node2Index[s] == LowerBound) { +        Found = true; +        continue; +      } +      if (!VisitedBack.test(s) && Visited.test(s)) { +        VisitedBack.set(s); +        WorkList.push_back(Pred); +        Nodes.push_back(s); +      } +    } +  } while (!WorkList.empty()); + +  assert(Found && "Error in SUnit Graph!"); +  Success = true; +  return Nodes; +} +  void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,                                         int UpperBound) {    std::vector<int> L; @@ -598,28 +648,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,      }    } -  for (unsigned j = 0; j < L.size(); ++j) { -    Allocate(L[j], i - shift); +  for (unsigned LI : L) { +    Allocate(LI, i - shift);      i = i + 1;    }  } - -/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will -/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).  bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {    // Is SU reachable from TargetSU via successor edges?    if (IsReachable(SU, TargetSU))      return true; -  for (SUnit::pred_iterator -         I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I) -    if (I->isAssignedRegDep() && -        IsReachable(SU, I->getSUnit())) +  for (const SDep &PredDep : TargetSU->Preds) +    if (PredDep.isAssignedRegDep() && +        IsReachable(SU, PredDep.getSUnit()))        return true;    return false;  } -/// IsReachable - Checks if SU is reachable from TargetSU.  bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,                                               const SUnit *TargetSU) {    // If insertion of the edge SU->TargetSU would create a cycle @@ -637,7 +682,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,    return HasLoop;  } -/// Allocate - assign the topological index to the node n.  void ScheduleDAGTopologicalSort::Allocate(int n, int index) {    Node2Index[n] = index;    Index2Node[index] = n; @@ -647,4 +691,4 @@ ScheduleDAGTopologicalSort::  ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)    : SUnits(sunits), ExitSU(exitsu) {} -ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} +ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default; diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 611c5a71bd5a..18823b74c47f 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -7,8 +7,8 @@  //  //===----------------------------------------------------------------------===//  // -// This implements the ScheduleDAGInstrs class, which implements re-scheduling -// of MachineInstrs. +/// \file This implements the ScheduleDAGInstrs class, which implements +/// re-scheduling of MachineInstrs.  //  //===----------------------------------------------------------------------===// @@ -101,8 +101,8 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,    SchedModel.init(ST.getSchedModel(), &ST, TII);  } -/// getUnderlyingObjectFromInt - This is the function that does the work of -/// looking through basic ptrtoint+arithmetic+inttoptr sequences. +/// This is the function that does the work of looking through basic +/// ptrtoint+arithmetic+inttoptr sequences.  
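GetSubGraph() above collects every SUnit that lies on some path from StartSU to TargetSU: a forward walk from StartSU, bounded by TargetSU's topological index, followed by a backward walk from TargetSU that only expands nodes the first pass reached. The same two-pass intersection on a plain adjacency-list DAG looks roughly like this (simplified sketch without the index bounds, not the ScheduleDAG API):

    #include <vector>

    // Returns the interior nodes sitting on some path Start -> ... -> Target.
    static std::vector<int>
    subGraphBetween(const std::vector<std::vector<int>> &Succs,
                    const std::vector<std::vector<int>> &Preds,
                    int Start, int Target) {
      std::vector<bool> Fwd(Succs.size(), false), Back(Succs.size(), false);
      std::vector<int> Work{Start}, Nodes;

      // Pass 1: mark everything reachable from Start.
      while (!Work.empty()) {
        int N = Work.back();
        Work.pop_back();
        for (int S : Succs[N])
          if (!Fwd[S]) {
            Fwd[S] = true;
            Work.push_back(S);
          }
      }

      // Pass 2: walk predecessors from Target, but only through nodes the
      // first pass marked; exactly those nodes lie on a Start->Target path.
      Work.push_back(Target);
      while (!Work.empty()) {
        int N = Work.back();
        Work.pop_back();
        for (int P : Preds[N])
          if (P != Start && Fwd[P] && !Back[P]) {
            Back[P] = true;
            Nodes.push_back(P);
            Work.push_back(P);
          }
      }
      return Nodes;
    }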
static const Value *getUnderlyingObjectFromInt(const Value *V) {    do {      if (const Operator *U = dyn_cast<Operator>(V)) { @@ -129,8 +129,8 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) {    } while (1);  } -/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects -/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences. +/// This is a wrapper around GetUnderlyingObjects and adds support for basic +/// ptrtoint+arithmetic+inttoptr sequences.  static void getUnderlyingObjects(const Value *V,                                   SmallVectorImpl<Value *> &Objects,                                   const DataLayout &DL) { @@ -158,9 +158,8 @@ static void getUnderlyingObjects(const Value *V,    } while (!Working.empty());  } -/// getUnderlyingObjectsForInstr - If this machine instr has memory reference -/// information and it can be tracked to a normal reference to a known -/// object, return the Value for that object. +/// If this machine instr has memory reference information and it can be tracked +/// to a normal reference to a known object, return the Value for that object.  static void getUnderlyingObjectsForInstr(const MachineInstr *MI,                                           const MachineFrameInfo &MFI,                                           UnderlyingObjectsVector &Objects, @@ -216,10 +215,6 @@ void ScheduleDAGInstrs::finishBlock() {    BB = nullptr;  } -/// Initialize the DAG and common scheduler state for the current scheduling -/// region. This does not actually create the DAG, only clears it. The -/// scheduling driver may call BuildSchedGraph multiple times per scheduling -/// region.  void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,                                      MachineBasicBlock::iterator begin,                                      MachineBasicBlock::iterator end, @@ -230,20 +225,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,    NumRegionInstrs = regioninstrs;  } -/// Close the current scheduling region. Don't clear any state in case the -/// driver wants to refer to the previous scheduling region.  void ScheduleDAGInstrs::exitRegion() {    // Nothing to do.  } -/// addSchedBarrierDeps - Add dependencies from instructions in the current -/// list of instructions being scheduled to scheduling barrier by adding -/// the exit SU to the register defs and use list. This is because we want to -/// make sure instructions which define registers that are either used by -/// the terminator or are live-out are properly scheduled. This is -/// especially important when the definition latency of the return value(s) -/// are too high to be hidden by the branch or when the liveout registers -/// used by instructions in the fallthrough block.  void ScheduleDAGInstrs::addSchedBarrierDeps() {    MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;    ExitSU.setInstr(ExitMI); @@ -271,7 +256,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {    }  } -/// MO is an operand of SU's instruction that defines a physical register. Add +/// MO is an operand of SU's instruction that defines a physical register. Adds  /// data dependencies from SU to any uses of the physical register.  
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {    const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx); @@ -313,9 +298,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {    }  } -/// addPhysRegDeps - Add register dependencies (data, anti, and output) from -/// this SUnit to following instructions in the same scheduling region that -/// depend the physical register referenced at OperIdx. +/// \brief Adds register dependencies (data, anti, and output) from this SUnit +/// to following instructions in the same scheduling region that depend the +/// physical register referenced at OperIdx.  void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {    MachineInstr *MI = SU->getInstr();    MachineOperand &MO = MI->getOperand(OperIdx); @@ -406,9 +391,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const    return TRI->getSubRegIndexLaneMask(SubReg);  } -/// addVRegDefDeps - Add register output and data dependencies from this SUnit -/// to instructions that occur later in the same scheduling region if they read -/// from or write to the virtual register defined at OperIdx. +/// Adds register output and data dependencies from this SUnit to instructions +/// that occur later in the same scheduling region if they read from or write to +/// the virtual register defined at OperIdx.  ///  /// TODO: Hoist loop induction variable increments. This has to be  /// reevaluated. Generally, IV scheduling should be done before coalescing. @@ -515,10 +500,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {      CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));  } -/// addVRegUseDeps - Add a register data dependency if the instruction that -/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a -/// register antidependency from this SUnit to instructions that occur later in -/// the same scheduling region if they write the virtual register. +/// \brief Adds a register data dependency if the instruction that defines the +/// virtual register used at OperIdx is mapped to an SUnit. Add a register +/// antidependency from this SUnit to instructions that occur later in the same +/// scheduling region if they write the virtual register.  ///  /// TODO: Handle ExitSU "uses" properly.  void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { @@ -545,87 +530,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {    }  } -/// Return true if MI is an instruction we are unable to reason about +/// Returns true if MI is an instruction we are unable to reason about  /// (like a call or something with unmodeled side effects).  static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {    return MI->isCall() || MI->hasUnmodeledSideEffects() ||           (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));  } -/// This returns true if the two MIs need a chain edge between them. -/// This is called on normal stores and loads. 
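addPhysRegDeps() and the addVRegDefDeps()/addVRegUseDeps() routines above classify register dependencies into data (read-after-write), anti (write-after-read) and output (write-after-write) edges. A toy classifier for two instructions, an earlier I and a later J, each described only by def/use sets (purely illustrative, not the MachineInstr/SDep API; a real scheduler records every applicable kind rather than the first match):

    #include <set>

    struct ToyInstr {
      std::set<unsigned> Defs;
      std::set<unsigned> Uses;
    };

    enum class DepKind { None, Data, Anti, Output };

    // Dependence carried by register Reg from earlier instruction I to later J.
    static DepKind classifyDep(const ToyInstr &I, const ToyInstr &J, unsigned Reg) {
      if (I.Defs.count(Reg) && J.Uses.count(Reg))
        return DepKind::Data;   // read-after-write: J consumes I's value.
      if (I.Defs.count(Reg) && J.Defs.count(Reg))
        return DepKind::Output; // write-after-write: J's value must win.
      if (I.Uses.count(Reg) && J.Defs.count(Reg))
        return DepKind::Anti;   // write-after-read: J must not clobber I's input.
      return DepKind::None;
    }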
-static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, -                             const DataLayout &DL, MachineInstr *MIa, -                             MachineInstr *MIb) { -  const MachineFunction *MF = MIa->getParent()->getParent(); -  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - -  assert ((MIa->mayStore() || MIb->mayStore()) && -          "Dependency checked between two loads"); - -  // Let the target decide if memory accesses cannot possibly overlap. -  if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA)) -    return false; - -  // To this point analysis is generic. From here on we do need AA. -  if (!AA) -    return true; - -  // FIXME: Need to handle multiple memory operands to support all targets. -  if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) -    return true; - -  MachineMemOperand *MMOa = *MIa->memoperands_begin(); -  MachineMemOperand *MMOb = *MIb->memoperands_begin(); - -  if (!MMOa->getValue() || !MMOb->getValue()) -    return true; - -  // The following interface to AA is fashioned after DAGCombiner::isAlias -  // and operates with MachineMemOperand offset with some important -  // assumptions: -  //   - LLVM fundamentally assumes flat address spaces. -  //   - MachineOperand offset can *only* result from legalization and -  //     cannot affect queries other than the trivial case of overlap -  //     checking. -  //   - These offsets never wrap and never step outside -  //     of allocated objects. -  //   - There should never be any negative offsets here. -  // -  // FIXME: Modify API to hide this math from "user" -  // FIXME: Even before we go to AA we can reason locally about some -  // memory objects. It can save compile time, and possibly catch some -  // corner cases not currently covered. - -  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); -  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); - -  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); -  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; -  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; - -  AliasResult AAResult = -      AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, -                               UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), -                MemoryLocation(MMOb->getValue(), Overlapb, -                               UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); - -  return (AAResult != NoAlias); -} - -/// Check whether two objects need a chain edge and add it if needed.  void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,                                              unsigned Latency) { -  if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(), -                       SUb->getInstr())) { +  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {      SDep Dep(SUa, SDep::MayAliasMem);      Dep.setLatency(Latency);      SUb->addPred(Dep);    }  } -/// Create an SUnit for each real instruction, numbered in top-down topological -/// order. The instruction order A < B, implies that no edge exists from B to A. +/// \brief Creates an SUnit for each real instruction, numbered in top-down +/// topological order. The instruction order A < B, implies that no edge exists +/// from B to A.  ///  /// Map each real instruction to its SUnit.  
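The MIsNeedChainEdge() helper removed here (its caller now goes through MachineInstr::mayAlias) built two MemoryLocations whose sizes run from the smaller of the two offsets to the end of each access and handed them to AA->alias(). Stripped of the AA query, the arithmetic is ordinary interval overlap (hypothetical helper; offsets assumed non-negative, as the deleted asserts required):

    #include <algorithm>
    #include <cstdint>

    // Two accesses (Off, Size) into the same underlying object can only
    // conflict when the byte intervals [Off, Off + Size) intersect.
    static bool intervalsOverlap(int64_t OffA, uint64_t SizeA,
                                 int64_t OffB, uint64_t SizeB) {
      int64_t MinOff   = std::min(OffA, OffB);
      int64_t OverlapA = OffA + int64_t(SizeA) - MinOff; // extent of A past MinOff
      int64_t OverlapB = OffB + int64_t(SizeB) - MinOff; // extent of B past MinOff
      // Each interval starts at or after MinOff; they intersect exactly when
      // each one's start lies before the other's end.
      return (OffA - MinOff) < OverlapB && (OffB - MinOff) < OverlapA;
    }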
/// @@ -682,14 +605,13 @@ void ScheduleDAGInstrs::initSUnits() {  }  class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> { -    /// Current total number of SUs in map.    unsigned NumNodes;    /// 1 for loads, 0 for stores. (see comment in SUList)    unsigned TrueMemOrderLatency; -public: +public:    Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}    /// To keep NumNodes up to date, insert() is used instead of @@ -697,8 +619,8 @@ public:    ValueType &operator[](const SUList &Key) {      llvm_unreachable("Don't use. Use insert() instead."); }; -  /// Add SU to the SUList of V. If Map grows huge, reduce its size -  /// by calling reduce(). +  /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling +  /// reduce().    void inline insert(SUnit *SU, ValueType V) {      MapVector::operator[](V).push_back(SU);      NumNodes++; @@ -723,7 +645,7 @@ public:    unsigned inline size() const { return NumNodes; } -  /// Count the number of SUs in this map after a reduction. +  /// Counts the number of SUs in this map after a reduction.    void reComputeSize(void) {      NumNodes = 0;      for (auto &I : *this) @@ -797,9 +719,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {    map.reComputeSize();  } -/// If RegPressure is non-null, compute register pressure as a side effect. The -/// DAG builder is an efficient place to do it because it already visits -/// operands.  void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,                                          RegPressureTracker *RPTracker,                                          PressureDiffs *PDiffs, @@ -1088,10 +1007,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() {    }  } -/// Reduce maps in FIFO order, by N SUs. This is better than turning -/// every Nth memory SU into BarrierChain in buildSchedGraph(), since -/// it avoids unnecessary edges between seen SUs above the new -/// BarrierChain, and those below it.  void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,                                                Value2SUsMap &loads, unsigned N) {    DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; @@ -1142,7 +1057,6 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,          loads.dump());  } -/// \brief Initialize register live-range state for updating kills.  void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) {    // Start with no live registers.    LiveRegs.reset(); @@ -1178,32 +1092,35 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,        if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))           return;      } else -        (--End)->clearRegisterKills(Reg, TRI); +      (--End)->clearRegisterKills(Reg, TRI);    }  } -bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) { +void ScheduleDAGInstrs::toggleKillFlag(MachineInstr &MI, MachineOperand &MO) { +  if (MO.isDebug()) +    return; +    // Setting kill flag...    if (!MO.isKill()) {      MO.setIsKill(true); -    toggleBundleKillFlag(MI, MO.getReg(), true, TRI); -    return false; +    toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); +    return;    }    // If MO itself is live, clear the kill flag...    
if (LiveRegs.test(MO.getReg())) {      MO.setIsKill(false); -    toggleBundleKillFlag(MI, MO.getReg(), false, TRI); -    return false; +    toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); +    return;    }    // If any subreg of MO is live, then create an imp-def for that    // subreg and keep MO marked as killed.    MO.setIsKill(false); -  toggleBundleKillFlag(MI, MO.getReg(), false, TRI); +  toggleBundleKillFlag(&MI, MO.getReg(), false, TRI);    bool AllDead = true;    const unsigned SuperReg = MO.getReg(); -  MachineInstrBuilder MIB(MF, MI); +  MachineInstrBuilder MIB(MF, &MI);    for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {      if (LiveRegs.test(*SubRegs)) {        MIB.addReg(*SubRegs, RegState::ImplicitDefine); @@ -1213,13 +1130,12 @@ bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {    if(AllDead) {      MO.setIsKill(true); -    toggleBundleKillFlag(MI, MO.getReg(), true, TRI); +    toggleBundleKillFlag(&MI, MO.getReg(), true, TRI);    } -  return false;  } -// FIXME: Reuse the LivePhysRegs utility for this.  void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { +  // FIXME: Reuse the LivePhysRegs utility for this.    DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');    LiveRegs.resize(TRI->getNumRegs()); @@ -1289,7 +1205,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {        if (MO.isKill() != kill) {          DEBUG(dbgs() << "Fixing " << MO << " in "); -        toggleKillFlag(&MI, MO); +        toggleKillFlag(MI, MO);          DEBUG(MI.dump());          DEBUG({            if (MI.getOpcode() == TargetOpcode::BUNDLE) { @@ -1319,6 +1235,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {  }  void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const { +  // Cannot completely remove virtual function even in release mode.  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)    SU->getInstr()->dump();  #endif @@ -1347,7 +1264,7 @@ std::string ScheduleDAGInstrs::getDAGName() const {  //===----------------------------------------------------------------------===//  namespace llvm { -/// \brief Internal state used to compute SchedDFSResult. +/// Internal state used to compute SchedDFSResult.  class SchedDFSImpl {    SchedDFSResult &R; @@ -1358,8 +1275,8 @@ class SchedDFSImpl {    struct RootData {      unsigned NodeID; -    unsigned ParentNodeID;  // Parent node (member of the parent subtree). -    unsigned SubInstrCount; // Instr count in this tree only, not children. +    unsigned ParentNodeID;  ///< Parent node (member of the parent subtree). +    unsigned SubInstrCount; ///< Instr count in this tree only, not children.      RootData(unsigned id): NodeID(id),                             ParentNodeID(SchedDFSResult::InvalidSubtreeID), @@ -1375,7 +1292,7 @@ public:      RootSet.setUniverse(R.DFSNodeData.size());    } -  /// Return true if this node been visited by the DFS traversal. +  /// Returns true if this node been visited by the DFS traversal.    ///    /// During visitPostorderNode the Node's SubtreeID is assigned to the Node    /// ID. Later, SubtreeID is updated but remains valid. @@ -1384,7 +1301,7 @@ public:        != SchedDFSResult::InvalidSubtreeID;    } -  /// Initialize this node's instruction count. We don't need to flag the node +  /// Initializes this node's instruction count. We don't need to flag the node    /// visited until visitPostorder because the DAG cannot have cycles.    
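fixupKills() above recomputes kill flags with a bottom-up walk over the block: LiveRegs starts from the live-out set, and each use of a register that is not live below that point is its last use. A stripped-down model of that backward scan over plain def/use sets (illustrative only; it ignores sub-registers and bundles, which toggleKillFlag()/toggleBundleKillFlag() handle in the real code):

    #include <set>
    #include <vector>

    struct Instr {
      std::set<unsigned> Defs;
      std::set<unsigned> Uses;
      std::set<unsigned> Kills; // filled in by the scan below
    };

    static void fixupKills(std::vector<Instr> &Block, std::set<unsigned> LiveOut) {
      std::set<unsigned> Live = std::move(LiveOut);
      for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
        // A definition ends any live range coming from below.
        for (unsigned R : I->Defs)
          Live.erase(R);
        // A use of a register that is dead below this point is its last use.
        for (unsigned R : I->Uses) {
          if (!Live.count(R))
            I->Kills.insert(R);
          Live.insert(R);
        }
      }
    }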
void visitPreorder(const SUnit *SU) {      R.DFSNodeData[SU->NodeNum].InstrCount = @@ -1433,8 +1350,8 @@ public:      RootSet[SU->NodeNum] = RData;    } -  /// Called once for each tree edge after calling visitPostOrderNode on the -  /// predecessor. Increment the parent node's instruction count and +  /// \brief Called once for each tree edge after calling visitPostOrderNode on +  /// the predecessor. Increment the parent node's instruction count and    /// preemptively join this subtree to its parent's if it is small enough.    void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {      R.DFSNodeData[Succ->NodeNum].InstrCount @@ -1442,13 +1359,13 @@ public:      joinPredSubtree(PredDep, Succ);    } -  /// Add a connection for cross edges. +  /// Adds a connection for cross edges.    void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {      ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));    } -  /// Set each node's subtree ID to the representative ID and record connections -  /// between trees. +  /// Sets each node's subtree ID to the representative ID and record +  /// connections between trees.    void finalize() {      SubtreeClasses.compress();      R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); @@ -1484,8 +1401,8 @@ public:    }  protected: -  /// Join the predecessor subtree with the successor that is its DFS -  /// parent. Apply some heuristics before joining. +  /// Joins the predecessor subtree with the successor that is its DFS parent. +  /// Applies some heuristics before joining.    bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,                         bool CheckLimit = true) {      assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); @@ -1531,10 +1448,10 @@ protected:      } while (FromTree != SchedDFSResult::InvalidSubtreeID);    }  }; -} // namespace llvm +} // end namespace llvm  namespace { -/// \brief Manage the stack used by a reverse depth-first search over the DAG. +/// Manage the stack used by a reverse depth-first search over the DAG.  class SchedDAGReverseDFS {    std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;  public: @@ -1569,7 +1486,7 @@ static bool hasDataSucc(const SUnit *SU) {    return false;  } -/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first +/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first  /// search from this root.  
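SchedDFSResult::compute() below derives, for each subtree, an ILP value of the form InstrCount / Length: how many instructions the subDAG supplies per unit of critical-path length (see ILPValue::print() further down). On a small dependence tree the same quantity falls out of one post-order walk (simplified standalone model; the real pass is iterative and also joins small subtrees via the heuristics in joinPredSubtree()):

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Node {
      std::vector<int> Preds; // nodes feeding this one
    };

    // Returns {instruction count, critical path length} of the tree rooted at N;
    // the ILP of that subtree is then Count / Length.
    static std::pair<unsigned, unsigned> subtreeILP(const std::vector<Node> &DAG,
                                                    int N) {
      unsigned Count = 1, Length = 0;
      for (int P : DAG[N].Preds) {
        std::pair<unsigned, unsigned> Sub = subtreeILP(DAG, P);
        Count += Sub.first;
        Length = std::max(Length, Sub.second);
      }
      return {Count, Length + 1};
    }
    // A root fed by two independent chains of three nodes each yields
    // Count = 7 and Length = 4, i.e. an ILP of 7/4.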
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {    if (!IsBottomUp) @@ -1626,8 +1543,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) {    }  } -LLVM_DUMP_METHOD -void ILPValue::print(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {    OS << InstrCount << " / " << Length << " = ";    if (!Length)      OS << "BADILP"; @@ -1635,8 +1552,7 @@ void ILPValue::print(raw_ostream &OS) const {      OS << format("%g", ((double)InstrCount / Length));  } -LLVM_DUMP_METHOD -void ILPValue::dump() const { +LLVM_DUMP_METHOD void ILPValue::dump() const {    dbgs() << *this << '\n';  } @@ -1648,4 +1564,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {    return OS;  } -} // namespace llvm +} // end namespace llvm +#endif diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 83bc1ba7beb9..b3d83d5313af 100644 --- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -1,4 +1,4 @@ -//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===// +//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -15,11 +15,13 @@  #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"  #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/MC/MCInstrDesc.h"  #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h" +#include <cassert>  using namespace llvm; @@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(      const InstrItineraryData *II, const ScheduleDAG *SchedDAG,      const char *ParentDebugType)      : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II), -      DAG(SchedDAG), IssueWidth(0), IssueCount(0) { - +      DAG(SchedDAG) {    // Determine the maximum depth of any itinerary. This determines the depth of    // the scoreboard. We always make the scoreboard at least 1 cycle deep to    // avoid dealing with the boundary condition. diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c7bffe76503..4d468551ae24 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -53,10 +53,6 @@ STATISTIC(SlicedLoads, "Number of load sliced");  namespace {    static cl::opt<bool> -    CombinerAA("combiner-alias-analysis", cl::Hidden, -               cl::desc("Enable DAG combiner alias-analysis heuristics")); - -  static cl::opt<bool>      CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,                 cl::desc("Enable DAG combiner's use of IR alias analysis")); @@ -133,6 +129,9 @@ namespace {      /// Add to the worklist making sure its instance is at the back (next to be      /// processed.)      void AddToWorklist(SDNode *N) { +      assert(N->getOpcode() != ISD::DELETED_NODE && +             "Deleted Node added to Worklist"); +        // Skip handle nodes as they can't usefully be combined and confuse the        // zero-use deletion strategy.        
if (N->getOpcode() == ISD::HANDLENODE) @@ -177,6 +176,7 @@ namespace {      void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);    private: +    unsigned MaximumLegalStoreInBits;      /// Check the specified integer node value to see if it can be simplified or      /// if things it uses can be simplified by bit propagation. @@ -232,9 +232,12 @@ namespace {      SDValue visitTokenFactor(SDNode *N);      SDValue visitMERGE_VALUES(SDNode *N);      SDValue visitADD(SDNode *N); +    SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);      SDValue visitSUB(SDNode *N);      SDValue visitADDC(SDNode *N); +    SDValue visitUADDO(SDNode *N);      SDValue visitSUBC(SDNode *N); +    SDValue visitUSUBO(SDNode *N);      SDValue visitADDE(SDNode *N);      SDValue visitSUBE(SDNode *N);      SDValue visitMUL(SDNode *N); @@ -259,6 +262,7 @@ namespace {      SDValue visitSRA(SDNode *N);      SDValue visitSRL(SDNode *N);      SDValue visitRotate(SDNode *N); +    SDValue visitABS(SDNode *N);      SDValue visitBSWAP(SDNode *N);      SDValue visitBITREVERSE(SDNode *N);      SDValue visitCTLZ(SDNode *N); @@ -274,6 +278,7 @@ namespace {      SDValue visitSIGN_EXTEND(SDNode *N);      SDValue visitZERO_EXTEND(SDNode *N);      SDValue visitANY_EXTEND(SDNode *N); +    SDValue visitAssertZext(SDNode *N);      SDValue visitSIGN_EXTEND_INREG(SDNode *N);      SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);      SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); @@ -336,6 +341,7 @@ namespace {      SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);      SDValue foldSelectOfConstants(SDNode *N); +    SDValue foldBinOpIntoSelect(SDNode *BO);      bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);      SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);      SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); @@ -344,6 +350,8 @@ namespace {                               bool NotExtCompare = false);      SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,                                     SDValue N2, SDValue N3, ISD::CondCode CC); +    SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, +                              const SDLoc &DL);      SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,                            const SDLoc &DL, bool foldBooleans = true); @@ -377,6 +385,7 @@ namespace {                                unsigned PosOpcode, unsigned NegOpcode,                                const SDLoc &DL);      SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); +    SDValue MatchLoadCombine(SDNode *N);      SDValue ReduceLoadWidth(SDNode *N);      SDValue ReduceLoadOpStoreWidth(SDNode *N);      SDValue splitMergedValStore(StoreSDNode *ST); @@ -384,9 +393,9 @@ namespace {      SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);      SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);      SDValue reduceBuildVecToShuffle(SDNode *N); -    SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask, -                                  SDValue VecIn1, SDValue VecIn2, -                                  unsigned LeftIdx); +    SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, +                                  ArrayRef<int> VectorMask, SDValue VecIn1, +                                  SDValue VecIn2, unsigned LeftIdx);      SDValue GetDemandedBits(SDValue V, const APInt &Mask); @@ -416,15 +425,12 @@ namespace {      /// Holds a pointer to an 
LSBaseSDNode as well as information on where it      /// is located in a sequence of memory operations connected by a chain.      struct MemOpLink { -      MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): -      MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } +      MemOpLink(LSBaseSDNode *N, int64_t Offset) +          : MemNode(N), OffsetFromBase(Offset) {}        // Ptr to the mem node.        LSBaseSDNode *MemNode;        // Offset from the base ptr.        int64_t OffsetFromBase; -      // What is the sequence number of this mem node. -      // Lowest mem operand in the DAG starts at zero. -      unsigned SequenceNum;      };      /// This is a helper function for visitMUL to check the profitability @@ -435,12 +441,6 @@ namespace {                                       SDValue &AddNode,                                       SDValue &ConstNode); -    /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a -    /// constant build_vector of the stored constant values in Stores. -    SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL, -                                         ArrayRef<MemOpLink> Stores, -                                         SmallVectorImpl<SDValue> &Chains, -                                         EVT Ty) const;      /// This is a helper function for visitAND and visitZERO_EXTEND.  Returns      /// true if the (and (load x) c) pattern matches an extload.  ExtVT returns @@ -451,34 +451,35 @@ namespace {                            EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,                            bool &NarrowLoad); +    /// Helper function for MergeConsecutiveStores which merges the +    /// component store chains. +    SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, +                                unsigned NumStores); +      /// This is a helper function for MergeConsecutiveStores. When the source      /// elements of the consecutive stores are all constants or all extracted      /// vector elements, try to merge them into one larger store. -    /// \return number of stores that were merged into a merged store (always -    /// a prefix of \p StoreNode). -    bool MergeStoresOfConstantsOrVecElts( -        SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, -        bool IsConstantSrc, bool UseVector); +    /// \return True if a merged store was created. +    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, +                                         EVT MemVT, unsigned NumStores, +                                         bool IsConstantSrc, bool UseVector);      /// This is a helper function for MergeConsecutiveStores.      /// Stores that may be merged are placed in StoreNodes. -    /// Loads that may alias with those stores are placed in AliasLoadNodes. -    void getStoreMergeAndAliasCandidates( -        StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, -        SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); +    void getStoreMergeCandidates(StoreSDNode *St, +                                 SmallVectorImpl<MemOpLink> &StoreNodes);      /// Helper function for MergeConsecutiveStores. Checks if      /// Candidate stores have indirect dependency through their      /// operands. 
\return True if safe to merge      bool checkMergeStoreCandidatesForDependencies( -        SmallVectorImpl<MemOpLink> &StoreNodes); +        SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);      /// Merge consecutive store operations into a wide store.      /// This optimization uses wide integers or vectors when possible.      /// \return number of stores that were merged into a merged store (the      /// affected nodes are stored as a prefix in \p StoreNodes). -    bool MergeConsecutiveStores(StoreSDNode *N, -                                SmallVectorImpl<MemOpLink> &StoreNodes); +    bool MergeConsecutiveStores(StoreSDNode *N);      /// \brief Try to transform a truncation where C is a constant:      ///     (trunc (and X, C)) -> (and (trunc X), (trunc C)) @@ -493,6 +494,13 @@ namespace {          : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),            OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {        ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); + +      MaximumLegalStoreInBits = 0; +      for (MVT VT : MVT::all_valuetypes()) +        if (EVT(VT).isSimple() && VT != MVT::Other && +            TLI.isTypeLegal(EVT(VT)) && +            VT.getSizeInBits() >= MaximumLegalStoreInBits) +          MaximumLegalStoreInBits = VT.getSizeInBits();      }      /// Runs the dag combiner on all nodes in the work list @@ -607,10 +615,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,    switch (Op.getOpcode()) {    default: return false; -  case ISD::ConstantFP: -    // Don't invert constant FP values after legalize.  The negated constant -    // isn't necessarily legal. -    return LegalOperations ? 0 : 1; +  case ISD::ConstantFP: { +    if (!LegalOperations) +      return 1; + +    // Don't invert constant FP values after legalization unless the target says +    // the negated constant is legal. +    EVT VT = Op.getValueType(); +    return TLI.isOperationLegal(ISD::ConstantFP, VT) || +      TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT); +  }    case ISD::FADD:      // FIXME: determine better conditions for this xform.      if (!Options->UnsafeFPMath) return 0; @@ -629,7 +643,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,                                Depth + 1);    case ISD::FSUB:      // We can't turn -(A-B) into B-A when we honor signed zeros. 
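The FSUB case spells out why -(A-B) cannot be rewritten as B-A while signed zeros are honored: the two forms already disagree when A == B == 0.0, which is exactly what the NoSignedZerosFPMath / hasNoSignedZeros() check just below guards against. For instance, on any IEEE-754 target:

    #include <cmath>
    #include <cstdio>

    int main() {
      double A = 0.0, B = 0.0;
      double Negated = -(A - B); // -(0.0 - 0.0) == -0.0
      double Swapped = B - A;    //   0.0 - 0.0  == +0.0
      std::printf("%d %d\n", (int)std::signbit(Negated),
                  (int)std::signbit(Swapped)); // prints "1 0"
      return 0;
    }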
-    if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros()) +    if (!Options->NoSignedZerosFPMath && +        !Op.getNode()->getFlags()->hasNoSignedZeros())        return 0;      // fold (fneg (fsub A, B)) -> (fsub B, A) @@ -1079,37 +1094,36 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {    if (TLI.IsDesirableToPromoteOp(Op, PVT)) {      assert(PVT != VT && "Don't know what type to promote to!"); +    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); +      bool Replace0 = false;      SDValue N0 = Op.getOperand(0);      SDValue NN0 = PromoteOperand(N0, PVT, Replace0); -    if (!NN0.getNode()) -      return SDValue();      bool Replace1 = false;      SDValue N1 = Op.getOperand(1); -    SDValue NN1; -    if (N0 == N1) -      NN1 = NN0; -    else { -      NN1 = PromoteOperand(N1, PVT, Replace1); -      if (!NN1.getNode()) -        return SDValue(); -    } +    SDValue NN1 = PromoteOperand(N1, PVT, Replace1); +    SDLoc DL(Op); -    AddToWorklist(NN0.getNode()); -    if (NN1.getNode()) -      AddToWorklist(NN1.getNode()); +    SDValue RV = +        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); -    if (Replace0) +    // New replace instances of N0 and N1 +    if (Replace0 && N0 && N0.getOpcode() != ISD::DELETED_NODE && NN0 && +        NN0.getOpcode() != ISD::DELETED_NODE) { +      AddToWorklist(NN0.getNode());        ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); -    if (Replace1) +    } + +    if (Replace1 && N1 && N1.getOpcode() != ISD::DELETED_NODE && NN1 && +        NN1.getOpcode() != ISD::DELETED_NODE) { +      AddToWorklist(NN1.getNode());        ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); +    } -    DEBUG(dbgs() << "\nPromoting "; -          Op.getNode()->dump(&DAG)); -    SDLoc DL(Op); -    return DAG.getNode(ISD::TRUNCATE, DL, VT, -                       DAG.getNode(Opc, DL, PVT, NN0, NN1)); +    // Deal with Op being deleted. +    if (Op && Op.getOpcode() != ISD::DELETED_NODE) +      return RV;    }    return SDValue();  } @@ -1137,26 +1151,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {    if (TLI.IsDesirableToPromoteOp(Op, PVT)) {      assert(PVT != VT && "Don't know what type to promote to!"); +    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); +      bool Replace = false;      SDValue N0 = Op.getOperand(0); +    SDValue N1 = Op.getOperand(1);      if (Opc == ISD::SRA) -      N0 = SExtPromoteOperand(Op.getOperand(0), PVT); +      N0 = SExtPromoteOperand(N0, PVT);      else if (Opc == ISD::SRL) -      N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); +      N0 = ZExtPromoteOperand(N0, PVT);      else        N0 = PromoteOperand(N0, PVT, Replace); +      if (!N0.getNode())        return SDValue(); +    SDLoc DL(Op); +    SDValue RV = +        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); +      AddToWorklist(N0.getNode());      if (Replace)        ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); -    DEBUG(dbgs() << "\nPromoting "; -          Op.getNode()->dump(&DAG)); -    SDLoc DL(Op); -    return DAG.getNode(ISD::TRUNCATE, DL, VT, -                       DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1))); +    // Deal with Op being deleted. 
+    if (Op && Op.getOpcode() != ISD::DELETED_NODE) +      return RV;    }    return SDValue();  } @@ -1361,8 +1381,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {      else {        assert(N->getValueType(0) == RV.getValueType() &&               N->getNumValues() == 1 && "Type mismatch"); -      SDValue OpV = RV; -      DAG.ReplaceAllUsesWith(N, &OpV); +      DAG.ReplaceAllUsesWith(N, &RV);      }      // Push the new node and any users onto the worklist @@ -1389,7 +1408,9 @@ SDValue DAGCombiner::visit(SDNode *N) {    case ISD::ADD:                return visitADD(N);    case ISD::SUB:                return visitSUB(N);    case ISD::ADDC:               return visitADDC(N); +  case ISD::UADDO:              return visitUADDO(N);    case ISD::SUBC:               return visitSUBC(N); +  case ISD::USUBO:              return visitUSUBO(N);    case ISD::ADDE:               return visitADDE(N);    case ISD::SUBE:               return visitSUBE(N);    case ISD::MUL:                return visitMUL(N); @@ -1415,6 +1436,7 @@ SDValue DAGCombiner::visit(SDNode *N) {    case ISD::SRL:                return visitSRL(N);    case ISD::ROTR:    case ISD::ROTL:               return visitRotate(N); +  case ISD::ABS:                return visitABS(N);    case ISD::BSWAP:              return visitBSWAP(N);    case ISD::BITREVERSE:         return visitBITREVERSE(N);    case ISD::CTLZ:               return visitCTLZ(N); @@ -1430,6 +1452,7 @@ SDValue DAGCombiner::visit(SDNode *N) {    case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);    case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);    case ISD::ANY_EXTEND:         return visitANY_EXTEND(N); +  case ISD::AssertZext:         return visitAssertZext(N);    case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);    case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);    case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); @@ -1574,7 +1597,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {    }    SmallVector<SDNode *, 8> TFs;     // List of token factors to visit. -  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor. +  SmallVector<SDValue, 8> Ops;      // Ops for replacing token factor.    SmallPtrSet<SDNode*, 16> SeenOps;    bool Changed = false;             // If we should replace this token factor. @@ -1618,6 +1641,86 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {      }    } +  // Remove Nodes that are chained to another node in the list. Do so +  // by walking up chains breath-first stopping when we've seen +  // another operand. In general we must climb to the EntryNode, but we can exit +  // early if we find all remaining work is associated with just one operand as +  // no further pruning is possible. + +  // List of nodes to search through and original Ops from which they originate. +  SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist; +  SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op. +  SmallPtrSet<SDNode *, 16> SeenChains; +  bool DidPruneOps = false; + +  unsigned NumLeftToConsider = 0; +  for (const SDValue &Op : Ops) { +    Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); +    OpWorkCount.push_back(1); +  } + +  auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { +    // If this is an Op, we can remove the op from the list. Remark any +    // search associated with it as from the current OpNumber. 
+    if (SeenOps.count(Op) != 0) { +      Changed = true; +      DidPruneOps = true; +      unsigned OrigOpNumber = 0; +      while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op) +        OrigOpNumber++; +      assert((OrigOpNumber != Ops.size()) && +             "expected to find TokenFactor Operand"); +      // Re-mark worklist from OrigOpNumber to OpNumber +      for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) { +        if (Worklist[i].second == OrigOpNumber) { +          Worklist[i].second = OpNumber; +        } +      } +      OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber]; +      OpWorkCount[OrigOpNumber] = 0; +      NumLeftToConsider--; +    } +    // Add if it's a new chain +    if (SeenChains.insert(Op).second) { +      OpWorkCount[OpNumber]++; +      Worklist.push_back(std::make_pair(Op, OpNumber)); +    } +  }; + +  for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) { +    // We need at least be consider at least 2 Ops to prune. +    if (NumLeftToConsider <= 1) +      break; +    auto CurNode = Worklist[i].first; +    auto CurOpNumber = Worklist[i].second; +    assert((OpWorkCount[CurOpNumber] > 0) && +           "Node should not appear in worklist"); +    switch (CurNode->getOpcode()) { +    case ISD::EntryToken: +      // Hitting EntryToken is the only way for the search to terminate without +      // hitting +      // another operand's search. Prevent us from marking this operand +      // considered. +      NumLeftToConsider++; +      break; +    case ISD::TokenFactor: +      for (const SDValue &Op : CurNode->op_values()) +        AddToWorklist(i, Op.getNode(), CurOpNumber); +      break; +    case ISD::CopyFromReg: +    case ISD::CopyToReg: +      AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); +      break; +    default: +      if (auto *MemNode = dyn_cast<MemSDNode>(CurNode)) +        AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); +      break; +    } +    OpWorkCount[CurOpNumber]--; +    if (OpWorkCount[CurOpNumber] == 0) +      NumLeftToConsider--; +  } +    SDValue Result;    // If we've changed things around then replace token factor. @@ -1626,15 +1729,22 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {        // The entry token is the only possible outcome.        Result = DAG.getEntryNode();      } else { -      // New and improved token factor. -      Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); +      if (DidPruneOps) { +        SmallVector<SDValue, 8> PrunedOps; +        // +        for (const SDValue &Op : Ops) { +          if (SeenChains.count(Op.getNode()) == 0) +            PrunedOps.push_back(Op); +        } +        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps); +      } else { +        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); +      }      } -    // Add users to worklist if AA is enabled, since it may introduce -    // a lot of new chained token factors while removing memory deps. -    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -      : DAG.getSubtarget().useAA(); -    return CombineTo(N, Result, UseAA /*add to worklist*/); +    // Add users to worklist, since we may introduce a lot of new +    // chained token factors while removing memory deps. +    return CombineTo(N, Result, true /*add to worklist*/);    }    return Result; @@ -1664,6 +1774,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {    return Const != nullptr && !Const->isOpaque() ? 
Const : nullptr;  } +SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { +  auto BinOpcode = BO->getOpcode(); +  assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || +          BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || +          BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || +          BinOpcode == ISD::UREM || BinOpcode == ISD::AND || +          BinOpcode == ISD::OR || BinOpcode == ISD::XOR || +          BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || +          BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || +          BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || +          BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && +         "Unexpected binary operator"); + +  // Bail out if any constants are opaque because we can't constant fold those. +  SDValue C1 = BO->getOperand(1); +  if (!isConstantOrConstantVector(C1, true) && +      !isConstantFPBuildVectorOrConstantFP(C1)) +    return SDValue(); + +  // Don't do this unless the old select is going away. We want to eliminate the +  // binary operator, not replace a binop with a select. +  // TODO: Handle ISD::SELECT_CC. +  SDValue Sel = BO->getOperand(0); +  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) +    return SDValue(); + +  SDValue CT = Sel.getOperand(1); +  if (!isConstantOrConstantVector(CT, true) && +      !isConstantFPBuildVectorOrConstantFP(CT)) +    return SDValue(); + +  SDValue CF = Sel.getOperand(2); +  if (!isConstantOrConstantVector(CF, true) && +      !isConstantFPBuildVectorOrConstantFP(CF)) +    return SDValue(); + +  // We have a select-of-constants followed by a binary operator with a +  // constant. Eliminate the binop by pulling the constant math into the select. +  // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1 +  EVT VT = Sel.getValueType(); +  SDLoc DL(Sel); +  SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1); +  assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) || +          isConstantFPBuildVectorOrConstantFP(NewCT)) && +         "Failed to constant fold a binop with constant operands"); + +  SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1); +  assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) || +          isConstantFPBuildVectorOrConstantFP(NewCF)) && +         "Failed to constant fold a binop with constant operands"); + +  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); +} +  SDValue DAGCombiner::visitADD(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -1712,6 +1876,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {        }    } +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // reassociate add    if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))      return RADD; @@ -1774,6 +1941,19 @@ SDValue DAGCombiner::visitADD(SDNode *N) {        VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1))      return DAG.getNode(ISD::OR, DL, VT, N0, N1); +  if (SDValue Combined = visitADDLike(N0, N1, N)) +    return Combined; + +  if (SDValue Combined = visitADDLike(N1, N0, N)) +    return Combined; + +  return SDValue(); +} + +SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { +  EVT VT = N0.getValueType(); +  SDLoc DL(LocReference); +    // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))    if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&        isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) @@ -1781,12 +1961,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) 
{                         DAG.getNode(ISD::SHL, DL, VT,                                     N1.getOperand(0).getOperand(1),                                     N1.getOperand(1))); -  if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB && -      isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0))) -    return DAG.getNode(ISD::SUB, DL, VT, N1, -                       DAG.getNode(ISD::SHL, DL, VT, -                                   N0.getOperand(0).getOperand(1), -                                   N0.getOperand(1)));    if (N1.getOpcode() == ISD::AND) {      SDValue AndOp0 = N1.getOperand(0); @@ -1797,7 +1971,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {      // and similar xforms where the inner op is either ~0 or 0.      if (NumSignBits == DestBits &&          isOneConstantOrOneSplatConstant(N1->getOperand(1))) -      return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0); +      return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);    }    // add (sext i1), X -> sub X, (zext i1) @@ -1825,39 +1999,61 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1);    EVT VT = N0.getValueType(); +  SDLoc DL(N);    // If the flag result is dead, turn this into an ADD.    if (!N->hasAnyUseOfValue(1)) -    return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1), -                     DAG.getNode(ISD::CARRY_FALSE, -                                 SDLoc(N), MVT::Glue)); +    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), +                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));    // canonicalize constant to RHS.    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);    if (N0C && !N1C) -    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0); +    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);    // fold (addc x, 0) -> x + no carry out    if (isNullConstant(N1))      return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, -                                        SDLoc(N), MVT::Glue)); +                                        DL, MVT::Glue)); -  // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. -  APInt LHSZero, LHSOne; -  APInt RHSZero, RHSOne; -  DAG.computeKnownBits(N0, LHSZero, LHSOne); +  // If it cannot overflow, transform into an add. +  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) +    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), +                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); -  if (LHSZero.getBoolValue()) { -    DAG.computeKnownBits(N1, RHSZero, RHSOne); +  return SDValue(); +} -    // If all possibly-set bits on the LHS are clear on the RHS, return an OR. -    // If all possibly-set bits on the RHS are clear on the LHS, return an OR. -    if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) -      return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1), -                       DAG.getNode(ISD::CARRY_FALSE, -                                   SDLoc(N), MVT::Glue)); -  } +SDValue DAGCombiner::visitUADDO(SDNode *N) { +  SDValue N0 = N->getOperand(0); +  SDValue N1 = N->getOperand(1); +  EVT VT = N0.getValueType(); +  if (VT.isVector()) +    return SDValue(); + +  EVT CarryVT = N->getValueType(1); +  SDLoc DL(N); + +  // If the flag result is dead, turn this into an ADD. 
+  if (!N->hasAnyUseOfValue(1)) +    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), +                     DAG.getUNDEF(CarryVT)); + +  // canonicalize constant to RHS. +  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); +  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); +  if (N0C && !N1C) +    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); + +  // fold (uaddo x, 0) -> x + no carry out +  if (isNullConstant(N1)) +    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + +  // If it cannot overflow, transform into an add. +  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) +    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), +                     DAG.getConstant(0, DL, CarryVT));    return SDValue();  } @@ -1920,6 +2116,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {                                        N1.getNode());    } +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);    // fold (sub x, c) -> (add x, -c) @@ -2066,6 +2265,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {    return SDValue();  } +SDValue DAGCombiner::visitUSUBO(SDNode *N) { +  SDValue N0 = N->getOperand(0); +  SDValue N1 = N->getOperand(1); +  EVT VT = N0.getValueType(); +  if (VT.isVector()) +    return SDValue(); + +  EVT CarryVT = N->getValueType(1); +  SDLoc DL(N); + +  // If the flag result is dead, turn this into an SUB. +  if (!N->hasAnyUseOfValue(1)) +    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), +                     DAG.getUNDEF(CarryVT)); + +  // fold (usubo x, x) -> 0 + no borrow +  if (N0 == N1) +    return CombineTo(N, DAG.getConstant(0, DL, VT), +                     DAG.getConstant(0, DL, CarryVT)); + +  // fold (usubo x, 0) -> x + no borrow +  if (isNullConstant(N1)) +    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + +  // Canonicalize (usubo -1, x) -> ~x, i.e. 
(xor x, -1) + no borrow +  if (isAllOnesConstant(N0)) +    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), +                     DAG.getConstant(0, DL, CarryVT)); + +  return SDValue(); +} +  SDValue DAGCombiner::visitSUBE(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -2131,6 +2362,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {    // fold (mul x, 1) -> x    if (N1IsConst && ConstValue1 == 1 && IsFullSplat)      return N0; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (mul x, -1) -> 0-x    if (N1IsConst && ConstValue1.isAllOnesValue()) {      SDLoc DL(N); @@ -2297,6 +2532,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {    return combined;  } +static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { +  SDValue N0 = N->getOperand(0); +  SDValue N1 = N->getOperand(1); +  EVT VT = N->getValueType(0); +  SDLoc DL(N); + +  if (DAG.isUndef(N->getOpcode(), {N0, N1})) +    return DAG.getUNDEF(VT); + +  // undef / X -> 0 +  // undef % X -> 0 +  if (N0.isUndef()) +    return DAG.getConstant(0, DL, VT); + +  return SDValue(); +} +  SDValue DAGCombiner::visitSDIV(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -2319,8 +2571,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {      return N0;    // fold (sdiv X, -1) -> 0-X    if (N1C && N1C->isAllOnesValue()) -    return DAG.getNode(ISD::SUB, DL, VT, -                       DAG.getConstant(0, DL, VT), N0); +    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); + +  if (SDValue V = simplifyDivRem(N, DAG)) +    return V; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel;    // If we know the sign bits of both operands are zero, strength reduce to a    // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2 @@ -2372,7 +2629,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {    // If integer divide is expensive and we satisfy the requirements, emit an    // alternate sequence.  Targets may check function attributes for size/speed    // trade-offs. 
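As a rough source-level sketch (illustrative C only, not code from this patch) of what the new foldBinOpIntoSelect hook does now that it is called from visitADD, visitSUB, visitMUL and the division visitors above: a binary operator whose first operand is a single-use select of constants and whose second operand is another constant is eliminated by folding the constant math into both arms of the select.

    // add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
    int before(bool c) { return (c ? 2 : 5) + 7; }  // select feeding an add
    int after(bool c)  { return  c ? 9 : 12; }      // add folded into both arms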
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); +  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();    if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))      if (SDValue Op = BuildSDIV(N))        return Op; @@ -2384,13 +2641,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {      if (SDValue DivRem = useDivRem(N))          return DivRem; -  // undef / X -> 0 -  if (N0.isUndef()) -    return DAG.getConstant(0, DL, VT); -  // X / undef -> undef -  if (N1.isUndef()) -    return N1; -    return SDValue();  } @@ -2414,6 +2664,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {                                                      N0C, N1C))        return Folded; +  if (SDValue V = simplifyDivRem(N, DAG)) +    return V; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (udiv x, (1 << c)) -> x >>u c    if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&        DAG.isKnownToBeAPowerOfTwo(N1)) { @@ -2444,7 +2700,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {    }    // fold (udiv x, c) -> alternate -  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); +  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();    if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))      if (SDValue Op = BuildUDIV(N))        return Op; @@ -2456,13 +2712,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {      if (SDValue DivRem = useDivRem(N))          return DivRem; -  // undef / X -> 0 -  if (N0.isUndef()) -    return DAG.getConstant(0, DL, VT); -  // X / undef -> undef -  if (N1.isUndef()) -    return N1; -    return SDValue();  } @@ -2482,32 +2731,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) {      if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))        return Folded; +  if (SDValue V = simplifyDivRem(N, DAG)) +    return V; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    if (isSigned) {      // If we know the sign bits of both operands are zero, strength reduce to a      // urem instead.  
Handles (X & 0x0FFFFFFF) %s 16 -> X&15      if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))        return DAG.getNode(ISD::UREM, DL, VT, N0, N1);    } else { -    // fold (urem x, pow2) -> (and x, pow2-1) +    SDValue NegOne = DAG.getAllOnesConstant(DL, VT);      if (DAG.isKnownToBeAPowerOfTwo(N1)) { -      APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); -      SDValue Add = -          DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); +      // fold (urem x, pow2) -> (and x, pow2-1) +      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);        AddToWorklist(Add.getNode());        return DAG.getNode(ISD::AND, DL, VT, N0, Add);      } -    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))      if (N1.getOpcode() == ISD::SHL &&          DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { -      APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); -      SDValue Add = -          DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); +      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) +      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);        AddToWorklist(Add.getNode());        return DAG.getNode(ISD::AND, DL, VT, N0, Add);      }    } -  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); +  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();    // If X/C can be simplified by the division-by-constant logic, lower    // X%C to the equivalent of X-X/C*C. @@ -2536,13 +2788,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {    if (SDValue DivRem = useDivRem(N))      return DivRem.getValue(1); -  // undef % X -> 0 -  if (N0.isUndef()) -    return DAG.getConstant(0, DL, VT); -  // X % undef -> undef -  if (N1.isUndef()) -    return N1; -    return SDValue();  } @@ -2932,95 +3177,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {    return SDValue();  } +/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. +SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, +                                       const SDLoc &DL) { +  SDValue LL, LR, RL, RR, N0CC, N1CC; +  if (!isSetCCEquivalent(N0, LL, LR, N0CC) || +      !isSetCCEquivalent(N1, RL, RR, N1CC)) +    return SDValue(); + +  assert(N0.getValueType() == N1.getValueType() && +         "Unexpected operand types for bitwise logic op"); +  assert(LL.getValueType() == LR.getValueType() && +         RL.getValueType() == RR.getValueType() && +         "Unexpected operand types for setcc"); + +  // If we're here post-legalization or the logic op type is not i1, the logic +  // op type must match a setcc result type. Also, all folds require new +  // operations on the left and right operands, so those types must match. +  EVT VT = N0.getValueType(); +  EVT OpVT = LL.getValueType(); +  if (LegalOperations || VT != MVT::i1) +    if (VT != getSetCCResultType(OpVT)) +      return SDValue(); +  if (OpVT != RL.getValueType()) +    return SDValue(); + +  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get(); +  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get(); +  bool IsInteger = OpVT.isInteger(); +  if (LR == RR && CC0 == CC1 && IsInteger) { +    bool IsZero = isNullConstantOrNullSplatConstant(LR); +    bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); + +    // All bits clear? +    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; +    // All sign bits clear? 
+    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; +    // Any bits set? +    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; +    // Any sign bits set? +    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; + +    // (and (seteq X,  0), (seteq Y,  0)) --> (seteq (or X, Y),  0) +    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) +    // (or  (setne X,  0), (setne Y,  0)) --> (setne (or X, Y),  0) +    // (or  (setlt X,  0), (setlt Y,  0)) --> (setlt (or X, Y),  0) +    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { +      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); +      AddToWorklist(Or.getNode()); +      return DAG.getSetCC(DL, VT, Or, LR, CC1); +    } + +    // All bits set? +    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; +    // All sign bits set? +    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; +    // Any bits clear? +    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; +    // Any sign bits clear? +    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; + +    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) +    // (and (setlt X,  0), (setlt Y,  0)) --> (setlt (and X, Y),  0) +    // (or  (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) +    // (or  (setgt X, -1), (setgt Y  -1)) --> (setgt (and X, Y), -1) +    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { +      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); +      AddToWorklist(And.getNode()); +      return DAG.getSetCC(DL, VT, And, LR, CC1); +    } +  } + +  // TODO: What is the 'or' equivalent of this fold? +  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) +  if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE && +      ((isNullConstant(LR) && isAllOnesConstant(RR)) || +       (isAllOnesConstant(LR) && isNullConstant(RR)))) { +    SDValue One = DAG.getConstant(1, DL, OpVT); +    SDValue Two = DAG.getConstant(2, DL, OpVT); +    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); +    AddToWorklist(Add.getNode()); +    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); +  } + +  // Try more general transforms if the predicates match and the only user of +  // the compares is the 'and' or 'or'. +  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && +      N0.hasOneUse() && N1.hasOneUse()) { +    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 +    // or  (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 +    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { +      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); +      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); +      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); +      SDValue Zero = DAG.getConstant(0, DL, OpVT); +      return DAG.getSetCC(DL, VT, Or, Zero, CC1); +    } +  } + +  // Canonicalize equivalent operands to LL == RL. +  if (LL == RR && LR == RL) { +    CC1 = ISD::getSetCCSwappedOperands(CC1); +    std::swap(RL, RR); +  } + +  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) +  // (or  (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) +  if (LL == RL && LR == RR) { +    ISD::CondCode NewCC = IsAnd ? 
ISD::getSetCCAndOperation(CC0, CC1, IsInteger) +                                : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); +    if (NewCC != ISD::SETCC_INVALID && +        (!LegalOperations || +         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && +          TLI.isOperationLegal(ISD::SETCC, OpVT)))) +      return DAG.getSetCC(DL, VT, LL, LR, NewCC); +  } + +  return SDValue(); +} +  /// This contains all DAGCombine rules which reduce two values combined by  /// an And operation to a single value. This makes them reusable in the context  /// of visitSELECT(). Rules involving constants are not included as  /// visitSELECT() already handles those cases. -SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, -                                  SDNode *LocReference) { +SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {    EVT VT = N1.getValueType(); +  SDLoc DL(N);    // fold (and x, undef) -> 0    if (N0.isUndef() || N1.isUndef()) -    return DAG.getConstant(0, SDLoc(LocReference), VT); -  // fold (and (setcc x), (setcc y)) -> (setcc (and x, y)) -  SDValue LL, LR, RL, RR, CC0, CC1; -  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ -    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); -    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - -    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 && -        LL.getValueType().isInteger()) { -      // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0) -      if (isNullConstant(LR) && Op1 == ISD::SETEQ) { -        EVT CCVT = getSetCCResultType(LR.getValueType()); -        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -          SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), -                                       LR.getValueType(), LL, RL); -          AddToWorklist(ORNode.getNode()); -          return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); -        } -      } -      if (isAllOnesConstant(LR)) { -        // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) -        if (Op1 == ISD::SETEQ) { -          EVT CCVT = getSetCCResultType(LR.getValueType()); -          if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -            SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), -                                          LR.getValueType(), LL, RL); -            AddToWorklist(ANDNode.getNode()); -            return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); -          } -        } -        // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) -        if (Op1 == ISD::SETGT) { -          EVT CCVT = getSetCCResultType(LR.getValueType()); -          if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -            SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), -                                         LR.getValueType(), LL, RL); -            AddToWorklist(ORNode.getNode()); -            return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); -          } -        } -      } -    } -    // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2) -    if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) && -        Op0 == Op1 && LL.getValueType().isInteger() && -      Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || -                            (isAllOnesConstant(LR) && isNullConstant(RR)))) { -      EVT CCVT = getSetCCResultType(LL.getValueType()); -      if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -        SDLoc 
DL(N0); -        SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(), -                                      LL, DAG.getConstant(1, DL, -                                                          LL.getValueType())); -        AddToWorklist(ADDNode.getNode()); -        return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode, -                            DAG.getConstant(2, DL, LL.getValueType()), -                            ISD::SETUGE); -      } -    } -    // canonicalize equivalent to ll == rl -    if (LL == RR && LR == RL) { -      Op1 = ISD::getSetCCSwappedOperands(Op1); -      std::swap(RL, RR); -    } -    if (LL == RL && LR == RR) { -      bool isInteger = LL.getValueType().isInteger(); -      ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); -      if (Result != ISD::SETCC_INVALID && -          (!LegalOperations || -           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && -            TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { -        EVT CCVT = getSetCCResultType(LL.getValueType()); -        if (N0.getValueType() == CCVT || -            (!LegalOperations && N0.getValueType() == MVT::i1)) -          return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), -                              LL, LR, Result); -      } -    } -  } +    return DAG.getConstant(0, DL, VT); + +  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) +    return V;    if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&        VT.getSizeInBits() <= 64) { @@ -3037,13 +3326,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,            if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {              ADDC |= Mask;              if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { -              SDLoc DL(N0); +              SDLoc DL0(N0);                SDValue NewAdd = -                DAG.getNode(ISD::ADD, DL, VT, +                DAG.getNode(ISD::ADD, DL0, VT,                              N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));                CombineTo(N0.getNode(), NewAdd);                // Return N so it doesn't get rechecked! -              return SDValue(LocReference, 0); +              return SDValue(N, 0);              }            }          } @@ -3068,7 +3357,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,          unsigned MaskBits = AndMask.countTrailingOnes();          EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); -        if (APIntOps::isMask(AndMask) && +        if (AndMask.isMask() &&              // Required bits must not span the two halves of the integer and              // must fit in the half size type.              
(ShiftBits + MaskBits <= Size / 2) && @@ -3108,7 +3397,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,                                     bool &NarrowLoad) {    uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); -  if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) +  if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))      return false;    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -3191,6 +3480,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {    if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),                                     APInt::getAllOnesValue(BitWidth)))      return DAG.getConstant(0, SDLoc(N), VT); + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // reassociate and    if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))      return RAND; @@ -3299,6 +3592,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {        // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to        // preserve semantics once we get rid of the AND.        SDValue NewLoad(Load, 0); + +      // Fold the AND away. NewLoad may get replaced immediately. +      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); +        if (Load->getExtensionType() == ISD::EXTLOAD) {          NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,                                Load->getValueType(0), SDLoc(Load), @@ -3316,10 +3613,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {          }        } -      // Fold the AND away, taking care not to fold to the old load node if we -      // replaced it. -      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); -        return SDValue(N, 0); // Return N so it doesn't get rechecked!      }    } @@ -3723,65 +4016,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {  /// This contains all DAGCombine rules which reduce two values combined by  /// an Or operation to a single value \see visitANDLike(). -SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { +SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {    EVT VT = N1.getValueType(); +  SDLoc DL(N); +    // fold (or x, undef) -> -1 -  if (!LegalOperations && -      (N0.isUndef() || N1.isUndef())) { -    EVT EltVT = VT.isVector() ? 
VT.getVectorElementType() : VT; -    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), -                           SDLoc(LocReference), VT); -  } -  // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) -  SDValue LL, LR, RL, RR, CC0, CC1; -  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ -    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); -    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - -    if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) { -      // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0) -      // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0) -      if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { -        EVT CCVT = getSetCCResultType(LR.getValueType()); -        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -          SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), -                                       LR.getValueType(), LL, RL); -          AddToWorklist(ORNode.getNode()); -          return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); -        } -      } -      // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) -      // fold (or (setgt X, -1), (setgt Y  -1)) -> (setgt (and X, Y), -1) -      if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { -        EVT CCVT = getSetCCResultType(LR.getValueType()); -        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { -          SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), -                                        LR.getValueType(), LL, RL); -          AddToWorklist(ANDNode.getNode()); -          return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); -        } -      } -    } -    // canonicalize equivalent to ll == rl -    if (LL == RR && LR == RL) { -      Op1 = ISD::getSetCCSwappedOperands(Op1); -      std::swap(RL, RR); -    } -    if (LL == RL && LR == RR) { -      bool isInteger = LL.getValueType().isInteger(); -      ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); -      if (Result != ISD::SETCC_INVALID && -          (!LegalOperations || -           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && -            TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { -        EVT CCVT = getSetCCResultType(LL.getValueType()); -        if (N0.getValueType() == CCVT || -            (!LegalOperations && N0.getValueType() == MVT::i1)) -          return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), -                              LL, LR, Result); -      } -    } -  } +  if (!LegalOperations && (N0.isUndef() || N1.isUndef())) +    return DAG.getAllOnesConstant(DL, VT); + +  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) +    return V;    // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.    
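For readers skimming the visitANDLike/visitORLike rewrite, the setcc folds that the shared foldLogicOfSetCCs helper now performs for both AND and OR correspond to source patterns like the following (a sketch, not taken from the patch; the last form additionally requires one-use compares and the convertSetCCLogicToBitwiseLogic target hook):

    // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
    bool f1(unsigned a, unsigned b) { return a == 0 && b == 0; }  // --> (a | b) == 0
    // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
    bool f2(unsigned a, unsigned b) { return a != 0 || b != 0; }  // --> (a | b) != 0
    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
    bool f3(unsigned a, unsigned b, unsigned c, unsigned d) {
      return a == b && c == d;  // --> ((a ^ b) | (c ^ d)) == 0
    }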
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && @@ -3802,7 +4046,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {              DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {            SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,                                    N0.getOperand(0), N1.getOperand(0)); -          SDLoc DL(LocReference);            return DAG.getNode(ISD::AND, DL, VT, X,                               DAG.getConstant(LHSMask | RHSMask, DL, VT));          } @@ -3818,7 +4061,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {        (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {      SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,                              N0.getOperand(1), N1.getOperand(1)); -    return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X); +    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);    }    return SDValue(); @@ -3847,14 +4090,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {      // fold (or x, -1) -> -1, vector edition      if (ISD::isBuildVectorAllOnes(N0.getNode()))        // do not return N0, because undef node may exist in N0 -      return DAG.getConstant( -          APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N), -          N0.getValueType()); +      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());      if (ISD::isBuildVectorAllOnes(N1.getNode()))        // do not return N1, because undef node may exist in N1 -      return DAG.getConstant( -          APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N), -          N1.getValueType()); +      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());      // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)      // Do this only if the resulting shuffle is legal. @@ -3867,7 +4106,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {        bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());        bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());        // Ensure both shuffles have a zero input. -      if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) { +      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {          assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");          assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");          const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); @@ -3939,6 +4178,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    // fold (or x, -1) -> -1    if (isAllOnesConstant(N1))      return N1; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (or x, c) -> c iff (x & ~c) == 0    if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))      return N1; @@ -3956,7 +4199,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))      return ROR;    // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) -  // iff (c1 & c2) == 0. +  // iff (c1 & c2) != 0.    if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&               isa<ConstantSDNode>(N0.getOperand(1))) {      ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1)); @@ -3978,6 +4221,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))      return SDValue(Rot, 0); +  if (SDValue Load = MatchLoadCombine(N)) +    return Load; +    // Simplify the operands using demanded-bits information.    
if (!VT.isVector() &&        SimplifyDemandedBits(SDValue(N, 0))) @@ -4190,8 +4436,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {      // If there is an AND of either shifted operand, apply it to the result.      if (LHSMask.getNode() || RHSMask.getNode()) { -      APInt AllBits = APInt::getAllOnesValue(EltSizeInBits); -      SDValue Mask = DAG.getConstant(AllBits, DL, VT); +      SDValue Mask = DAG.getAllOnesConstant(DL, VT);        if (LHSMask.getNode()) {          APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); @@ -4349,6 +4594,299 @@ struct BaseIndexOffset {  };  } // namespace +namespace { +/// Represents known origin of an individual byte in load combine pattern. The +/// value of the byte is either constant zero or comes from memory. +struct ByteProvider { +  // For constant zero providers Load is set to nullptr. For memory providers +  // Load represents the node which loads the byte from memory. +  // ByteOffset is the offset of the byte in the value produced by the load. +  LoadSDNode *Load; +  unsigned ByteOffset; + +  ByteProvider() : Load(nullptr), ByteOffset(0) {} + +  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { +    return ByteProvider(Load, ByteOffset); +  } +  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } + +  bool isConstantZero() const { return !Load; } +  bool isMemory() const { return Load; } + +  bool operator==(const ByteProvider &Other) const { +    return Other.Load == Load && Other.ByteOffset == ByteOffset; +  } + +private: +  ByteProvider(LoadSDNode *Load, unsigned ByteOffset) +      : Load(Load), ByteOffset(ByteOffset) {} +}; + +/// Recursively traverses the expression calculating the origin of the requested +/// byte of the given value. Returns None if the provider can't be calculated. +/// +/// For all the values except the root of the expression verifies that the value +/// has exactly one use and if it's not true return None. This way if the origin +/// of the byte is returned it's guaranteed that the values which contribute to +/// the byte are not used outside of this expression. +/// +/// Because the parts of the expression are not allowed to have more than one +/// use this function iterates over trees, not DAGs. So it never visits the same +/// node more than once. 
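To make the ByteProvider abstraction concrete, here is a hand-worked trace (a sketch, not part of the patch) for a value assembled from two adjacent byte loads; the zero-extends fill the upper bytes, so those bytes report as constant zero:

    // Op = or (zext (load i8 p)   to i32),
    //          (shl (zext (load i8 p+1) to i32), 8)
    // calculateByteProvider(Op, 0) --> { Load = load p,   ByteOffset = 0 }
    // calculateByteProvider(Op, 1) --> { Load = load p+1, ByteOffset = 0 }
    // calculateByteProvider(Op, 2) --> constant zero
    // calculateByteProvider(Op, 3) --> constant zero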
+const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index, +                                                   unsigned Depth, +                                                   bool Root = false) { +  // Typical i64 by i8 pattern requires recursion up to 8 calls depth +  if (Depth == 10) +    return None; + +  if (!Root && !Op.hasOneUse()) +    return None; + +  assert(Op.getValueType().isScalarInteger() && "can't handle other types"); +  unsigned BitWidth = Op.getValueSizeInBits(); +  if (BitWidth % 8 != 0) +    return None; +  unsigned ByteWidth = BitWidth / 8; +  assert(Index < ByteWidth && "invalid index requested"); +  (void) ByteWidth; + +  switch (Op.getOpcode()) { +  case ISD::OR: { +    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); +    if (!LHS) +      return None; +    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); +    if (!RHS) +      return None; + +    if (LHS->isConstantZero()) +      return RHS; +    if (RHS->isConstantZero()) +      return LHS; +    return None; +  } +  case ISD::SHL: { +    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); +    if (!ShiftOp) +      return None; + +    uint64_t BitShift = ShiftOp->getZExtValue(); +    if (BitShift % 8 != 0) +      return None; +    uint64_t ByteShift = BitShift / 8; + +    return Index < ByteShift +               ? ByteProvider::getConstantZero() +               : calculateByteProvider(Op->getOperand(0), Index - ByteShift, +                                       Depth + 1); +  } +  case ISD::ANY_EXTEND: +  case ISD::SIGN_EXTEND: +  case ISD::ZERO_EXTEND: { +    SDValue NarrowOp = Op->getOperand(0); +    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); +    if (NarrowBitWidth % 8 != 0) +      return None; +    uint64_t NarrowByteWidth = NarrowBitWidth / 8; + +    if (Index >= NarrowByteWidth) +      return Op.getOpcode() == ISD::ZERO_EXTEND +                 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) +                 : None; +    return calculateByteProvider(NarrowOp, Index, Depth + 1); +  } +  case ISD::BSWAP: +    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, +                                 Depth + 1); +  case ISD::LOAD: { +    auto L = cast<LoadSDNode>(Op.getNode()); +    if (L->isVolatile() || L->isIndexed()) +      return None; + +    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); +    if (NarrowBitWidth % 8 != 0) +      return None; +    uint64_t NarrowByteWidth = NarrowBitWidth / 8; + +    if (Index >= NarrowByteWidth) +      return L->getExtensionType() == ISD::ZEXTLOAD +                 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) +                 : None; +    return ByteProvider::getMemory(L, Index); +  } +  } + +  return None; +} +} // namespace + +/// Match a pattern where a wide type scalar value is loaded by several narrow +/// loads and combined by shifts and ors. Fold it into a single load or a load +/// and a BSWAP if the targets supports it. +/// +/// Assuming little endian target: +///  i8 *a = ... +///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) +/// => +///  i32 val = *((i32)a) +/// +///  i8 *a = ... +///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] +/// => +///  i32 val = BSWAP(*((i32)a)) +/// +/// TODO: This rule matches complex patterns with OR node roots and doesn't +/// interact well with the worklist mechanism. When a part of the pattern is +/// updated (e.g. 
one of the loads) its direct users are put into the worklist, +/// but the root node of the pattern which triggers the load combine is not +/// necessarily a direct user of the changed node. For example, once the address +/// of t28 load is reassociated load combine won't be triggered: +///             t25: i32 = add t4, Constant:i32<2> +///           t26: i64 = sign_extend t25 +///        t27: i64 = add t2, t26 +///       t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64 +///     t29: i32 = zero_extend t28 +///   t32: i32 = shl t29, Constant:i8<8> +/// t33: i32 = or t23, t32 +/// As a possible fix visitLoad can check if the load can be a part of a load +/// combine pattern and add corresponding OR roots to the worklist. +SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { +  assert(N->getOpcode() == ISD::OR && +         "Can only match load combining against OR nodes"); + +  // Handles simple types only +  EVT VT = N->getValueType(0); +  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) +    return SDValue(); +  unsigned ByteWidth = VT.getSizeInBits() / 8; + +  const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +  // Before legalize we can introduce too wide illegal loads which will be later +  // split into legal sized loads. This enables us to combine i64 load by i8 +  // patterns to a couple of i32 loads on 32 bit targets. +  if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) +    return SDValue(); + +  std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = []( +    unsigned BW, unsigned i) { return i; }; +  std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = []( +    unsigned BW, unsigned i) { return BW - i - 1; }; + +  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); +  auto MemoryByteOffset = [&] (ByteProvider P) { +    assert(P.isMemory() && "Must be a memory byte provider"); +    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); +    assert(LoadBitWidth % 8 == 0 && +           "can only analyze providers for individual bytes not bit"); +    unsigned LoadByteWidth = LoadBitWidth / 8; +    return IsBigEndianTarget +            ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) +            : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); +  }; + +  Optional<BaseIndexOffset> Base; +  SDValue Chain; + +  SmallSet<LoadSDNode *, 8> Loads; +  Optional<ByteProvider> FirstByteProvider; +  int64_t FirstOffset = INT64_MAX; + +  // Check if all the bytes of the OR we are looking at are loaded from the same +  // base address. Collect bytes offsets from Base address in ByteOffsets. 
+  SmallVector<int64_t, 4> ByteOffsets(ByteWidth); +  for (unsigned i = 0; i < ByteWidth; i++) { +    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); +    if (!P || !P->isMemory()) // All the bytes must be loaded from memory +      return SDValue(); + +    LoadSDNode *L = P->Load; +    assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && +           "Must be enforced by calculateByteProvider"); +    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); + +    // All loads must share the same chain +    SDValue LChain = L->getChain(); +    if (!Chain) +      Chain = LChain; +    else if (Chain != LChain) +      return SDValue(); + +    // Loads must share the same base address +    BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); +    if (!Base) +      Base = Ptr; +    else if (!Base->equalBaseIndex(Ptr)) +      return SDValue(); + +    // Calculate the offset of the current byte from the base address +    int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P); +    ByteOffsets[i] = ByteOffsetFromBase; + +    // Remember the first byte load +    if (ByteOffsetFromBase < FirstOffset) { +      FirstByteProvider = P; +      FirstOffset = ByteOffsetFromBase; +    } + +    Loads.insert(L); +  } +  assert(Loads.size() > 0 && "All the bytes of the value must be loaded from " +         "memory, so there must be at least one load which produces the value"); +  assert(Base && "Base address of the accessed memory location must be set"); +  assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + +  // Check if the bytes of the OR we are looking at match with either big or +  // little endian value load +  bool BigEndian = true, LittleEndian = true; +  for (unsigned i = 0; i < ByteWidth; i++) { +    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; +    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); +    BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); +    if (!BigEndian && !LittleEndian) +      return SDValue(); +  } +  assert((BigEndian != LittleEndian) && "should be either or"); +  assert(FirstByteProvider && "must be set"); + +  // Ensure that the first byte is loaded from zero offset of the first load. +  // So the combined value can be loaded from the first load address. +  if (MemoryByteOffset(*FirstByteProvider) != 0) +    return SDValue(); +  LoadSDNode *FirstLoad = FirstByteProvider->Load; + +  // The node we are looking at matches with the pattern, check if we can +  // replace it with a single load and bswap if needed. + +  // If the load needs byte swap check if the target supports it +  bool NeedsBswap = IsBigEndianTarget != BigEndian; + +  // Before legalize we can introduce illegal bswaps which will be later +  // converted to an explicit bswap sequence. This way we end up with a single +  // load and byte shuffling instead of several loads and byte shuffling. 
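The byte-order decision that follows reduces to comparing the order recovered from ByteOffsets against the target's endianness (stated here in terms of the local names above, as a summary rather than additional code):

    // NeedsBswap = IsBigEndianTarget != BigEndian
    //   LE target, pattern in LE order  -> plain wide load
    //   LE target, pattern in BE order  -> wide load + BSWAP
    //   BE target, pattern in BE order  -> plain wide load
    //   BE target, pattern in LE order  -> wide load + BSWAP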
+  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) +    return SDValue(); + +  // Check that a load of the wide type is both allowed and fast on the target +  bool Fast = false; +  bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), +                                        VT, FirstLoad->getAddressSpace(), +                                        FirstLoad->getAlignment(), &Fast); +  if (!Allowed || !Fast) +    return SDValue(); + +  SDValue NewLoad = +      DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), +                  FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + +  // Transfer chain users from old loads to the new load. +  for (LoadSDNode *L : Loads) +    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); + +  return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; +} +  SDValue DAGCombiner::visitXOR(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -4386,6 +4924,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {    // fold (xor x, 0) -> x    if (isNullConstant(N1))      return N0; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // reassociate xor    if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))      return RXOR; @@ -4403,9 +4945,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {        default:          llvm_unreachable("Unhandled SetCC Equivalent!");        case ISD::SETCC: -        return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC); +        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);        case ISD::SELECT_CC: -        return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2), +        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),                                 N0.getOperand(3), NotCC);        }      } @@ -4470,6 +5012,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {                                           N01C->getAPIntValue(), DL, VT));      }    } + +  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) +  unsigned OpSizeInBits = VT.getScalarSizeInBits(); +  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && +      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) && +      TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { +    if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) +      if (C->getAPIntValue() == (OpSizeInBits - 1)) +        return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0)); +  } +    // fold (xor x, x) -> 0    if (N0 == N1)      return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); @@ -4673,6 +5226,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {    // fold (shl undef, x) -> 0    if (N0.isUndef())      return DAG.getConstant(0, SDLoc(N), VT); + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // if (shl x, c) is known to be zero, return 0    if (DAG.MaskedValueIsZero(SDValue(N, 0),                              APInt::getAllOnesValue(OpSizeInBits))) @@ -4808,9 +5365,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {    // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))    if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&        isConstantOrConstantVector(N1, /* No Opaques */ true)) { -    unsigned BitSize = VT.getScalarSizeInBits();      SDLoc DL(N); -    SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT); +    SDValue AllBits = DAG.getAllOnesConstant(DL, VT);      SDValue HiBitsMask = 
DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);      return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);    } @@ -4877,6 +5433,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {    // fold (sra x, 0) -> x    if (N1C && N1C->isNullValue())      return N0; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports    // sext_inreg.    if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { @@ -5024,6 +5584,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {    // fold (srl x, 0) -> x    if (N1C && N1C->isNullValue())      return N0; + +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // if (srl x, c) is known to be zero, return 0    if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),                                     APInt::getAllOnesValue(OpSizeInBits))) @@ -5074,9 +5638,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {    if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&        isConstantOrConstantVector(N1, /* NoOpaques */ true)) {      SDLoc DL(N); -    APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits());      SDValue Mask = -        DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1); +        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);      AddToWorklist(Mask.getNode());      return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);    } @@ -5202,6 +5765,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {    return SDValue();  } +SDValue DAGCombiner::visitABS(SDNode *N) { +  SDValue N0 = N->getOperand(0); +  EVT VT = N->getValueType(0); + +  // fold (abs c1) -> c2 +  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) +    return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); +  // fold (abs (abs x)) -> (abs x) +  if (N0.getOpcode() == ISD::ABS) +    return N0; +  // fold (abs x) -> x iff not-negative +  if (DAG.SignBitIsZero(N0)) +    return N0; +  return SDValue(); +} +  SDValue DAGCombiner::visitBSWAP(SDNode *N) {    SDValue N0 = N->getOperand(0);    EVT VT = N->getValueType(0); @@ -5217,7 +5796,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {  SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {    SDValue N0 = N->getOperand(0); +  EVT VT = N->getValueType(0); +  // fold (bitreverse c1) -> c2 +  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) +    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);    // fold (bitreverse (bitreverse x)) -> x    if (N0.getOpcode() == ISD::BITREVERSE)      return N0.getOperand(0); @@ -5311,7 +5894,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,    }  } -// TODO: We should handle other cases of selecting between {-1,0,1} here.  SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {    SDValue Cond = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -5320,6 +5902,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {    EVT CondVT = Cond.getValueType();    SDLoc DL(N); +  if (!VT.isInteger()) +    return SDValue(); + +  auto *C1 = dyn_cast<ConstantSDNode>(N1); +  auto *C2 = dyn_cast<ConstantSDNode>(N2); +  if (!C1 || !C2) +    return SDValue(); + +  // Only do this before legalization to avoid conflicting with target-specific +  // transforms in the other direction (create a select from a zext/sext). There +  // is also a target-independent combine here in DAGCombiner in the other +  // direction for (select Cond, -1, 0) when the condition is not i1. 
+  if (CondVT == MVT::i1 && !LegalOperations) { +    if (C1->isNullValue() && C2->isOne()) { +      // select Cond, 0, 1 --> zext (!Cond) +      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); +      if (VT != MVT::i1) +        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); +      return NotCond; +    } +    if (C1->isNullValue() && C2->isAllOnesValue()) { +      // select Cond, 0, -1 --> sext (!Cond) +      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); +      if (VT != MVT::i1) +        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); +      return NotCond; +    } +    if (C1->isOne() && C2->isNullValue()) { +      // select Cond, 1, 0 --> zext (Cond) +      if (VT != MVT::i1) +        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); +      return Cond; +    } +    if (C1->isAllOnesValue() && C2->isNullValue()) { +      // select Cond, -1, 0 --> sext (Cond) +      if (VT != MVT::i1) +        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); +      return Cond; +    } + +    // For any constants that differ by 1, we can transform the select into an +    // extend and add. Use a target hook because some targets may prefer to +    // transform in the other direction. +    if (TLI.convertSelectOfConstantsToMath()) { +      if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { +        // select Cond, C1, C1-1 --> add (zext Cond), C1-1 +        if (VT != MVT::i1) +          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); +        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); +      } +      if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { +        // select Cond, C1, C1+1 --> add (sext Cond), C1+1 +        if (VT != MVT::i1) +          Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); +        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); +      } +    } + +    return SDValue(); +  } +    // fold (select Cond, 0, 1) -> (xor Cond, 1)    // We can't do this reliably if integer based booleans have different contents    // to floating point based booleans. This is because we can't tell whether we @@ -5329,15 +5972,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {    // undiscoverable (or not reasonably discoverable). For example, it could be    // in another basic block or it could require searching a complicated    // expression. 
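A concrete instance of the select-of-constants cases handled above (a sketch; the adjacent-constant forms require the target to opt in through convertSelectOfConstantsToMath):

    // select Cond, 1, 0      --> zext Cond              c ? 1 : 0  ==>  (int)c
    // select Cond, C1, C1-1  --> add (zext Cond), C1-1  c ? 7 : 6  ==>  (int)c + 6
    // select Cond, C1, C1+1  --> add (sext Cond), C1+1  c ? 6 : 7  ==>  -(int)c + 7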
-  if (VT.isInteger() && -      (CondVT == MVT::i1 || (CondVT.isInteger() && -                             TLI.getBooleanContents(false, true) == -                                 TargetLowering::ZeroOrOneBooleanContent && -                             TLI.getBooleanContents(false, false) == -                                 TargetLowering::ZeroOrOneBooleanContent)) && -      isNullConstant(N1) && isOneConstant(N2)) { -    SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, -                                  DAG.getConstant(1, DL, CondVT)); +  if (CondVT.isInteger() && +      TLI.getBooleanContents(false, true) == +          TargetLowering::ZeroOrOneBooleanContent && +      TLI.getBooleanContents(false, false) == +          TargetLowering::ZeroOrOneBooleanContent && +      C1->isNullValue() && C2->isOne()) { +    SDValue NotCond = +        DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));      if (VT.bitsEq(CondVT))        return NotCond;      return DAG.getZExtOrTrunc(NotCond, DL, VT); @@ -5847,7 +6489,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {                             ISD::NON_EXTLOAD, MLD->isExpandingLoad());      Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, -                                     MLD->isExpandingLoad());  +                                     MLD->isExpandingLoad());      MMO = DAG.getMachineFunction().      getMachineMemOperand(MLD->getPointerInfo(), @@ -5921,34 +6563,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {    if (SimplifySelectOps(N, N1, N2))      return SDValue(N, 0);  // Don't revisit N. -  // If the VSELECT result requires splitting and the mask is provided by a -  // SETCC, then split both nodes and its operands before legalization. This -  // prevents the type legalizer from unrolling SETCC into scalar comparisons -  // and enables future optimizations (e.g. min/max pattern matching on X86). -  if (N0.getOpcode() == ISD::SETCC) { -    EVT VT = N->getValueType(0); - -    // Check if any splitting is required. -    if (TLI.getTypeAction(*DAG.getContext(), VT) != -        TargetLowering::TypeSplitVector) -      return SDValue(); - -    SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH; -    std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG); -    std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1); -    std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2); - -    Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL); -    Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH); - -    // Add the new VSELECT nodes to the work list in case they need to be split -    // again. -    AddToWorklist(Lo.getNode()); -    AddToWorklist(Hi.getNode()); - -    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); -  } -    // Fold (vselect (build_vector all_ones), N1, N2) -> N1    if (ISD::isBuildVectorAllOnes(N0.getNode()))      return N1; @@ -6258,6 +6872,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);    SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); +  // Simplify TF. 
+  AddToWorklist(NewChain.getNode()); +    CombineTo(N, NewValue);    // Replace uses of the original load (before extension) @@ -6273,6 +6890,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {  SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {    SDValue N0 = N->getOperand(0);    EVT VT = N->getValueType(0); +  SDLoc DL(N);    if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,                                                LegalOperations)) @@ -6281,8 +6899,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {    // fold (sext (sext x)) -> (sext x)    // fold (sext (aext x)) -> (sext x)    if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) -    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, -                       N0.getOperand(0)); +    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));    if (N0.getOpcode() == ISD::TRUNCATE) {      // fold (sext (truncate (load x))) -> (sext (smaller load x)) @@ -6314,12 +6931,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {        // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign        // bits, just sext from i32.        if (NumSignBits > OpBits-MidBits) -        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op); +        return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);      } else {        // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign        // bits, just truncate to i32.        if (NumSignBits > OpBits-MidBits) -        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); +        return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);      }      // fold (sext (truncate x)) -> (sextinreg x). @@ -6329,7 +6946,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {          Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);        else if (OpBits > DestBits)          Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); -      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op, +      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,                           DAG.getValueType(N0.getValueType()));      }    } @@ -6349,16 +6966,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {        DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));      if (DoXform) {        LoadSDNode *LN0 = cast<LoadSDNode>(N0); -      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, -                                       LN0->getChain(), +      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),                                         LN0->getBasePtr(), N0.getValueType(),                                         LN0->getMemOperand());        CombineTo(N, ExtLoad);        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),                                    N0.getValueType(), ExtLoad);        CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); -      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), -                      ISD::SIGN_EXTEND); +      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);        return SDValue(N, 0);   // Return N so it doesn't get rechecked!      
}    } @@ -6376,8 +6991,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {      EVT MemVT = LN0->getMemoryVT();      if ((!LegalOperations && !LN0->isVolatile()) ||          TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { -      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, -                                       LN0->getChain(), +      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),                                         LN0->getBasePtr(), MemVT,                                         LN0->getMemOperand());        CombineTo(N, ExtLoad); @@ -6411,7 +7025,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {                                           LN0->getMemOperand());          APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();          Mask = Mask.sext(VT.getSizeInBits()); -        SDLoc DL(N);          SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,                                    ExtLoad, DAG.getConstant(Mask, DL, VT));          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, @@ -6419,24 +7032,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {                                      N0.getOperand(0).getValueType(), ExtLoad);          CombineTo(N, And);          CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); -        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, -                        ISD::SIGN_EXTEND); +        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);          return SDValue(N, 0);   // Return N so it doesn't get rechecked!        }      }    }    if (N0.getOpcode() == ISD::SETCC) { -    EVT N0VT = N0.getOperand(0).getValueType(); +    SDValue N00 = N0.getOperand(0); +    SDValue N01 = N0.getOperand(1); +    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); +    EVT N00VT = N0.getOperand(0).getValueType(); +      // sext(setcc) -> sext_in_reg(vsetcc) for vectors.      // Only do this before legalize for now.      if (VT.isVector() && !LegalOperations && -        TLI.getBooleanContents(N0VT) == +        TLI.getBooleanContents(N00VT) ==              TargetLowering::ZeroOrNegativeOneBooleanContent) {        // On some architectures (such as SSE/NEON/etc) the SETCC result type is        // of the same size as the compared operands. Only optimize sext(setcc())        // if this is the case. -      EVT SVT = getSetCCResultType(N0VT); +      EVT SVT = getSetCCResultType(N00VT);        // We know that the # elements of the results is the same as the        // # elements of the compare (and the # elements of the compare result @@ -6444,19 +7060,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {        // we know that the element size of the sext'd result matches the        // element size of the compare operands.        
if (VT.getSizeInBits() == SVT.getSizeInBits()) -        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), -                             N0.getOperand(1), -                             cast<CondCodeSDNode>(N0.getOperand(2))->get()); +        return DAG.getSetCC(DL, VT, N00, N01, CC);        // If the desired elements are smaller or larger than the source -      // elements we can use a matching integer vector type and then -      // truncate/sign extend -      EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); -      if (SVT == MatchingVectorType) { -        SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, -                               N0.getOperand(0), N0.getOperand(1), -                               cast<CondCodeSDNode>(N0.getOperand(2))->get()); -        return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); +      // elements, we can use a matching integer vector type and then +      // truncate/sign extend. +      EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); +      if (SVT == MatchingVecType) { +        SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); +        return DAG.getSExtOrTrunc(VsetCC, DL, VT);        }      } @@ -6465,36 +7077,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {      // getBooleanContents().      unsigned SetCCWidth = N0.getScalarValueSizeInBits(); -    SDLoc DL(N);      // To determine the "true" side of the select, we need to know the high bit      // of the value returned by the setcc if it evaluates to true.      // If the type of the setcc is i1, then the true case of the select is just      // sext(i1 1), that is, -1.      // If the type of the setcc is larger (say, i8) then the value of the high -    // bit depends on getBooleanContents(). So, ask TLI for a real "true" value +    // bit depends on getBooleanContents(), so ask TLI for a real "true" value      // of the appropriate width. -    SDValue ExtTrueVal = -        (SetCCWidth == 1) -            ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), -                              DL, VT) -            : TLI.getConstTrueVal(DAG, VT, DL); - -    if (SDValue SCC = SimplifySelectCC( -            DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, -            DAG.getConstant(0, DL, VT), -            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) +    SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) +                                           : TLI.getConstTrueVal(DAG, VT, DL); +    SDValue Zero = DAG.getConstant(0, DL, VT); +    if (SDValue SCC = +            SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))        return SCC;      if (!VT.isVector()) { -      EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); -      if (!LegalOperations || -          TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { -        SDLoc DL(N); -        ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); -        SDValue SetCC = -            DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); -        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, -                             DAG.getConstant(0, DL, VT)); +      EVT SetCCVT = getSetCCResultType(N00VT); +      // Don't do this transform for i1 because there's a select transform +      // that would reverse it. +      // TODO: We should not do this transform at all without a target hook +      // because a sext is likely cheaper than a select? 
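[Editor's note on the sext(setcc) rewrite set up above and completed in the next hunk: for an i1 condition, sign extension and a select between all-ones and zero are the same operation, which is what ExtTrueVal/Zero feed into SimplifySelectCC. Minimal standalone check, not LLVM code; the helper name is mine.]

#include <cassert>
#include <cstdint>

static int32_t sextFromI1(bool b) { return b ? -1 : 0; } // sext i1 -> i32

int main() {
  for (int32_t x : {-3, 0, 7})
    for (int32_t y : {-3, 0, 7}) {
      bool cc = x < y;                   // setcc
      int32_t asSelect = cc ? -1 : 0;    // select(cc, all-ones, 0)
      assert(sextFromI1(cc) == asSelect);
    }
  return 0;
}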
+      if (SetCCVT.getScalarSizeInBits() != 1 && +          (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { +        SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); +        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);        }      }    } @@ -6502,7 +7108,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {    // fold (sext x) -> (zext x) if the sign bit is known zero.    if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&        DAG.SignBitIsZero(N0)) -    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0); +    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);    return SDValue();  } @@ -6677,13 +7283,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {                                         LN0->getChain(),                                         LN0->getBasePtr(), N0.getValueType(),                                         LN0->getMemOperand()); -      CombineTo(N, ExtLoad); +        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),                                    N0.getValueType(), ExtLoad);        CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),                        ISD::ZERO_EXTEND); +      CombineTo(N, ExtLoad);        return SDValue(N, 0);   // Return N so it doesn't get rechecked!      }    } @@ -6991,9 +7598,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {    return SDValue();  } +SDValue DAGCombiner::visitAssertZext(SDNode *N) { +  SDValue N0 = N->getOperand(0); +  SDValue N1 = N->getOperand(1); +  EVT EVT = cast<VTSDNode>(N1)->getVT(); + +  // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt) +  if (N0.getOpcode() == ISD::AssertZext && +      EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) +    return N0; + +  return SDValue(); +} +  /// See if the specified operand can be simplified with the knowledge that only  /// the bits specified by Mask are used.  If so, return the simpler operand,  /// otherwise return a null SDValue. +/// +/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can +/// simplify nodes with multiple uses more aggressively.)  SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {    switch (V.getOpcode()) {    default: break; @@ -7029,6 +7652,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {          return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),                             SimplifyLHS, V.getOperand(1));      } +    break; +  case ISD::AND: { +    // X & -1 -> X (ignoring bits which aren't demanded). +    ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1)); +    if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask) +      return V.getOperand(0); +    break; +  }    }    return SDValue();  } @@ -7244,6 +7875,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);    } +  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x) +  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || +       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || +       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && +      N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { +    if (!LegalOperations || +        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) +      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); +  } +    // fold (sext_in_reg (zext x)) -> (sext x)    // iff we are extending the source sign bit.  
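[Editor's note on the new ISD::AND case added to GetDemandedBits above: masking with a constant that has a 1 in every demanded position changes none of those bits, so the AND operand can be used directly. Standalone restatement of the check, not LLVM code.]

#include <cassert>
#include <cstdint>

// The demanded-bits view of x and of (x & andMask) agree whenever
// (andMask & demanded) == demanded.
static bool andIsDroppable(uint32_t andMask, uint32_t demanded) {
  return (andMask & demanded) == demanded;
}

int main() {
  uint32_t demanded = 0x0000FFFF;   // only the low 16 bits are used downstream
  uint32_t andMask  = 0x00FFFFFF;   // covers all demanded bits
  assert(andIsDroppable(andMask, demanded));
  for (uint32_t x : {0u, 0x12345678u, 0xFFFFFFFFu})
    assert(((x & andMask) & demanded) == (x & demanded));
  // A mask that clears a demanded bit cannot be dropped.
  assert(!andIsDroppable(0x0000FF00, demanded));
  return 0;
}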
  if (N0.getOpcode() == ISD::ZERO_EXTEND) { @@ -7254,7 +7895,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    }    // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. -  if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits))) +  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))      return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());    // fold operands of sext_in_reg based on knowledge that the top bits are not @@ -7496,6 +8137,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {                                                       VT.getSizeInBits())))        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);    } +    // fold (truncate (load x)) -> (smaller load x)    // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))    if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { @@ -7517,6 +8159,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {        }      }    } +    // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),    // where ... are all 'undef'.    if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { @@ -7582,6 +8225,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {        SimplifyDemandedBits(SDValue(N, 0)))      return SDValue(N, 0); +  // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) +  // When the adde's carry is not used. +  if (N0.getOpcode() == ISD::ADDE && N0.hasOneUse() && +      !N0.getNode()->hasAnyUseOfValue(1) && +      (!LegalOperations || TLI.isOperationLegal(ISD::ADDE, VT))) { +    SDLoc SL(N); +    auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); +    auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); +    return DAG.getNode(ISD::ADDE, SL, DAG.getVTList(VT, MVT::Glue), +                       X, Y, N0.getOperand(2)); +  } +    return SDValue();  } @@ -7672,6 +8327,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {    SDValue N0 = N->getOperand(0);    EVT VT = N->getValueType(0); +  if (N0.isUndef()) +    return DAG.getUNDEF(VT); +    // If the input is a BUILD_VECTOR with all constant elements, fold this now.    // Only do this before legalize, since afterward the target may be depending    // on the bitconvert. @@ -8040,6 +8698,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {    return DAG.getBuildVector(VT, DL, Ops);  } +static bool isContractable(SDNode *N) { +  SDNodeFlags F = cast<BinaryWithFlagsSDNode>(N)->Flags; +  return F.hasAllowContract() || F.hasUnsafeAlgebra(); +} +  /// Try to perform FMA combining on a given FADD node.  SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    SDValue N0 = N->getOperand(0); @@ -8048,24 +8711,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    SDLoc SL(N);    const TargetOptions &Options = DAG.getTarget().Options; -  bool AllowFusion = -      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);    // Floating-point multiply-add with intermediate rounding.    bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));    // Floating-point multiply-add without intermediate rounding.    bool HasFMA = -      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && +      TLI.isFMAFasterThanFMulAndFAdd(VT) &&        (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));    // No valid opcode, do not combine.    
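[Editor's note on the sext_in_reg -> zext_in_reg fold earlier in the hunk above, which now spells the mask as APInt::getOneBitSet since only one bit matters: if the top bit of the narrow type is known zero, sign- and zero-extension in register agree. Minimal sketch for an i8-in-i32 register, not LLVM code.]

#include <cassert>
#include <cstdint>

static int32_t  sextInReg8(uint32_t x) { return (int8_t)(x & 0xFF); }
static uint32_t zextInReg8(uint32_t x) { return x & 0xFF; }

int main() {
  const uint32_t SignBit = UINT32_C(1) << 7; // the single bit getOneBitSet builds (EVTBits - 1 = 7)
  for (uint32_t low = 0; low < 256; ++low) {
    uint32_t x = 0xABCD0000u | low;          // upper bits are arbitrary
    if ((x & SignBit) == 0)                  // MaskedValueIsZero on that one bit
      assert((uint32_t)sextInReg8(x) == zextInReg8(x));
  }
  return 0;
}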
if (!HasFMAD && !HasFMA)      return SDValue(); +  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || +                              Options.UnsafeFPMath || HasFMAD); +  // If the addition is not contractable, do not combine. +  if (!AllowFusionGlobally && !isContractable(N)) +    return SDValue(); +    const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); -  ; -  if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) +  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))      return SDValue();    // Always prefer FMAD to FMA for precision. @@ -8073,35 +8739,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    bool Aggressive = TLI.enableAggressiveFMAFusion(VT);    bool LookThroughFPExt = TLI.isFPExtFree(VT); +  // Is the node an FMUL and contractable either due to global flags or +  // SDNodeFlags. +  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { +    if (N.getOpcode() != ISD::FMUL) +      return false; +    return AllowFusionGlobally || isContractable(N.getNode()); +  };    // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),    // prefer to fold the multiply with fewer uses. -  if (Aggressive && N0.getOpcode() == ISD::FMUL && -      N1.getOpcode() == ISD::FMUL) { +  if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {      if (N0.getNode()->use_size() > N1.getNode()->use_size())        std::swap(N0, N1);    }    // fold (fadd (fmul x, y), z) -> (fma x, y, z) -  if (N0.getOpcode() == ISD::FMUL && -      (Aggressive || N0->hasOneUse())) { +  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {      return DAG.getNode(PreferredFusedOpcode, SL, VT,                         N0.getOperand(0), N0.getOperand(1), N1);    }    // fold (fadd x, (fmul y, z)) -> (fma y, z, x)    // Note: Commutes FADD operands. -  if (N1.getOpcode() == ISD::FMUL && -      (Aggressive || N1->hasOneUse())) { +  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {      return DAG.getNode(PreferredFusedOpcode, SL, VT,                         N1.getOperand(0), N1.getOperand(1), N0);    }    // Look through FP_EXTEND nodes to do more combining. -  if (AllowFusion && LookThroughFPExt) { +  if (LookThroughFPExt) {      // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)      if (N0.getOpcode() == ISD::FP_EXTEND) {        SDValue N00 = N0.getOperand(0); -      if (N00.getOpcode() == ISD::FMUL) +      if (isContractableFMUL(N00))          return DAG.getNode(PreferredFusedOpcode, SL, VT,                             DAG.getNode(ISD::FP_EXTEND, SL, VT,                                         N00.getOperand(0)), @@ -8113,7 +8783,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {      // Note: Commutes FADD operands.      
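[Editor's note on the "with/without intermediate rounding" distinction in the FMA-combining hunks above: it is observable in plain C++. The fused form rounds once, the separate multiply-add rounds twice, and the results can differ in the last bit, which is why the fold is only done when contraction is allowed. Minimal sketch; the sample values are mine.]

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0 + 0x1p-27, y = 1.0 + 0x1p-27, z = -1.0;
  double separate = x * y + z;        // x*y is rounded before the add
  double fused = std::fma(x, y, z);   // single rounding at the end
  std::printf("separate = %a\nfused    = %a\ndiffer   = %d\n",
              separate, fused, separate != fused);
  return 0;
}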
if (N1.getOpcode() == ISD::FP_EXTEND) {        SDValue N10 = N1.getOperand(0); -      if (N10.getOpcode() == ISD::FMUL) +      if (isContractableFMUL(N10))          return DAG.getNode(PreferredFusedOpcode, SL, VT,                             DAG.getNode(ISD::FP_EXTEND, SL, VT,                                         N10.getOperand(0)), @@ -8154,7 +8824,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {                                       N0));      } -    if (AllowFusion && LookThroughFPExt) { +    if (LookThroughFPExt) {        // fold (fadd (fma x, y, (fpext (fmul u, v))), z)        //   -> (fma x, y, (fma (fpext u), (fpext v), z))        auto FoldFAddFMAFPExtFMul = [&] ( @@ -8169,7 +8839,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {          SDValue N02 = N0.getOperand(2);          if (N02.getOpcode() == ISD::FP_EXTEND) {            SDValue N020 = N02.getOperand(0); -          if (N020.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N020))              return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),                                          N020.getOperand(0), N020.getOperand(1),                                          N1); @@ -8195,7 +8865,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {          SDValue N00 = N0.getOperand(0);          if (N00.getOpcode() == PreferredFusedOpcode) {            SDValue N002 = N00.getOperand(2); -          if (N002.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N002))              return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),                                          N002.getOperand(0), N002.getOperand(1),                                          N1); @@ -8208,7 +8878,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {          SDValue N12 = N1.getOperand(2);          if (N12.getOpcode() == ISD::FP_EXTEND) {            SDValue N120 = N12.getOperand(0); -          if (N120.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N120))              return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),                                          N120.getOperand(0), N120.getOperand(1),                                          N0); @@ -8224,7 +8894,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {          SDValue N10 = N1.getOperand(0);          if (N10.getOpcode() == PreferredFusedOpcode) {            SDValue N102 = N10.getOperand(2); -          if (N102.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N102))              return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),                                          N102.getOperand(0), N102.getOperand(1),                                          N0); @@ -8244,23 +8914,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    SDLoc SL(N);    const TargetOptions &Options = DAG.getTarget().Options; -  bool AllowFusion = -      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); -    // Floating-point multiply-add with intermediate rounding.    bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));    // Floating-point multiply-add without intermediate rounding.    bool HasFMA = -      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && +      TLI.isFMAFasterThanFMulAndFAdd(VT) &&        (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));    // No valid opcode, do not combine.    
if (!HasFMAD && !HasFMA)      return SDValue(); +  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || +                              Options.UnsafeFPMath || HasFMAD); +  // If the subtraction is not contractable, do not combine. +  if (!AllowFusionGlobally && !isContractable(N)) +    return SDValue(); +    const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); -  if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) +  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))      return SDValue();    // Always prefer FMAD to FMA for precision. @@ -8268,9 +8941,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    bool Aggressive = TLI.enableAggressiveFMAFusion(VT);    bool LookThroughFPExt = TLI.isFPExtFree(VT); +  // Is the node an FMUL and contractable either due to global flags or +  // SDNodeFlags. +  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { +    if (N.getOpcode() != ISD::FMUL) +      return false; +    return AllowFusionGlobally || isContractable(N.getNode()); +  }; +    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -  if (N0.getOpcode() == ISD::FMUL && -      (Aggressive || N0->hasOneUse())) { +  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {      return DAG.getNode(PreferredFusedOpcode, SL, VT,                         N0.getOperand(0), N0.getOperand(1),                         DAG.getNode(ISD::FNEG, SL, VT, N1)); @@ -8278,16 +8958,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)    // Note: Commutes FSUB operands. -  if (N1.getOpcode() == ISD::FMUL && -      (Aggressive || N1->hasOneUse())) +  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))      return DAG.getNode(PreferredFusedOpcode, SL, VT,                         DAG.getNode(ISD::FNEG, SL, VT,                                     N1.getOperand(0)),                         N1.getOperand(1), N0);    // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) -  if (N0.getOpcode() == ISD::FNEG && -      N0.getOperand(0).getOpcode() == ISD::FMUL && +  if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&        (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {      SDValue N00 = N0.getOperand(0).getOperand(0);      SDValue N01 = N0.getOperand(0).getOperand(1); @@ -8297,12 +8975,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    }    // Look through FP_EXTEND nodes to do more combining. -  if (AllowFusion && LookThroughFPExt) { +  if (LookThroughFPExt) {      // fold (fsub (fpext (fmul x, y)), z)      //   -> (fma (fpext x), (fpext y), (fneg z))      if (N0.getOpcode() == ISD::FP_EXTEND) {        SDValue N00 = N0.getOperand(0); -      if (N00.getOpcode() == ISD::FMUL) +      if (isContractableFMUL(N00))          return DAG.getNode(PreferredFusedOpcode, SL, VT,                             DAG.getNode(ISD::FP_EXTEND, SL, VT,                                         N00.getOperand(0)), @@ -8316,7 +8994,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {      // Note: Commutes FSUB operands.      
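[Editor's note: the same contraction gating appears in both the FADD and FSUB combiners above. Restated as a plain predicate it reads as follows; standalone sketch only, the flag names mirror the diff but the functions are mine.]

#include <cassert>

struct NodeFlags { bool AllowContract; bool UnsafeAlgebra; };

static bool isContractable(NodeFlags F) {
  return F.AllowContract || F.UnsafeAlgebra;
}

// May an fadd/fsub with an fmul operand be fused into fma/fmad?
static bool mayFuse(bool FPOpFusionFast, bool UnsafeFPMath, bool HasFMAD,
                    NodeFlags AddSub, NodeFlags Mul) {
  bool AllowFusionGlobally = FPOpFusionFast || UnsafeFPMath || HasFMAD;
  if (!AllowFusionGlobally && !isContractable(AddSub))
    return false;                                    // the add/sub itself must be contractable
  return AllowFusionGlobally || isContractable(Mul); // ...and so must the fmul
}

int main() {
  assert(mayFuse(true, false, false, {false, false}, {false, false}));  // fast FP-contraction mode
  assert(mayFuse(false, false, false, {true, false}, {true, false}));   // per-node contract flags
  assert(!mayFuse(false, false, false, {false, false}, {true, false})); // add itself not contractable
  assert(!mayFuse(false, false, false, {true, false}, {false, false})); // fmul not contractable
  return 0;
}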
if (N1.getOpcode() == ISD::FP_EXTEND) {        SDValue N10 = N1.getOperand(0); -      if (N10.getOpcode() == ISD::FMUL) +      if (isContractableFMUL(N10))          return DAG.getNode(PreferredFusedOpcode, SL, VT,                             DAG.getNode(ISD::FNEG, SL, VT,                                         DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8336,7 +9014,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {        SDValue N00 = N0.getOperand(0);        if (N00.getOpcode() == ISD::FNEG) {          SDValue N000 = N00.getOperand(0); -        if (N000.getOpcode() == ISD::FMUL) { +        if (isContractableFMUL(N000)) {            return DAG.getNode(ISD::FNEG, SL, VT,                               DAG.getNode(PreferredFusedOpcode, SL, VT,                                           DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8358,7 +9036,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {        SDValue N00 = N0.getOperand(0);        if (N00.getOpcode() == ISD::FP_EXTEND) {          SDValue N000 = N00.getOperand(0); -        if (N000.getOpcode() == ISD::FMUL) { +        if (isContractableFMUL(N000)) {            return DAG.getNode(ISD::FNEG, SL, VT,                               DAG.getNode(PreferredFusedOpcode, SL, VT,                                           DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8378,10 +9056,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {      //   -> (fma x, y (fma u, v, (fneg z)))      // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF      // are currently only supported on binary nodes. -    if (Options.UnsafeFPMath && -        N0.getOpcode() == PreferredFusedOpcode && -        N0.getOperand(2).getOpcode() == ISD::FMUL && -        N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { +    if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && +        isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && +        N0.getOperand(2)->hasOneUse()) {        return DAG.getNode(PreferredFusedOpcode, SL, VT,                           N0.getOperand(0), N0.getOperand(1),                           DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8395,9 +9072,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {      //   -> (fma (fneg y), z, (fma (fneg u), v, x))      // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF      // are currently only supported on binary nodes. 
-    if (Options.UnsafeFPMath && -        N1.getOpcode() == PreferredFusedOpcode && -        N1.getOperand(2).getOpcode() == ISD::FMUL) { +    if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && +        isContractableFMUL(N1.getOperand(2))) {        SDValue N20 = N1.getOperand(2).getOperand(0);        SDValue N21 = N1.getOperand(2).getOperand(1);        return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8410,14 +9086,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {                                       N21, N0));      } -    if (AllowFusion && LookThroughFPExt) { +    if (LookThroughFPExt) {        // fold (fsub (fma x, y, (fpext (fmul u, v))), z)        //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))        if (N0.getOpcode() == PreferredFusedOpcode) {          SDValue N02 = N0.getOperand(2);          if (N02.getOpcode() == ISD::FP_EXTEND) {            SDValue N020 = N02.getOperand(0); -          if (N020.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N020))              return DAG.getNode(PreferredFusedOpcode, SL, VT,                                 N0.getOperand(0), N0.getOperand(1),                                 DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8440,7 +9116,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {          SDValue N00 = N0.getOperand(0);          if (N00.getOpcode() == PreferredFusedOpcode) {            SDValue N002 = N00.getOperand(2); -          if (N002.getOpcode() == ISD::FMUL) +          if (isContractableFMUL(N002))              return DAG.getNode(PreferredFusedOpcode, SL, VT,                                 DAG.getNode(ISD::FP_EXTEND, SL, VT,                                             N00.getOperand(0)), @@ -8461,7 +9137,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {        if (N1.getOpcode() == PreferredFusedOpcode &&          N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {          SDValue N120 = N1.getOperand(2).getOperand(0); -        if (N120.getOpcode() == ISD::FMUL) { +        if (isContractableFMUL(N120)) {            SDValue N1200 = N120.getOperand(0);            SDValue N1201 = N120.getOperand(1);            return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8488,7 +9164,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {          SDValue N100 = N1.getOperand(0).getOperand(0);          SDValue N101 = N1.getOperand(0).getOperand(1);          SDValue N102 = N1.getOperand(0).getOperand(2); -        if (N102.getOpcode() == ISD::FMUL) { +        if (isContractableFMUL(N102)) {            SDValue N1020 = N102.getOperand(0);            SDValue N1021 = N102.getOperand(1);            return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8624,6 +9300,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {    if (N0CFP && !N1CFP)      return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (fadd A, (fneg B)) -> (fsub A, B)    if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&        isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) @@ -8637,7 +9316,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {                         GetNegatedExpression(N0, DAG, LegalOperations), Flags);    // FIXME: Auto-upgrade the target/function-level option. 
-  if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { +  if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) {      // fold (fadd A, 0) -> A      if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))        if (N1C->isZero()) @@ -8771,13 +9450,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {    if (N0CFP && N1CFP)      return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    // fold (fsub A, (fneg B)) -> (fadd A, B)    if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))      return DAG.getNode(ISD::FADD, DL, VT, N0,                         GetNegatedExpression(N1, DAG, LegalOperations), Flags);    // FIXME: Auto-upgrade the target/function-level option. -  if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { +  if (Options.NoSignedZerosFPMath  || N->getFlags()->hasNoSignedZeros()) {      // (fsub 0, B) -> -B      if (N0CFP && N0CFP->isZero()) {        if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) @@ -8850,6 +9532,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {    if (N1CFP && N1CFP->isExactlyValue(1.0))      return N0; +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    if (Options.UnsafeFPMath) {      // fold (fmul A, 0) -> 0      if (N1CFP && N1CFP->isZero()) @@ -9104,6 +9789,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {    if (N0CFP && N1CFP)      return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    if (Options.UnsafeFPMath) {      // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.      if (N1CFP) { @@ -9207,6 +9895,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {      return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1,                         &cast<BinaryWithFlagsSDNode>(N)->Flags); +  if (SDValue NewSel = foldBinOpIntoSelect(N)) +    return NewSel; +    return SDValue();  } @@ -10361,7 +11052,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {                dbgs() << "\n");          WorklistRemover DeadNodes(*this);          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); - +        AddUsersToWorklist(Chain.getNode());          if (N->use_empty())            deleteAndRecombine(N); @@ -10414,7 +11105,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {        StoreSDNode *PrevST = cast<StoreSDNode>(Chain);        if (PrevST->getBasePtr() == Ptr &&            PrevST->getValue().getValueType() == N->getValueType(0)) -      return CombineTo(N, Chain.getOperand(1), Chain); +        return CombineTo(N, PrevST->getOperand(1), Chain);      }    } @@ -10432,14 +11123,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {      }    } -  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -                                                  : DAG.getSubtarget().useAA(); -#ifndef NDEBUG -  if (CombinerAAOnlyFunc.getNumOccurrences() && -      CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) -    UseAA = false; -#endif -  if (UseAA && LD->isUnindexed()) { +  if (LD->isUnindexed()) {      // Walk up chain skipping non-aliasing memory nodes.      
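[Editor's note on why the hunk above keys the (fadd A, 0) -> A fold on NoSignedZerosFPMath or the per-node no-signed-zeros flag rather than full UnsafeFPMath: the fold is already exact except for the sign of zero. Minimal standalone check, not LLVM code.]

#include <cassert>
#include <cmath>

int main() {
  double a = -0.0;
  double folded = a;        // what (fadd A, 0) -> A would produce
  double actual = a + 0.0;  // IEEE 754: -0.0 + (+0.0) is +0.0
  assert(std::signbit(folded));   // the folded result keeps the negative zero
  assert(!std::signbit(actual));  // the real addition does not
  return 0;
}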
SDValue BetterChain = FindBetterChain(N, Chain); @@ -11021,6 +11705,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {    SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,                                ArgChains);    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); +  AddToWorklist(Chain.getNode());    return true;  } @@ -11414,18 +12099,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,    return false;  } -SDValue DAGCombiner::getMergedConstantVectorStore( -    SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores, -    SmallVectorImpl<SDValue> &Chains, EVT Ty) const { -  SmallVector<SDValue, 8> BuildVector; +SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, +                                         unsigned NumStores) { +  SmallVector<SDValue, 8> Chains; +  SmallPtrSet<const SDNode *, 8> Visited; +  SDLoc StoreDL(StoreNodes[0].MemNode); + +  for (unsigned i = 0; i < NumStores; ++i) { +    Visited.insert(StoreNodes[i].MemNode); +  } -  for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { -    StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); -    Chains.push_back(St->getChain()); -    BuildVector.push_back(St->getValue()); +  // don't include nodes that are children +  for (unsigned i = 0; i < NumStores; ++i) { +    if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) +      Chains.push_back(StoreNodes[i].MemNode->getChain());    } -  return DAG.getBuildVector(Ty, SL, BuildVector); +  assert(Chains.size() > 0 && "Chain should have generated a chain"); +  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);  }  bool DAGCombiner::MergeStoresOfConstantsOrVecElts( @@ -11436,22 +12127,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(      return false;    int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; -  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; -  unsigned LatestNodeUsed = 0; - -  for (unsigned i=0; i < NumStores; ++i) { -    // Find a chain for the new wide-store operand. Notice that some -    // of the store nodes that we found may not be selected for inclusion -    // in the wide store. The chain we use needs to be the chain of the -    // latest store node which is *used* and replaced by the wide store. -    if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) -      LatestNodeUsed = i; -  } - -  SmallVector<SDValue, 8> Chains;    // The latest Node in the DAG. 
-  LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;    SDLoc DL(StoreNodes[0].MemNode);    SDValue StoredVal; @@ -11467,7 +12144,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(      assert(TLI.isTypeLegal(Ty) && "Illegal vector store");      if (IsConstantSrc) { -      StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); +      SmallVector<SDValue, 8> BuildVector; +      for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { +        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); +        SDValue Val = St->getValue(); +        if (MemVT.getScalarType().isInteger()) +          if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) +            Val = DAG.getConstant( +                (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), +                SDLoc(CFP), MemVT); +        BuildVector.push_back(Val); +      } +      StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);      } else {        SmallVector<SDValue, 8> Ops;        for (unsigned i = 0; i < NumStores; ++i) { @@ -11477,7 +12165,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(          if (Val.getValueType() != MemVT)            return false;          Ops.push_back(Val); -        Chains.push_back(St->getChain());        }        // Build the extracted vector elements back into a vector. @@ -11497,7 +12184,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(      for (unsigned i = 0; i < NumStores; ++i) {        unsigned Idx = IsLE ? (NumStores - 1 - i) : i;        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode); -      Chains.push_back(St->getChain());        SDValue Val = St->getValue();        StoreInt <<= ElementSizeBytes * 8; @@ -11515,54 +12201,27 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(      StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);    } -  assert(!Chains.empty()); - -  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); +  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);    SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,                                    FirstInChain->getBasePtr(),                                    FirstInChain->getPointerInfo(),                                    FirstInChain->getAlignment()); -  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -                                                  : DAG.getSubtarget().useAA(); -  if (UseAA) { -    // Replace all merged stores with the new store. -    for (unsigned i = 0; i < NumStores; ++i) -      CombineTo(StoreNodes[i].MemNode, NewStore); -  } else { -    // Replace the last store with the new store. -    CombineTo(LatestOp, NewStore); -    // Erase all other stores. -    for (unsigned i = 0; i < NumStores; ++i) { -      if (StoreNodes[i].MemNode == LatestOp) -        continue; -      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -      // ReplaceAllUsesWith will replace all uses that existed when it was -      // called, but graph optimizations may cause new ones to appear. For -      // example, the case in pr14333 looks like -      // -      //  St's chain -> St -> another store -> X -      // -      // And the only difference from St to the other store is the chain. -      // When we change it's chain to be St's chain they become identical, -      // get CSEed and the net result is that X is now a use of St. -      // Since we know that St is redundant, just iterate. 
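[Editor's note on the integer-packing loop in the hunk above (StoreInt <<= ...; StoreInt |= ...): it builds the merged value in reverse element order on little-endian targets so the wide store writes the same bytes as the narrow stores. Standalone sketch of the byte case, assuming a little-endian host; not LLVM code.]

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const unsigned NumStores = 4;
  uint8_t vals[NumStores] = {0x11, 0x22, 0x33, 0x44};

  // Four narrow stores to consecutive addresses.
  uint8_t mem1[NumStores];
  for (unsigned i = 0; i < NumStores; ++i)
    mem1[i] = vals[i];

  // One merged 32-bit store: on little-endian the last element is shifted in
  // first (Idx = NumStores - 1 - i in the hunk above).
  uint32_t packed = 0;
  for (unsigned i = 0; i < NumStores; ++i) {
    packed <<= 8;
    packed |= vals[NumStores - 1 - i];
  }
  uint8_t mem2[NumStores];
  std::memcpy(mem2, &packed, sizeof(packed)); // assumes a little-endian host

  assert(std::memcmp(mem1, mem2, NumStores) == 0);
  return 0;
}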
-      while (!St->use_empty()) -        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); -      deleteAndRecombine(St); -    } -  } +  // Replace all merged stores with the new store. +  for (unsigned i = 0; i < NumStores; ++i) +    CombineTo(StoreNodes[i].MemNode, NewStore); -  StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end()); +  AddToWorklist(NewChain.getNode());    return true;  } -void DAGCombiner::getStoreMergeAndAliasCandidates( -    StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, -    SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { +void DAGCombiner::getStoreMergeCandidates( +    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {    // This holds the base pointer, index, and the offset in bytes from the base    // pointer.    BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); +  EVT MemVT = St->getMemoryVT();    // We must have a base and an offset.    if (!BasePtr.Base.getNode()) @@ -11572,104 +12231,71 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(    if (BasePtr.Base.isUndef())      return; -  // Walk up the chain and look for nodes with offsets from the same -  // base pointer. Stop when reaching an instruction with a different kind -  // or instruction which has a different base pointer. -  EVT MemVT = St->getMemoryVT(); -  unsigned Seq = 0; -  StoreSDNode *Index = St; - - -  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -                                                  : DAG.getSubtarget().useAA(); - -  if (UseAA) { -    // Look at other users of the same chain. Stores on the same chain do not -    // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized -    // to be on the same chain, so don't bother looking at adjacent chains. - -    SDValue Chain = St->getChain(); -    for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { -      if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { -        if (I.getOperandNo() != 0) -          continue; - -        if (OtherST->isVolatile() || OtherST->isIndexed()) -          continue; - -        if (OtherST->getMemoryVT() != MemVT) -          continue; - -        BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG); - -        if (Ptr.equalBaseIndex(BasePtr)) -          StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); -      } -    } - -    return; -  } - -  while (Index) { -    // If the chain has more than one use, then we can't reorder the mem ops. -    if (Index != St && !SDValue(Index, 0)->hasOneUse()) -      break; - -    // Find the base pointer and offset for this memory node. -    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); - -    // Check that the base pointer is the same as the original one. -    if (!Ptr.equalBaseIndex(BasePtr)) -      break; - -    // The memory operands must not be volatile. -    if (Index->isVolatile() || Index->isIndexed()) -      break; - -    // No truncation. -    if (Index->isTruncatingStore()) -      break; - -    // The stored memory type must be the same. -    if (Index->getMemoryVT() != MemVT) -      break; - -    // We do not allow under-aligned stores in order to prevent -    // overriding stores. NOTE: this is a bad hack. Alignment SHOULD -    // be irrelevant here; what MATTERS is that we not move memory -    // operations that potentially overlap past each-other. -    if (Index->getAlignment() < MemVT.getStoreSize()) -      break; - -    // We found a potential memory operand to merge. 
-    StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); - -    // Find the next memory operand in the chain. If the next operand in the -    // chain is a store then move up and continue the scan with the next -    // memory operand. If the next operand is a load save it and use alias -    // information to check if it interferes with anything. -    SDNode *NextInChain = Index->getChain().getNode(); -    while (1) { -      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { -        // We found a store node. Use it for the next iteration. -        Index = STn; -        break; -      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) { -        if (Ldn->isVolatile()) { -          Index = nullptr; -          break; +  bool IsLoadSrc = isa<LoadSDNode>(St->getValue()); +  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) || +                       isa<ConstantFPSDNode>(St->getValue()); +  bool IsExtractVecSrc = +      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || +       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); +  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool { +    if (Other->isVolatile() || Other->isIndexed()) +      return false; +    // We can merge constant floats to equivalent integers +    if (Other->getMemoryVT() != MemVT) +      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) && +            isa<ConstantFPSDNode>(Other->getValue()))) +        return false; +    if (IsLoadSrc) +      if (!isa<LoadSDNode>(Other->getValue())) +        return false; +    if (IsConstantSrc) +      if (!(isa<ConstantSDNode>(Other->getValue()) || +            isa<ConstantFPSDNode>(Other->getValue()))) +        return false; +    if (IsExtractVecSrc) +      if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || +            Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) +        return false; +    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); +    return (Ptr.equalBaseIndex(BasePtr)); +  }; +  // We looking for a root node which is an ancestor to all mergable +  // stores. We search up through a load, to our root and then down +  // through all children. For instance we will find Store{1,2,3} if +  // St is Store1, Store2. or Store3 where the root is not a load +  // which always true for nonvolatile ops. TODO: Expand +  // the search to find all valid candidates through multiple layers of loads. +  // +  // Root +  // |-------|-------| +  // Load    Load    Store3 +  // |       | +  // Store1   Store2 +  // +  // FIXME: We should be able to climb and +  // descend TokenFactors to find candidates as well. 
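[Editor's note on the CandidateMatch lambda above: besides matching the stored value kind, two stores become merge candidates only when their addresses decompose to the same base with constant offsets. A toy stand-in for the BaseIndexOffset test, not the real class.]

#include <cassert>
#include <cstdint>

struct ToyBaseIndexOffset {
  const void *Base;  // stand-in for the base SDValue
  int64_t Offset;    // constant byte offset from that base
  bool equalBaseIndex(const ToyBaseIndexOffset &O) const {
    return Base == O.Base;
  }
};

int main() {
  int a[8], b[8];
  ToyBaseIndexOffset s0{a, 0}, s1{a, 4}, s2{b, 0};
  assert(s0.equalBaseIndex(s1));   // same base, different offset: a candidate
  assert(!s0.equalBaseIndex(s2));  // different base expression: rejected
  return 0;
}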
+ +  SDNode *RootNode = (St->getChain()).getNode(); + +  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) { +    RootNode = Ldn->getChain().getNode(); +    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) +      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain +        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) +          if (I2.getOperandNo() == 0) +            if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) { +              BaseIndexOffset Ptr; +              if (CandidateMatch(OtherST, Ptr)) +                StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset)); +            } +  } else +    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) +      if (I.getOperandNo() == 0) +        if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { +          BaseIndexOffset Ptr; +          if (CandidateMatch(OtherST, Ptr)) +            StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));          } - -        // Save the load node for later. Continue the scan. -        AliasLoadNodes.push_back(Ldn); -        NextInChain = Ldn->getChain().getNode(); -        continue; -      } else { -        Index = nullptr; -        break; -      } -    } -  }  }  // We need to check that merging these stores does not cause a loop @@ -11678,31 +12304,34 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(  // through the chain). Check in parallel by searching up from  // non-chain operands of candidates.  bool DAGCombiner::checkMergeStoreCandidatesForDependencies( -    SmallVectorImpl<MemOpLink> &StoreNodes) { +    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {    SmallPtrSet<const SDNode *, 16> Visited;    SmallVector<const SDNode *, 8> Worklist;    // search ops of store candidates -  for (unsigned i = 0; i < StoreNodes.size(); ++i) { +  for (unsigned i = 0; i < NumStores; ++i) {      SDNode *n = StoreNodes[i].MemNode;      // Potential loops may happen only through non-chain operands      for (unsigned j = 1; j < n->getNumOperands(); ++j)        Worklist.push_back(n->getOperand(j).getNode());    }    // search through DAG. We can stop early if we find a storenode -  for (unsigned i = 0; i < StoreNodes.size(); ++i) { +  for (unsigned i = 0; i < NumStores; ++i) {      if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))        return false;    }    return true;  } -bool DAGCombiner::MergeConsecutiveStores( -    StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) { +bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {    if (OptLevel == CodeGenOpt::None)      return false;    EVT MemVT = St->getMemoryVT();    int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + +  if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) +    return false; +    bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(        Attribute::NoImplicitFloat); @@ -11731,145 +12360,137 @@ bool DAGCombiner::MergeConsecutiveStores(    if (MemVT.isVector() && IsLoadSrc)      return false; -  // Only look at ends of store sequences. -  SDValue Chain = SDValue(St, 0); -  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) -    return false; - -  // Save the LoadSDNodes that we find in the chain. -  // We need to make sure that these nodes do not interfere with -  // any of the store nodes. 
-  SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - -  getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); +  SmallVector<MemOpLink, 8> StoreNodes; +  // Find potential store merge candidates by searching through chain sub-DAG +  getStoreMergeCandidates(St, StoreNodes);    // Check if there is anything to merge.    if (StoreNodes.size() < 2)      return false; -  // only do dependence check in AA case -  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -                                                  : DAG.getSubtarget().useAA(); -  if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes)) -    return false; -    // Sort the memory operands according to their distance from the -  // base pointer.  As a secondary criteria: make sure stores coming -  // later in the code come first in the list. This is important for -  // the non-UseAA case, because we're merging stores into the FINAL -  // store along a chain which potentially contains aliasing stores. -  // Thus, if there are multiple stores to the same address, the last -  // one can be considered for merging but not the others. +  // base pointer.    std::sort(StoreNodes.begin(), StoreNodes.end(),              [](MemOpLink LHS, MemOpLink RHS) { -    return LHS.OffsetFromBase < RHS.OffsetFromBase || -           (LHS.OffsetFromBase == RHS.OffsetFromBase && -            LHS.SequenceNum < RHS.SequenceNum); -  }); +              return LHS.OffsetFromBase < RHS.OffsetFromBase; +            });    // Scan the memory operations on the chain and find the first non-consecutive    // store memory address. -  unsigned LastConsecutiveStore = 0; +  unsigned NumConsecutiveStores = 0;    int64_t StartAddress = StoreNodes[0].OffsetFromBase; -  for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) { -    // Check that the addresses are consecutive starting from the second -    // element in the list of stores. -    if (i > 0) { -      int64_t CurrAddress = StoreNodes[i].OffsetFromBase; -      if (CurrAddress - StartAddress != (ElementSizeBytes * i)) -        break; -    } - -    // Check if this store interferes with any of the loads that we found. -    // If we find a load that alias with this store. Stop the sequence. -    if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) { -          return isAlias(Ldn, StoreNodes[i].MemNode); -        })) +  // Check that the addresses are consecutive starting from the second +  // element in the list of stores. +  for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { +    int64_t CurrAddress = StoreNodes[i].OffsetFromBase; +    if (CurrAddress - StartAddress != (ElementSizeBytes * i))        break; - -    // Mark this node as useful. -    LastConsecutiveStore = i; +    NumConsecutiveStores = i + 1;    } +  if (NumConsecutiveStores < 2) +    return false; + +  // Check that we can merge these candidates without causing a cycle +  if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores)) +    return false; + +    // The node with the lowest store address. -  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; -  unsigned FirstStoreAS = FirstInChain->getAddressSpace(); -  unsigned FirstStoreAlign = FirstInChain->getAlignment();    LLVMContext &Context = *DAG.getContext();    const DataLayout &DL = DAG.getDataLayout();    // Store the constants into memory as one consecutive store.    
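[Editor's note: the consecutiveness scan in the hunk above reduces to "after sorting candidates by offset, take the longest prefix whose offsets step by exactly the element size". Standalone restatement with made-up offsets; not LLVM code.]

#include <cassert>
#include <cstdint>
#include <vector>

static unsigned countConsecutive(const std::vector<int64_t> &Offsets,
                                 int64_t ElementSizeBytes) {
  if (Offsets.size() < 2)
    return 0;
  unsigned Num = 0;
  int64_t Start = Offsets[0];
  for (size_t i = 1; i < Offsets.size(); ++i) {
    if (Offsets[i] - Start != ElementSizeBytes * (int64_t)i)
      break;
    Num = i + 1;
  }
  return Num;
}

int main() {
  // Offsets 0,4,8 are consecutive i32 slots; 20 breaks the run.
  assert(countConsecutive({0, 4, 8, 20}, 4) == 3);
  // A gap right away leaves fewer than two consecutive stores, so no merge.
  assert(countConsecutive({0, 8}, 4) == 0);
  return 0;
}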
if (IsConstantSrc) { -    unsigned LastLegalType = 0; -    unsigned LastLegalVectorType = 0; -    bool NonZero = false; -    for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { -      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode); -      SDValue StoredVal = St->getValue(); - -      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { -        NonZero |= !C->isNullValue(); -      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) { -        NonZero |= !C->getConstantFPValue()->isNullValue(); -      } else { -        // Non-constant. -        break; -      } +    bool RV = false; +    while (NumConsecutiveStores > 1) { +      LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +      unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +      unsigned FirstStoreAlign = FirstInChain->getAlignment(); +      unsigned LastLegalType = 0; +      unsigned LastLegalVectorType = 0; +      bool NonZero = false; +      for (unsigned i = 0; i < NumConsecutiveStores; ++i) { +        StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); +        SDValue StoredVal = ST->getValue(); + +        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { +          NonZero |= !C->isNullValue(); +        } else if (ConstantFPSDNode *C = +                       dyn_cast<ConstantFPSDNode>(StoredVal)) { +          NonZero |= !C->getConstantFPValue()->isNullValue(); +        } else { +          // Non-constant. +          break; +        } -      // Find a legal type for the constant store. -      unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; -      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); -      bool IsFast; -      if (TLI.isTypeLegal(StoreTy) && -          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, -                                 FirstStoreAlign, &IsFast) && IsFast) { -        LastLegalType = i+1; -      // Or check whether a truncstore is legal. -      } else if (TLI.getTypeAction(Context, StoreTy) == -                 TargetLowering::TypePromoteInteger) { -        EVT LegalizedStoredValueTy = -          TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); -        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && -            TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, -                                   FirstStoreAS, FirstStoreAlign, &IsFast) && +        // Find a legal type for the constant store. +        unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; +        EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); +        bool IsFast = false; +        if (TLI.isTypeLegal(StoreTy) && +            TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, +                                   FirstStoreAlign, &IsFast) &&              IsFast) {            LastLegalType = i + 1; +          // Or check whether a truncstore is legal. 
+        } else if (TLI.getTypeAction(Context, StoreTy) == +                   TargetLowering::TypePromoteInteger) { +          EVT LegalizedStoredValueTy = +              TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); +          if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && +              TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, +                                     FirstStoreAS, FirstStoreAlign, &IsFast) && +              IsFast) { +            LastLegalType = i + 1; +          }          } -      } -      // We only use vectors if the constant is known to be zero or the target -      // allows it and the function is not marked with the noimplicitfloat -      // attribute. -      if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, -                                                        FirstStoreAS)) && -          !NoVectors) { -        // Find a legal type for the vector store. -        EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); -        if (TLI.isTypeLegal(Ty) && -            TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, -                                   FirstStoreAlign, &IsFast) && IsFast) -          LastLegalVectorType = i + 1; +        // We only use vectors if the constant is known to be zero or the target +        // allows it and the function is not marked with the noimplicitfloat +        // attribute. +        if ((!NonZero || +             TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && +            !NoVectors) { +          // Find a legal type for the vector store. +          EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); +          if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) && +              TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, +                                     FirstStoreAlign, &IsFast) && +              IsFast) +            LastLegalVectorType = i + 1; +        }        } -    } -    // Check if we found a legal integer type to store. -    if (LastLegalType == 0 && LastLegalVectorType == 0) -      return false; +      // Check if we found a legal integer type that creates a meaningful merge. +      if (LastLegalType < 2 && LastLegalVectorType < 2) +        break; -    bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; -    unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; +      bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; +      unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; -    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, -                                           true, UseVector); +      bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, +                                                    true, UseVector); +      if (!Merged) +        break; +      // Remove merged stores for next iteration. +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); +      RV = true; +      NumConsecutiveStores -= NumElem; +    } +    return RV;    }    // When extracting multiple vector elements, try to store them    // in one vector store rather than a sequence of scalar stores.    
if (IsExtractVecSrc) { +    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +    unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +    unsigned FirstStoreAlign = FirstInChain->getAlignment();      unsigned NumStoresToMerge = 0;      bool IsVec = MemVT.isVector(); -    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { +    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);        unsigned StoreValOpcode = St->getValue().getOpcode();        // This restriction could be loosened. @@ -11909,7 +12530,7 @@ bool DAGCombiner::MergeConsecutiveStores(    // Find acceptable loads. Loads need to have the same chain (token factor),    // must not be zext, volatile, indexed, and they must be consecutive.    BaseIndexOffset LdBasePtr; -  for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { +  for (unsigned i = 0; i < NumConsecutiveStores; ++i) {      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);      LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());      if (!Ld) break; @@ -11942,7 +12563,7 @@ bool DAGCombiner::MergeConsecutiveStores(      }      // We found a potential memory operand to merge. -    LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0)); +    LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));    }    if (LoadNodes.size() < 2) @@ -11954,7 +12575,9 @@ bool DAGCombiner::MergeConsecutiveStores(    if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&        St->getAlignment() >= RequiredAlignment)      return false; - +  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +  unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +  unsigned FirstStoreAlign = FirstInChain->getAlignment();    LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);    unsigned FirstLoadAS = FirstLoad->getAddressSpace();    unsigned FirstLoadAlign = FirstLoad->getAlignment(); @@ -12023,31 +12646,12 @@ bool DAGCombiner::MergeConsecutiveStores(    // We add +1 here because the LastXXX variables refer to location while    // the NumElem refers to array/index size. -  unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1; +  unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);    NumElem = std::min(LastLegalType, NumElem);    if (NumElem < 2)      return false; -  // Collect the chains from all merged stores. -  SmallVector<SDValue, 8> MergeStoreChains; -  MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); - -  // The latest Node in the DAG. -  unsigned LatestNodeUsed = 0; -  for (unsigned i=1; i<NumElem; ++i) { -    // Find a chain for the new wide-store operand. Notice that some -    // of the store nodes that we found may not be selected for inclusion -    // in the wide store. The chain we use needs to be the chain of the -    // latest store node which is *used* and replaced by the wide store. -    if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) -      LatestNodeUsed = i; - -    MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); -  } - -  LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; -    // Find if it is better to use vectors or integers to load and store    // to memory.    
EVT JointMemOpVT; @@ -12067,8 +12671,9 @@ bool DAGCombiner::MergeConsecutiveStores(                                  FirstLoad->getBasePtr(),                                  FirstLoad->getPointerInfo(), FirstLoadAlign); -  SDValue NewStoreChain = -    DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); +  SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); + +  AddToWorklist(NewStoreChain.getNode());    SDValue NewStore =        DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), @@ -12081,25 +12686,9 @@ bool DAGCombiner::MergeConsecutiveStores(                                    SDValue(NewLoad.getNode(), 1));    } -  if (UseAA) { -    // Replace the all stores with the new store. -    for (unsigned i = 0; i < NumElem; ++i) -      CombineTo(StoreNodes[i].MemNode, NewStore); -  } else { -    // Replace the last store with the new store. -    CombineTo(LatestOp, NewStore); -    // Erase all other stores. -    for (unsigned i = 0; i < NumElem; ++i) { -      // Remove all Store nodes. -      if (StoreNodes[i].MemNode == LatestOp) -        continue; -      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -      DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); -      deleteAndRecombine(St); -    } -  } - -  StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end()); +  // Replace the all stores with the new store. +  for (unsigned i = 0; i < NumElem; ++i) +    CombineTo(StoreNodes[i].MemNode, NewStore);    return true;  } @@ -12256,19 +12845,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {    if (SDValue NewST = TransformFPLoadStorePair(N))      return NewST; -  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA -                                                  : DAG.getSubtarget().useAA(); -#ifndef NDEBUG -  if (CombinerAAOnlyFunc.getNumOccurrences() && -      CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) -    UseAA = false; -#endif -  if (UseAA && ST->isUnindexed()) { -    // FIXME: We should do this even without AA enabled. AA will just allow -    // FindBetterChain to work in more situations. The problem with this is that -    // any combine that expects memory operations to be on consecutive chains -    // first needs to be updated to look for users of the same chain. - +  if (ST->isUnindexed()) {      // Walk up chain skipping non-aliasing memory nodes, on this store and any      // adjacent stores.      if (findBetterNeighborChains(ST)) { @@ -12302,8 +12879,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {      if (SimplifyDemandedBits(              Value,              APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), -                                 ST->getMemoryVT().getScalarSizeInBits()))) +                                 ST->getMemoryVT().getScalarSizeInBits()))) { +      // Re-visit the store if anything changed and the store hasn't been merged +      // with another node (N is deleted) SimplifyDemandedBits will add Value's +      // node back to the worklist if necessary, but we also need to re-visit +      // the Store node itself. +      if (N->getOpcode() != ISD::DELETED_NODE) +        AddToWorklist(N);        return SDValue(N, 0); +    }    }    // If this is a load followed by a store to the same location, then the store @@ -12347,15 +12931,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {        // There can be multiple store sequences on the same chain.        
// Keep trying to merge store sequences until we are unable to do so        // or until we merge the last store on the chain. -      SmallVector<MemOpLink, 8> StoreNodes; -      bool Changed = MergeConsecutiveStores(ST, StoreNodes); +      bool Changed = MergeConsecutiveStores(ST);        if (!Changed) break; - -      if (any_of(StoreNodes, -                 [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) { -        // ST has been merged and no longer exists. +      // Return N as merge only uses CombineTo and no worklist clean +      // up is necessary. +      if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))          return SDValue(N, 0); -      }      }    } @@ -12364,7 +12945,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {    // Make sure to do this only after attempting to merge stores in order to    //  avoid changing the types of some subset of stores due to visit order,    //  preventing their merging. -  if (isa<ConstantFPSDNode>(Value)) { +  if (isa<ConstantFPSDNode>(ST->getValue())) {      if (SDValue NewSt = replaceStoreOfFPConstant(ST))        return NewSt;    } @@ -12493,10 +13074,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {    EVT VT = InVec.getValueType(); -  // If we can't generate a legal BUILD_VECTOR, exit -  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) -    return SDValue(); -    // Check that we know which element is being inserted    if (!isa<ConstantSDNode>(EltNo))      return SDValue(); @@ -12523,6 +13100,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {      }    } +  // If we can't generate a legal BUILD_VECTOR, exit +  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) +    return SDValue(); +    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the    // vector elements. @@ -12544,11 +13125,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {      // All the operands of BUILD_VECTOR must have the same type;      // we enforce that here.      EVT OpVT = Ops[0].getValueType(); -    if (InVal.getValueType() != OpVT) -      InVal = OpVT.bitsGT(InVal.getValueType()) ? -                DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : -                DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); -    Ops[Elt] = InVal; +    Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;    }    // Return the new vector @@ -12568,6 +13145,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))      return SDValue(); +  ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? +    ISD::NON_EXTLOAD : ISD::EXTLOAD; +  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) +    return SDValue(); +    Align = NewAlign;    SDValue NewPtr = OriginalLoad->getBasePtr(); @@ -12639,6 +13221,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {    EVT VT = InVec.getValueType();    EVT NVT = N->getValueType(0); +  if (InVec.isUndef()) +    return DAG.getUNDEF(NVT); +    if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {      // Check if the result type doesn't match the inserted element type. 
A      // SCALAR_TO_VECTOR may truncate the inserted element and the @@ -13022,7 +13607,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {    return DAG.getNode(Opcode, DL, VT, BV);  } -SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N, +SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,                                             ArrayRef<int> VectorMask,                                             SDValue VecIn1, SDValue VecIn2,                                             unsigned LeftIdx) { @@ -13300,6 +13885,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {    if (ISD::allOperandsUndef(N))      return DAG.getUNDEF(VT); +  // Check if we can express BUILD VECTOR via subvector extract. +  if (!LegalTypes && (N->getNumOperands() > 1)) { +    SDValue Op0 = N->getOperand(0); +    auto checkElem = [&](SDValue Op) -> uint64_t { +      if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && +          (Op0.getOperand(0) == Op.getOperand(0))) +        if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) +          return CNode->getZExtValue(); +      return -1; +    }; + +    int Offset = checkElem(Op0); +    for (unsigned i = 0; i < N->getNumOperands(); ++i) { +      if (Offset + i != checkElem(N->getOperand(i))) { +        Offset = -1; +        break; +      } +    } + +    if ((Offset == 0) && +        (Op0.getOperand(0).getValueType() == N->getValueType(0))) +      return Op0.getOperand(0); +    if ((Offset != -1) && +        ((Offset % N->getValueType(0).getVectorNumElements()) == +         0)) // IDX must be multiple of output size. +      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), +                         Op0.getOperand(0), Op0.getOperand(1)); +  } +    if (SDValue V = reduceBuildVecExtToExtBuildVec(N))      return V; @@ -13491,8 +14105,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {        if (!SclTy.isFloatingPoint() && !SclTy.isInteger())          return SDValue(); -      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, -                                 VT.getSizeInBits() / SclTy.getSizeInBits()); +      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); +      if (VNTNumElms < 2) +        return SDValue(); + +      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);        if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))          return SDValue(); @@ -13611,15 +14228,19 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {    EVT NVT = N->getValueType(0);    SDValue V = N->getOperand(0); -  if (V->getOpcode() == ISD::CONCAT_VECTORS) { -    // Combine: -    //    (extract_subvec (concat V1, V2, ...), i) -    // Into: -    //    Vi if possible -    // Only operand 0 is checked as 'concat' assumes all inputs of the same -    // type. -    if (V->getOperand(0).getValueType() != NVT) -      return SDValue(); +  // Extract from UNDEF is UNDEF. +  if (V.isUndef()) +    return DAG.getUNDEF(NVT); + +  // Combine: +  //    (extract_subvec (concat V1, V2, ...), i) +  // Into: +  //    Vi if possible +  // Only operand 0 is checked as 'concat' assumes all inputs of the same +  // type. 
+  if (V->getOpcode() == ISD::CONCAT_VECTORS && +      isa<ConstantSDNode>(N->getOperand(1)) && +      V->getOperand(0).getValueType() == NVT) {      unsigned Idx = N->getConstantOperandVal(1);      unsigned NumElems = NVT.getVectorNumElements();      assert((Idx % NumElems) == 0 && @@ -13633,19 +14254,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {    if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {      // Handle only simple case where vector being inserted and vector -    // being extracted are of same type, and are half size of larger vectors. -    EVT BigVT = V->getOperand(0).getValueType(); +    // being extracted are of same size.      EVT SmallVT = V->getOperand(1).getValueType(); -    if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits()) +    if (!NVT.bitsEq(SmallVT))        return SDValue(); -    // Only handle cases where both indexes are constants with the same type. +    // Only handle cases where both indexes are constants.      ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));      ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2)); -    if (InsIdx && ExtIdx && -        InsIdx->getValueType(0).getSizeInBits() <= 64 && -        ExtIdx->getValueType(0).getSizeInBits() <= 64) { +    if (InsIdx && ExtIdx) {        // Combine:        //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)        // Into: @@ -13892,6 +14510,113 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,    return DAG.getBuildVector(VT, SDLoc(SVN), Ops);  } +// Match shuffles that can be converted to any_vector_extend_in_reg. +// This is often generated during legalization. +// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) +// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. +SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, +                                     SelectionDAG &DAG, +                                     const TargetLowering &TLI, +                                     bool LegalOperations) { +  EVT VT = SVN->getValueType(0); +  bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + +  // TODO Add support for big-endian when we have a test case. +  if (!VT.isInteger() || IsBigEndian) +    return SDValue(); + +  unsigned NumElts = VT.getVectorNumElements(); +  unsigned EltSizeInBits = VT.getScalarSizeInBits(); +  ArrayRef<int> Mask = SVN->getMask(); +  SDValue N0 = SVN->getOperand(0); + +  // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) +  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { +    for (unsigned i = 0; i != NumElts; ++i) { +      if (Mask[i] < 0) +        continue; +      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) +        continue; +      return false; +    } +    return true; +  }; + +  // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for +  // power-of-2 extensions as they are the most likely. 
+  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { +    if (!isAnyExtend(Scale)) +      continue; + +    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); +    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); +    if (!LegalOperations || +        TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) +      return DAG.getBitcast(VT, +                            DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT)); +  } + +  return SDValue(); +} + +// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of +// each source element of a large type into the lowest elements of a smaller +// destination type. This is often generated during legalization. +// If the source node itself was a '*_extend_vector_inreg' node then we should +// then be able to remove it. +SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { +  EVT VT = SVN->getValueType(0); +  bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + +  // TODO Add support for big-endian when we have a test case. +  if (!VT.isInteger() || IsBigEndian) +    return SDValue(); + +  SDValue N0 = SVN->getOperand(0); +  while (N0.getOpcode() == ISD::BITCAST) +    N0 = N0.getOperand(0); + +  unsigned Opcode = N0.getOpcode(); +  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && +      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && +      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) +    return SDValue(); + +  SDValue N00 = N0.getOperand(0); +  ArrayRef<int> Mask = SVN->getMask(); +  unsigned NumElts = VT.getVectorNumElements(); +  unsigned EltSizeInBits = VT.getScalarSizeInBits(); +  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); + +  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> +  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> +  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> +  auto isTruncate = [&Mask, &NumElts](unsigned Scale) { +    for (unsigned i = 0; i != NumElts; ++i) { +      if (Mask[i] < 0) +        continue; +      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) +        continue; +      return false; +    } +    return true; +  }; + +  // At the moment we just handle the case where we've truncated back to the +  // same size as before the extension. +  // TODO: handle more extension/truncation cases as cases arise. +  if (EltSizeInBits != ExtSrcSizeInBits) +    return SDValue(); + +  // Attempt to match a 'truncate_vector_inreg' shuffle, we just search for +  // power-of-2 truncations as they are the most likely. +  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) +    if (isTruncate(Scale)) +      return DAG.getBitcast(VT, N00); + +  return SDValue(); +} +  SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {    EVT VT = N->getValueType(0);    unsigned NumElts = VT.getVectorNumElements(); @@ -13996,6 +14721,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {    if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))      return S; +  // Match shuffles that can be converted to any_vector_extend_in_reg. +  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) +    return V; + +  // Combine "truncate_vector_in_reg" style shuffles. 
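Note: the two matchers above reduce to a pair of shuffle-mask predicates. Restated as standalone C++ (a sketch, not the patch's code) together with the example masks from the comments, where -1 marks an undef lane:

    #include <cassert>
    #include <vector>

    // Mask that behaves like an any-extend-in-reg by factor Scale: every
    // defined lane must satisfy i % Scale == 0 and Mask[i] == i / Scale.
    static bool isAnyExtendMask(const std::vector<int> &Mask, unsigned Scale) {
      for (unsigned i = 0; i != Mask.size(); ++i)
        if (Mask[i] >= 0 && !((i % Scale) == 0 && Mask[i] == (int)(i / Scale)))
          return false;
      return true;
    }

    // Mask that behaves like a truncate-in-reg by factor Scale: every defined
    // lane must satisfy i * Scale < NumElts and Mask[i] == i * Scale.
    static bool isTruncateMask(const std::vector<int> &Mask, unsigned Scale) {
      for (unsigned i = 0; i != Mask.size(); ++i)
        if (Mask[i] >= 0 &&
            !(i * Scale < Mask.size() && Mask[i] == (int)(i * Scale)))
          return false;
      return true;
    }

    int main() {
      assert(isAnyExtendMask({0, -1, 1, -1}, 2)); // v2i64 any_extend_vector_inreg of v4i32
      assert(isTruncateMask({0, 2, -1, -1}, 2));  // v4i32 truncate_vector_inreg of v2i64
      return 0;
    }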
+  if (SDValue V = combineTruncationShuffle(SVN, DAG)) +    return V; +    if (N0.getOpcode() == ISD::CONCAT_VECTORS &&        Level < AfterLegalizeVectorOps &&        (N1.isUndef() || @@ -14253,6 +14986,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {    SDValue N1 = N->getOperand(1);    SDValue N2 = N->getOperand(2); +  // If inserting an UNDEF, just return the original vector. +  if (N1.isUndef()) +    return N0; + +  // If this is an insert of an extracted vector into an undef vector, we can +  // just use the input to the extract. +  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && +      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) +    return N1.getOperand(0); +    // Combine INSERT_SUBVECTORs where we are inserting to the same index.    // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )    // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) @@ -14262,26 +15005,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),                         N1, N2); -  if (N0.getValueType() != N1.getValueType()) +  if (!isa<ConstantSDNode>(N2))      return SDValue(); +  unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); + +  // Canonicalize insert_subvector dag nodes. +  // Example: +  // (insert_subvector (insert_subvector A, Idx0), Idx1) +  // -> (insert_subvector (insert_subvector A, Idx1), Idx0) +  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && +      N1.getValueType() == N0.getOperand(1).getValueType() && +      isa<ConstantSDNode>(N0.getOperand(2))) { +    unsigned OtherIdx = cast<ConstantSDNode>(N0.getOperand(2))->getZExtValue(); +    if (InsIdx < OtherIdx) { +      // Swap nodes. +      SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, +                                  N0.getOperand(0), N1, N2); +      AddToWorklist(NewOp.getNode()); +      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), +                         VT, NewOp, N0.getOperand(1), N0.getOperand(2)); +    } +  } +    // If the input vector is a concatenation, and the insert replaces -  // one of the halves, we can optimize into a single concat_vectors. -  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 && -      N2.getOpcode() == ISD::Constant) { -    APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue(); +  // one of the pieces, we can optimize into a single concat_vectors. 
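Note: the fold described in the comment above amounts to overwriting one piece of the concatenation in place. A rough scalar model (plain C++; Factor is the per-piece element count, and the index is assumed to be an in-range multiple of Factor, as the legality checks in the code below guarantee):

    #include <vector>

    // Model of (insert_subvector (concat_vectors P0, P1, ...), Sub, Idx):
    // when Sub has the same type as each piece, the insert simply replaces
    // piece number Idx / Factor.
    static void insertIntoConcat(std::vector<std::vector<int>> &Pieces,
                                 const std::vector<int> &Sub, unsigned Idx) {
      unsigned Factor = Sub.size();
      Pieces[Idx / Factor] = Sub;
    }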
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && +      N0.getOperand(0).getValueType() == N1.getValueType()) { +    unsigned Factor = N1.getValueType().getVectorNumElements(); -    // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) -> -    // (concat_vectors Z, Y) -    if (InsIdx == 0) -      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1, -                         N0.getOperand(1)); +    SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); +    Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; -    // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) -> -    // (concat_vectors X, Z) -    if (InsIdx == VT.getVectorNumElements() / 2) -      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0), -                         N1); +    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);    }    return SDValue(); @@ -15257,7 +16013,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,    if (Base.getOpcode() == ISD::ADD) {      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {        Base = Base.getOperand(0); -      Offset += C->getZExtValue(); +      Offset += C->getSExtValue();      }    } @@ -15454,6 +16210,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,        ++Depth;        break; +    case ISD::CopyFromReg: +      // Forward past CopyFromReg. +      Chains.push_back(Chain.getOperand(0)); +      ++Depth; +      break; +      default:        // For all other instructions we will just have to take what we can get.        Aliases.push_back(Chain); @@ -15482,6 +16244,18 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {    return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);  } +// This function tries to collect a bunch of potentially interesting +// nodes to improve the chains of, all at once. This might seem +// redundant, as this function gets called when visiting every store +// node, so why not let the work be done on each store as it's visited? +// +// I believe this is mainly important because MergeConsecutiveStores +// is unable to deal with merging stores of different sizes, so unless +// we improve the chains of all the potential candidates up-front +// before running MergeConsecutiveStores, it might only see some of +// the nodes that will eventually be candidates, and then not be able +// to go from a partially-merged state to the desired final +// fully-merged state.  bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {    // This holds the base pointer, index, and the offset in bytes from the base    // pointer. @@ -15517,10 +16291,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {      if (!Ptr.equalBaseIndex(BasePtr))        break; -    // Find the next memory operand in the chain. If the next operand in the -    // chain is a store then move up and continue the scan with the next -    // memory operand. If the next operand is a load save it and use alias -    // information to check if it interferes with anything. +    // Walk up the chain to find the next store node, ignoring any +    // intermediate loads. Any other kind of node will halt the loop.      
SDNode *NextInChain = Index->getChain().getNode();      while (true) {        if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { @@ -15539,9 +16311,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {          Index = nullptr;          break;        } -    } +    } // end while    } +  // At this point, ChainedStores lists all of the Store nodes +  // reachable by iterating up through chain nodes matching the above +  // conditions.  For each such store identified, try to find an +  // earlier chain to attach the store to which won't violate the +  // required ordering.    bool MadeChangeToSt = false;    SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index e2f33bb433ba..0584ab9f60d1 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1,4 +1,4 @@ -//===-- FastISel.cpp - Implementation of the FastISel class ---------------===// +//===- FastISel.cpp - Implementation of the FastISel class ----------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -39,35 +39,76 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/Loads.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/CodeGen/Analysis.h"  #include "llvm/CodeGen/FastISel.h"  #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h"  #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h"  #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h"  #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetLowering.h"  
#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> +  using namespace llvm;  #define DEBUG_TYPE "isel" @@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "                                      "target-specific selector");  STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); -void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, -                                           unsigned AttrIdx) { -  IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); -  IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); -  IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); -  IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); -  IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); -  IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); -  IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); -  IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); -  IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); -  IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); -  Alignment = CS->getParamAlignment(AttrIdx); -} -  /// Set the current block to which generated machine instructions will be  /// appended, and clear the local CSE map.  void FastISel::startNewBlock() { @@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {        // Try to emit the constant by using an integer constant with a cast.        const APFloat &Flt = CF->getValueAPF();        EVT IntVT = TLI.getPointerTy(DL); - -      uint64_t x[2];        uint32_t IntBitWidth = IntVT.getSizeInBits(); +      APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);        bool isExact; -      (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, -                                 APFloat::rmTowardZero, &isExact); +      (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);        if (isExact) { -        APInt IntVal(IntBitWidth, x); -          unsigned IntegerReg = -            getRegForValue(ConstantInt::get(V->getContext(), IntVal)); +            getRegForValue(ConstantInt::get(V->getContext(), SIntVal));          if (IntegerReg != 0)            Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,                             /*Kill=*/false); @@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) {    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,                                      TII.get(TargetOpcode::STACKMAP));    for (auto const &MO : Ops) -    MIB.addOperand(MO); +    MIB.add(MO);    // Issue CALLSEQ_END    unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); @@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,    Args.reserve(NumArgs);    // Populate the argument list. -  // Attributes for args start at offset 1, after the return attribute.    
ImmutableCallSite CS(CI); -  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; -       ArgI != ArgE; ++ArgI) { +  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) {      Value *V = CI->getOperand(ArgI);      assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); @@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,      ArgListEntry Entry;      Entry.Val = V;      Entry.Ty = V->getType(); -    Entry.setAttributes(&CS, AttrI); +    Entry.setAttributes(&CS, ArgIdx);      Args.push_back(Entry);    } @@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {                                      TII.get(TargetOpcode::PATCHPOINT));    for (auto &MO : Ops) -    MIB.addOperand(MO); +    MIB.add(MO);    MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI); @@ -841,9 +861,9 @@ bool FastISel::selectPatchpoint(const CallInst *I) {    return true;  } -/// Returns an AttributeSet representing the attributes applied to the return +/// Returns an AttributeList representing the attributes applied to the return  /// value of the given call. -static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) {    SmallVector<Attribute::AttrKind, 2> Attrs;    if (CLI.RetSExt)      Attrs.push_back(Attribute::SExt); @@ -852,8 +872,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {    if (CLI.IsInReg)      Attrs.push_back(Attribute::InReg); -  return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, -                           Attrs); +  return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, +                            Attrs);  }  bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName, @@ -885,9 +905,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,      ArgListEntry Entry;      Entry.Val = V;      Entry.Ty = V->getType(); -    Entry.setAttributes(&CS, ArgI + 1); +    Entry.setAttributes(&CS, ArgI);      Args.push_back(Entry);    } +  TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args);    CallLoweringInfo CLI;    CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs); @@ -1021,7 +1042,7 @@ bool FastISel::lowerCall(const CallInst *CI) {      Entry.Ty = V->getType();      // Skip the first return-type Attribute to get to params. -    Entry.setAttributes(&CS, i - CS.arg_begin() + 1); +    Entry.setAttributes(&CS, i - CS.arg_begin());      Args.push_back(Entry);    } @@ -1149,7 +1170,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {        } else          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,                  TII.get(TargetOpcode::DBG_VALUE)) -            .addOperand(*Op) +            .add(*Op)              .addImm(0)              .addMetadata(DI->getVariable())              .addMetadata(DI->getExpression()); @@ -1362,7 +1383,7 @@ bool FastISel::selectInstruction(const Instruction *I) {    if (const auto *Call = dyn_cast<CallInst>(I)) {      const Function *F = Call->getCalledFunction(); -    LibFunc::Func Func; +    LibFunc Func;      // As a special case, don't handle calls to builtin library functions that      // may be translated directly to target instructions. 
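Note: earlier in this file's diff, materializeConstant switches from a raw uint64_t[2] buffer to APFloat::convertToInteger writing into an APSInt. A minimal illustration of that API is below (assuming the LLVM ADT headers are available; the helper name is ours, not FastISel's):

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/APSInt.h"

    // Returns true and fills Out when F converts exactly to a BitWidth-bit
    // signed integer; this mirrors the check FastISel performs before
    // materializing the constant as an integer followed by a SINT_TO_FP.
    static bool convertsExactlyToSignedInt(const llvm::APFloat &F,
                                           unsigned BitWidth,
                                           llvm::APSInt &Out) {
      Out = llvm::APSInt(BitWidth, /*isUnsigned=*/false);
      bool IsExact = false;
      (void)F.convertToInteger(Out, llvm::APFloat::rmTowardZero, &IsExact);
      return IsExact;
    }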
@@ -1665,7 +1686,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,        TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),        SkipTargetIndependentISel(SkipTargetIndependentISel) {} -FastISel::~FastISel() {} +FastISel::~FastISel() = default;  bool FastISel::fastLowerArguments() { return false; } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 4a9042cfb3f4..e85d1951e3ae 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -235,7 +235,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,      if (II.OpInfo[i].isOptionalDef()) {        // Optional def must be a physical register. -      unsigned NumResults = CountResults(Node);        VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();        assert(TargetRegisterInfo::isPhysicalRegister(VRBase));        MIB.addReg(VRBase, RegState::Define); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b0028252836a..fc7cd020fe2e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1192,8 +1192,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {        // If the index is dependent on the store we will introduce a cycle when        // creating the load (the load uses the index, and by replacing the chain -      // we will make the index dependent on the load). -      if (SDNode::hasPredecessorHelper(ST, Visited, Worklist)) +      // we will make the index dependent on the load). Also, the store might be +      // dependent on the extractelement and introduce a cycle when creating  +      // the load. +      if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || +          ST->hasPredecessor(Op.getNode()))          continue;        StackPtr = ST->getBasePtr(); @@ -1909,8 +1912,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());      Entry.Node = Op;      Entry.Ty = ArgTy; -    Entry.isSExt = isSigned; -    Entry.isZExt = !isSigned; +    Entry.IsSExt = isSigned; +    Entry.IsZExt = !isSigned;      Args.push_back(Entry);    }    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1935,9 +1938,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,      InChain = TCChain;    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned); +  CLI.setDebugLoc(SDLoc(Node)) +      .setChain(InChain) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, +                    std::move(Args)) +      .setTailCall(isTailCall) +      .setSExtResult(isSigned) +      .setZExtResult(!isSigned);    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1960,8 +1967,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,    for (unsigned i = 0; i != NumOps; ++i) {      Entry.Node = Ops[i];      Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); -    Entry.isSExt = isSigned; -    Entry.isZExt = !isSigned; +    Entry.IsSExt = isSigned; +    Entry.IsZExt = !isSigned;      Args.push_back(Entry);    }    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1970,9 +1977,12 @@ SDValue 
SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,    Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setSExtResult(isSigned).setZExtResult(!isSigned); +  CLI.setDebugLoc(dl) +      .setChain(DAG.getEntryNode()) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, +                    std::move(Args)) +      .setSExtResult(isSigned) +      .setZExtResult(!isSigned);    std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1994,8 +2004,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());      Entry.Node = Node->getOperand(i);      Entry.Ty = ArgTy; -    Entry.isSExt = isSigned; -    Entry.isZExt = !isSigned; +    Entry.IsSExt = isSigned; +    Entry.IsZExt = !isSigned;      Args.push_back(Entry);    }    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2004,9 +2014,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,    Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setSExtResult(isSigned).setZExtResult(!isSigned); +  CLI.setDebugLoc(SDLoc(Node)) +      .setChain(InChain) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, +                    std::move(Args)) +      .setSExtResult(isSigned) +      .setZExtResult(!isSigned);    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2081,8 +2094,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());      Entry.Node = Op;      Entry.Ty = ArgTy; -    Entry.isSExt = isSigned; -    Entry.isZExt = !isSigned; +    Entry.IsSExt = isSigned; +    Entry.IsZExt = !isSigned;      Args.push_back(Entry);    } @@ -2090,8 +2103,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,    SDValue FIPtr = DAG.CreateStackTemporary(RetVT);    Entry.Node = FIPtr;    Entry.Ty = RetTy->getPointerTo(); -  Entry.isSExt = isSigned; -  Entry.isZExt = !isSigned; +  Entry.IsSExt = isSigned; +  Entry.IsZExt = !isSigned;    Args.push_back(Entry);    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2099,9 +2112,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,    SDLoc dl(Node);    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(dl).setChain(InChain) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setSExtResult(isSigned).setZExtResult(!isSigned); +  CLI.setDebugLoc(dl) +      .setChain(InChain) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, +                    std::move(Args)) +      .setSExtResult(isSigned) +      .setZExtResult(!isSigned);    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2185,24 +2201,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,    // Pass the argument.    Entry.Node = Node->getOperand(0);    Entry.Ty = RetTy; -  Entry.isSExt = false; -  Entry.isZExt = false; +  Entry.IsSExt = false; +  Entry.IsZExt = false;    Args.push_back(Entry);    // Pass the return address of sin.    
SDValue SinPtr = DAG.CreateStackTemporary(RetVT);    Entry.Node = SinPtr;    Entry.Ty = RetTy->getPointerTo(); -  Entry.isSExt = false; -  Entry.isZExt = false; +  Entry.IsSExt = false; +  Entry.IsZExt = false;    Args.push_back(Entry);    // Also pass the return address of the cos.    SDValue CosPtr = DAG.CreateStackTemporary(RetVT);    Entry.Node = CosPtr;    Entry.Ty = RetTy->getPointerTo(); -  Entry.isSExt = false; -  Entry.isZExt = false; +  Entry.IsSExt = false; +  Entry.IsZExt = false;    Args.push_back(Entry);    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2210,9 +2226,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,    SDLoc dl(Node);    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(dl).setChain(InChain) -    .setCallee(TLI.getLibcallCallingConv(LC), -               Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); +  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( +      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, +      std::move(Args));    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2529,12 +2545,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {      APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);      APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);      for (unsigned J = 0; J != Sz; J += 8) { -      MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J)); -      MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J)); -      MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J)); -      MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J)); -      MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J)); -      MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J)); +      MaskHi4 = MaskHi4 | (0xF0ull << J); +      MaskLo4 = MaskLo4 | (0x0Full << J); +      MaskHi2 = MaskHi2 | (0xCCull << J); +      MaskLo2 = MaskLo2 | (0x33ull << J); +      MaskHi1 = MaskHi1 | (0xAAull << J); +      MaskLo1 = MaskLo1 | (0x55ull << J);      }      // BSWAP if the type is wider than a single byte. @@ -3091,7 +3107,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {                              TLI.getVectorIdxTy(DAG.getDataLayout()))));      } -    Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +    Tmp1 = DAG.getBuildVector(VT, dl, Ops);      // We may have changed the BUILD_VECTOR type. Cast it back to the Node type.      
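Note: the ExpandBITREVERSE hunk above builds the per-byte masks 0xF0/0x0F, 0xCC/0x33 and 0xAA/0x55 replicated across the whole type. On a plain uint64_t the same three swaps reverse the bits inside every byte; a following byte swap then finishes the expansion for types wider than one byte. A minimal sketch (plain C++, not the legalizer code):

    #include <cstdint>

    // Reverse the bits inside each byte of x: swap nibbles, then bit pairs,
    // then adjacent bits, exactly as the mask-building loop above does.
    static uint64_t reverseBitsPerByte(uint64_t x) {
      x = ((x & 0xF0F0F0F0F0F0F0F0ull) >> 4) | ((x & 0x0F0F0F0F0F0F0F0Full) << 4);
      x = ((x & 0xCCCCCCCCCCCCCCCCull) >> 2) | ((x & 0x3333333333333333ull) << 2);
      x = ((x & 0xAAAAAAAAAAAAAAAAull) >> 1) | ((x & 0x5555555555555555ull) << 1);
      return x;
    }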
Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);      Results.push_back(Tmp1); @@ -3790,8 +3806,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {        Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,                                      VT.getScalarType(), Ex, Sh));      } -    SDValue Result = -      DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars); + +    SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);      ReplaceNode(SDValue(Node, 0), Result);      break;    } @@ -3830,10 +3846,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {      TargetLowering::CallLoweringInfo CLI(DAG);      CLI.setDebugLoc(dl)          .setChain(Node->getOperand(0)) -        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), -                   DAG.getExternalSymbol("__sync_synchronize", -                                         TLI.getPointerTy(DAG.getDataLayout())), -                   std::move(Args)); +        .setLibCallee( +            CallingConv::C, Type::getVoidTy(*DAG.getContext()), +            DAG.getExternalSymbol("__sync_synchronize", +                                  TLI.getPointerTy(DAG.getDataLayout())), +            std::move(Args));      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -3870,10 +3887,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {      TargetLowering::CallLoweringInfo CLI(DAG);      CLI.setDebugLoc(dl)          .setChain(Node->getOperand(0)) -        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), -                   DAG.getExternalSymbol("abort", -                                         TLI.getPointerTy(DAG.getDataLayout())), -                   std::move(Args)); +        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), +                      DAG.getExternalSymbol( +                          "abort", TLI.getPointerTy(DAG.getDataLayout())), +                      std::move(Args));      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);      Results.push_back(CallResult.second); @@ -4424,8 +4441,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {        NewOps.push_back(Elt);      } -    SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); - +    SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);      Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));      break;    } diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 72b56d84d945..6f2b1b94ce46 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -459,7 +459,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {    if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {      Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);      if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) -      SoftenFloatResult(Op.getNode(), 0); +      AddToWorklist(Op.getNode());    }    if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { @@ -472,8 +472,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {    }    RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); -  if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) -    Op = GetSoftenedFloat(Op);    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");    return TLI.makeLibCall(DAG, LC, NVT, Op, 
false, SDLoc(N)).first;  } @@ -1054,15 +1052,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {  void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,                                                   SDValue &Hi) {    EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); -  assert(NVT.getSizeInBits() == integerPartWidth && +  assert(NVT.getSizeInBits() == 64 &&           "Do not know how to expand this float constant!");    APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();    SDLoc dl(N);    Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), -                                 APInt(integerPartWidth, C.getRawData()[1])), +                                 APInt(64, C.getRawData()[1])),                           dl, NVT);    Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), -                                 APInt(integerPartWidth, C.getRawData()[0])), +                                 APInt(64, C.getRawData()[0])),                           dl, NVT);  } diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index dc436ce04514..85068e890756 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -690,7 +690,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {    case TargetLowering::TypePromoteInteger:      Res = GetPromotedInteger(InOp);      break; -  case TargetLowering::TypeSplitVector: +  case TargetLowering::TypeSplitVector: {      EVT InVT = InOp.getValueType();      assert(InVT.isVector() && "Cannot split scalar types");      unsigned NumElts = InVT.getVectorNumElements(); @@ -709,6 +709,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {      return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);    } +  case TargetLowering::TypeWidenVector: { +    SDValue WideInOp = GetWidenedVector(InOp); + +    // Truncate widened InOp. +    unsigned NumElem = WideInOp.getValueType().getVectorNumElements(); +    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), +                                   N->getValueType(0).getScalarType(), NumElem); +    SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp); + +    // Zero extend so that the elements are of same type as those of NVT +    EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(), +                                 NumElem); +    SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc); + +    // Extract the low NVT subvector. +    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); +    SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy); +    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx); +  } +  }    // Truncate to NVT instead of VT    return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); @@ -1089,6 +1109,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {    SDValue Cond = N->getOperand(0);    EVT OpTy = N->getOperand(1).getValueType(); +  if (N->getOpcode() == ISD::VSELECT) +    if (SDValue Res = WidenVSELECTAndMask(N)) +      return Res; +    // Promote all the way up to the canonical SetCC type.    EVT OpVT = N->getOpcode() == ISD::SELECT ? 
OpTy.getScalarType() : OpTy;    Cond = PromoteTargetBoolean(Cond, OpVT); @@ -2586,24 +2610,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());      Entry.Node = Op;      Entry.Ty = ArgTy; -    Entry.isSExt = true; -    Entry.isZExt = false; +    Entry.IsSExt = true; +    Entry.IsZExt = false;      Args.push_back(Entry);    }    // Also pass the address of the overflow check.    Entry.Node = Temp;    Entry.Ty = PtrTy->getPointerTo(); -  Entry.isSExt = true; -  Entry.isZExt = false; +  Entry.IsSExt = true; +  Entry.IsZExt = false;    Args.push_back(Entry);    SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(dl).setChain(Chain) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) -    .setSExtResult(); +  CLI.setDebugLoc(dl) +      .setChain(Chain) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) +      .setSExtResult();    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cf19d75676cd..0a2b680e1c66 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -199,8 +199,7 @@ bool DAGTypeLegalizer::run() {    // non-leaves.    for (SDNode &Node : DAG.allnodes()) {      if (Node.getNumOperands() == 0) { -      Node.setNodeId(ReadyToProcess); -      Worklist.push_back(&Node); +      AddToWorklist(&Node);      } else {        Node.setNodeId(Unanalyzed);      } @@ -331,6 +330,12 @@ ScanOperands:      // to the worklist etc.      if (NeedsReanalyzing) {        assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?"); + +      // Remove any result values from SoftenedFloats as N will be revisited +      // again. +      for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) +        SoftenedFloats.erase(SDValue(N, i)); +        N->setNodeId(NewNode);        // Recompute the NodeId and correct processed operands, adding the node to        // the worklist if ready. @@ -749,6 +754,8 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {      // new uses of From due to CSE. If this happens, replace the new uses of      // From with To.    
} while (!From.use_empty()); + +  SoftenedFloats.erase(From);  }  void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { @@ -1077,8 +1084,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());      Entry.Node = Node->getOperand(i);      Entry.Ty = ArgTy; -    Entry.isSExt = isSigned; -    Entry.isZExt = !isSigned; +    Entry.IsSExt = isSigned; +    Entry.IsZExt = !isSigned;      Args.push_back(Entry);    }    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1087,9 +1094,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,    Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());    TargetLowering::CallLoweringInfo CLI(DAG); -  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) -    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setSExtResult(isSigned).setZExtResult(!isSigned); +  CLI.setDebugLoc(SDLoc(Node)) +      .setChain(InChain) +      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, +                    std::move(Args)) +      .setSExtResult(isSigned) +      .setZExtResult(!isSigned);    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index ec55662d75c0..80c939700518 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -191,6 +191,11 @@ private:    void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,                      SDValue &Lo, SDValue &Hi); +  void AddToWorklist(SDNode *N) { +    N->setNodeId(ReadyToProcess); +    Worklist.push_back(N); +  } +    //===--------------------------------------------------------------------===//    // Integer Promotion Support: LegalizeIntegerTypes.cpp    //===--------------------------------------------------------------------===// @@ -597,6 +602,7 @@ private:    SDValue ScalarizeVecRes_TernaryOp(SDNode *N);    SDValue ScalarizeVecRes_UnaryOp(SDNode *N);    SDValue ScalarizeVecRes_InregOp(SDNode *N); +  SDValue ScalarizeVecRes_VecInregOp(SDNode *N);    SDValue ScalarizeVecRes_BITCAST(SDNode *N);    SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); @@ -672,6 +678,7 @@ private:    SDValue SplitVecOp_BITCAST(SDNode *N);    SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);    SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); +  SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);    SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);    SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);    SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); @@ -713,6 +720,7 @@ private:    SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);    SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);    SDValue WidenVecRes_SELECT(SDNode* N); +  SDValue WidenVSELECTAndMask(SDNode *N);    SDValue WidenVecRes_SELECT_CC(SDNode* N);    SDValue WidenVecRes_SETCC(SDNode* N);    SDValue WidenVecRes_UNDEF(SDNode *N); @@ -782,6 +790,13 @@ private:    /// By default, the vector will be widened with undefined values.    SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); +  /// Return a mask of vector type MaskVT to replace InMask. Also adjust +  /// MaskVT to ToMaskVT if needed with vector extension or truncation. +  SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT); + +  /// Get the target mask VT, and widen if needed. 
+  EVT getSETCCWidenedResultTy(SDValue SetCC); +    //===--------------------------------------------------------------------===//    // Generic Splitting: LegalizeTypesGeneric.cpp    //===--------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 3682c32460c6..c02b8960b36c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -512,8 +512,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,    GetSplitOp(Op, Lo, Hi);  } -void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, -                                       SDValue &Hi) { +static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, +                                               SelectionDAG &DAG) { +  SDLoc DL(N); +  EVT LoVT, HiVT; +  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + +  // Split the inputs. +  SDValue Lo, Hi, LL, LH, RL, RH; +  std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); +  std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); + +  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); +  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + +  return std::make_pair(Lo, Hi); +} + +void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {    SDValue LL, LH, RL, RH, CL, CH;    SDLoc dl(N);    GetSplitOp(N->getOperand(1), LL, LH); @@ -522,9 +538,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,    SDValue Cond = N->getOperand(0);    CL = CH = Cond;    if (Cond.getValueType().isVector()) { +    if (SDValue Res = WidenVSELECTAndMask(N)) +      std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); +    // It seems to improve code to generate two narrow SETCCs as opposed to +    // splitting a wide result vector. +    else if (Cond.getOpcode() == ISD::SETCC) +      std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);      // Check if there are already splitted versions of the vector available and      // use those instead of splitting the mask operand again. 
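Note: SplitVSETCC above prefers emitting two narrow compares over one wide compare whose mask would then have to be split. A scalar model of why the two are interchangeable (plain C++, not the legalizer code):

    #include <vector>

    // Comparing the two halves separately produces the same lane results as
    // comparing the whole vectors and then splitting the mask.
    static void compareHalvesLess(const std::vector<int> &L,
                                  const std::vector<int> &R,
                                  std::vector<bool> &LoMask,
                                  std::vector<bool> &HiMask) {
      size_t Half = L.size() / 2;
      LoMask.clear();
      HiMask.clear();
      for (size_t i = 0; i < Half; ++i)
        LoMask.push_back(L[i] < R[i]);       // setcc on the low halves
      for (size_t i = Half; i < L.size(); ++i)
        HiMask.push_back(L[i] < R[i]);       // setcc on the high halves
    }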
-    if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) +    else if (getTypeAction(Cond.getValueType()) == +             TargetLowering::TypeSplitVector)        GetSplitVector(Cond, CL, CH);      else        std::tie(CL, CH) = DAG.SplitVector(Cond, dl); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d4fa20f35274..5f167f8de1cf 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer {    SDValue ExpandLoad(SDValue Op);    SDValue ExpandStore(SDValue Op);    SDValue ExpandFNEG(SDValue Op); +  SDValue ExpandFSUB(SDValue Op);    SDValue ExpandBITREVERSE(SDValue Op);    SDValue ExpandCTLZ(SDValue Op);    SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); @@ -621,8 +622,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {      }      NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); -    Value = DAG.getNode(ISD::BUILD_VECTOR, dl, -                        Op.getNode()->getValueType(0), Vals); +    Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals);    } else {      SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG); @@ -692,6 +692,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {      return ExpandUINT_TO_FLOAT(Op);    case ISD::FNEG:      return ExpandFNEG(Op); +  case ISD::FSUB: +    return ExpandFSUB(Op);    case ISD::SETCC:      return UnrollVSETCC(Op);    case ISD::BITREVERSE: @@ -720,8 +722,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {    assert(VT.isVector() && !Mask.getValueType().isVector()           && Op1.getValueType() == Op2.getValueType() && "Invalid type"); -  unsigned NumElem = VT.getVectorNumElements(); -    // If we can't even use the basic vector operations of    // AND,OR,XOR, we will have to scalarize the op.    // Notice that the operation may be 'promoted' which means that it is @@ -745,8 +745,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {            DAG.getConstant(0, DL, BitTy));    // Broadcast the mask so that the entire vector is all-one or all zero. -  SmallVector<SDValue, 8> Ops(NumElem, Mask); -  Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops); +  Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask);    // Bitcast the operands to be the same type as the mask.    // This is needed when we select between FP types because @@ -1025,6 +1024,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {    return DAG.UnrollVectorOp(Op.getNode());  } +SDValue VectorLegalizer::ExpandFSUB(SDValue Op) { +  // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, +  // we can defer this to operation legalization where it will be lowered as +  // a+(-b). 
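Note: the new ExpandFSUB above defers to a + (-b) whenever FNEG and FADD are legal for the vector type; for IEEE floats the two forms give the same numeric result lane by lane. A scalar model of the fallback path (plain C++, unrolled by hand):

    #include <array>

    // Lane-by-lane model of lowering a vector FSUB as FADD of an FNEG.
    static std::array<float, 4> fsubViaNegAdd(const std::array<float, 4> &A,
                                              const std::array<float, 4> &B) {
      std::array<float, 4> R;
      for (unsigned i = 0; i < 4; ++i)
        R[i] = A[i] + (-B[i]);
      return R;
    }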
+  EVT VT = Op.getValueType(); +  if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && +      TLI.isOperationLegalOrCustom(ISD::FADD, VT)) +    return Op; // Defer to LegalizeDAG + +  return DAG.UnrollVectorOp(Op.getNode()); +} +  SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {    EVT VT = Op.getValueType();    unsigned NumBitsPerElt = VT.getScalarSizeInBits(); @@ -1102,7 +1113,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {                                             (EltVT.getSizeInBits()), dl, EltVT),                             DAG.getConstant(0, dl, EltVT));    } -  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +  return DAG.getBuildVector(VT, dl, Ops);  }  } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6906f67ebacb..78fddb5ce8f5 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {    case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;    case ISD::UNDEF:             R = ScalarizeVecRes_UNDEF(N); break;    case ISD::VECTOR_SHUFFLE:    R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; +  case ISD::ANY_EXTEND_VECTOR_INREG: +  case ISD::SIGN_EXTEND_VECTOR_INREG: +  case ISD::ZERO_EXTEND_VECTOR_INREG: +    R = ScalarizeVecRes_VecInregOp(N); +    break;    case ISD::ANY_EXTEND:    case ISD::BITREVERSE:    case ISD::BSWAP: @@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {    case ISD::TRUNCATE:    case ISD::UINT_TO_FP:    case ISD::ZERO_EXTEND: +  case ISD::FCANONICALIZE:      R = ScalarizeVecRes_UnaryOp(N);      break; @@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {                       LHS, DAG.getValueType(ExtVT));  } +SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) { +  SDLoc DL(N); +  SDValue Op = N->getOperand(0); + +  EVT OpVT = Op.getValueType(); +  EVT OpEltVT = OpVT.getVectorElementType(); +  EVT EltVT = N->getValueType(0).getVectorElementType(); + +  if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { +    Op = GetScalarizedVector(Op); +  } else { +    Op = DAG.getNode( +        ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op, +        DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); +  } + +  switch (N->getOpcode()) { +  case ISD::ANY_EXTEND_VECTOR_INREG: +    return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op); +  case ISD::SIGN_EXTEND_VECTOR_INREG: +    return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op); +  case ISD::ZERO_EXTEND_VECTOR_INREG: +    return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op); +  } + +  llvm_unreachable("Illegal extend_vector_inreg opcode"); +} +  SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {    // If the operand is wider than the vector element type then it is implicitly    // truncated.  Make that explicit here. 
@@ -486,7 +520,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {    SmallVector<SDValue, 8> Ops(N->getNumOperands());    for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)      Ops[i] = GetScalarizedVector(N->getOperand(i)); -  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); +  return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);  }  /// If the input is a vector that needs to be scalarized, it must be <1 x ty>, @@ -637,6 +671,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {    case ISD::SINT_TO_FP:    case ISD::TRUNCATE:    case ISD::UINT_TO_FP: +  case ISD::FCANONICALIZE:      SplitVecRes_UnaryOp(N, Lo, Hi);      break; @@ -781,10 +816,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));    unsigned LoNumElts = LoVT.getVectorNumElements();    SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts); -  Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); +  Lo = DAG.getBuildVector(LoVT, dl, LoOps);    SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end()); -  Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); +  Hi = DAG.getBuildVector(HiVT, dl, HiOps);  }  void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, @@ -928,7 +963,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,    SDLoc dl(N);    SDValue InLo, InHi; -  GetSplitVector(N0, InLo, InHi); + +  if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector) +    GetSplitVector(N0, InLo, InHi); +  else +    std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0); +    EVT InLoVT = InLo.getValueType();    unsigned InNumElements = InLoVT.getVectorNumElements(); @@ -1372,7 +1412,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,        }        // Construct the Lo/Hi output using a BUILD_VECTOR. -      Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); +      Output = DAG.getBuildVector(NewVT, dl, SVOps);      } else if (InputUsed[0] == -1U) {        // No input vectors were used!  The result is undefined.        Output = DAG.getUNDEF(NewVT); @@ -1466,8 +1506,15 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {      case ISD::ZERO_EXTEND:      case ISD::ANY_EXTEND:      case ISD::FTRUNC: +    case ISD::FCANONICALIZE:        Res = SplitVecOp_UnaryOp(N);        break; + +    case ISD::ANY_EXTEND_VECTOR_INREG: +    case ISD::SIGN_EXTEND_VECTOR_INREG: +    case ISD::ZERO_EXTEND_VECTOR_INREG: +      Res = SplitVecOp_ExtVecInRegOp(N); +      break;      }    } @@ -1615,7 +1662,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {      EltVT = MVT::i8;      VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,                               VecVT.getVectorNumElements()); -    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps); +    Vec = DAG.getBuildVector(VecVT, dl, ElementOps);    }    // Store the vector to the stack. @@ -1629,6 +1676,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {                          MachinePointerInfo(), EltVT);  } +SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) { +  SDValue Lo, Hi; + +  // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so +  // splitting the result has the same effect as splitting the input operand. 
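For orientation (a standalone sketch, not patch code; the v8i8-to-v4i32 shape and array names are assumed for illustration), the *_EXTEND_VECTOR_INREG nodes handled here consume only the low input lanes, one per result element, which is why splitting the result is equivalent to splitting the operand:

#include <cassert>
#include <cstdint>

int main() {
  // SIGN_EXTEND_VECTOR_INREG modeled on v8i8 -> v4i32: only the low 4 input
  // lanes are used, each sign-extended to 32 bits; the high lanes are ignored.
  int8_t In[8] = {-1, 2, -3, 4, 99, 98, 97, 96};
  int32_t Out[4];
  for (int i = 0; i < 4; ++i)
    Out[i] = static_cast<int32_t>(In[i]); // implicit sign extension
  assert(Out[0] == -1 && Out[1] == 2 && Out[2] == -3 && Out[3] == 4);
  return 0;
}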
+  SplitVecRes_ExtVecInRegOp(N, Lo, Hi); + +  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); +} +  SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,                                               unsigned OpNo) {    EVT LoVT, HiVT; @@ -1881,7 +1938,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {      }    } -  return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts); +  return DAG.getBuildVector(N->getValueType(0), DL, Elts);  }  SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { @@ -2323,6 +2380,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {          return DAG.getNode(Opcode, DL, WidenVT, InOp);        return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);      } +    if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) { +      // If both input and result vector types are of same width, extend +      // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which +      // accepts fewer elements in the result than in the input. +      if (Opcode == ISD::SIGN_EXTEND) +        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT); +      if (Opcode == ISD::ZERO_EXTEND) +        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT); +    }    }    if (TLI.isTypeLegal(InWidenVT)) { @@ -2375,7 +2441,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {    for (; i < WidenNumElts; ++i)      Ops[i] = UndefVal; -  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); +  return DAG.getBuildVector(WidenVT, DL, Ops);  }  SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -2430,7 +2496,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {    while (Ops.size() != WidenNumElts)      Ops.push_back(DAG.getUNDEF(WidenSVT)); -  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); +  return DAG.getBuildVector(WidenVT, DL, Ops);  }  SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { @@ -2593,7 +2659,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {    assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");    NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); -  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps); +  return DAG.getBuildVector(WidenVT, dl, NewOps);  }  SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { @@ -2663,7 +2729,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {    SDValue UndefVal = DAG.getUNDEF(EltVT);    for (; Idx < WidenNumElts; ++Idx)      Ops[Idx] = UndefVal; -  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); +  return DAG.getBuildVector(WidenVT, dl, Ops);  }  SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { @@ -2704,7 +2770,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {    SDValue UndefVal = DAG.getUNDEF(EltVT);    for (; i < WidenNumElts; ++i)      Ops[i] = UndefVal; -  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); +  return DAG.getBuildVector(WidenVT, dl, Ops);  }  SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -2814,6 +2880,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {                       WidenVT, N->getOperand(0));  } +// Return true if this is a node that could have two SETCCs as operands. 
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return true;
+  }
+  return false;
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC already handled by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    N = N.getOperand(0);
+  else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+    for (unsigned i = 1; i < N->getNumOperands(); ++i)
+      if (!N->getOperand(i)->isUndef())
+        return false;
+    N = N.getOperand(0);
+  }
+
+  if (N.getOpcode() == ISD::TRUNCATE)
+    N = N.getOperand(0);
+  else if (N.getOpcode() == ISD::SIGN_EXTEND)
+    N = N.getOperand(0);
+
+  return (N.getOpcode() == ISD::SETCC);
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
+// to ToMaskVT if needed with vector extension or truncation.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+                                      EVT ToMaskVT) {
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Currently only a SETCC or an AND/OR/XOR with two SETCCs is handled.
+  unsigned InMaskOpc = InMask->getOpcode();
+  assert((InMaskOpc == ISD::SETCC ||
+          (isLogicalMaskOp(InMaskOpc) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
+         "Unexpected mask argument.");
+
+  // Make a new Mask node, with a legal result VT.
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+    Ops.push_back(InMask->getOperand(i));
+  SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);
+
+  // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+  // extend or truncate is needed.
+  unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+  unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+  if (MaskScalarBits < ToMaskScalBits) {
+    EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                 MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+  } else if (MaskScalarBits > ToMaskScalBits) {
+    EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                   MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+  }
+
+  assert(Mask->getValueType(0).getScalarSizeInBits() ==
+             ToMaskVT.getScalarSizeInBits() &&
+         "Mask should have the right element size by now.");
+
+  // Adjust Mask to the right number of elements.
+  unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+  if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+    Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+                       ZeroIdx);
+  } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+    unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+    EVT SubVT = Mask->getValueType(0);
+    SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+    SubConcatOps[0] = Mask;
+    for (unsigned i = 1; i < NumSubVecs; ++i)
+      SubConcatOps[i] = DAG.getUNDEF(SubVT);
+    Mask =
+        DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+  }
+
+  assert((Mask->getValueType(0) == ToMaskVT) &&
+         "A mask of ToMaskVT should have been produced by now.");
+
+  return Mask;
+}
+
+// Get the target mask VT, and widen if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+  assert(SetCC->getOpcode() == ISD::SETCC);
+  LLVMContext &Ctx = *DAG.getContext();
+  EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
+  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
+  return MaskVT;
+}
+
+// This method tries to handle VSELECT and its mask by legalizing operands
+// (which may require widening) and, if needed, adjusting the mask vector type
+// to match that of the VSELECT. Without it, many cases end up with
+// scalarization of the SETCC, with many unnecessary instructions.
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Cond = N->getOperand(0);
+
+  if (N->getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+    return SDValue();
+
+  // If this is a split VSELECT that was already handled, do nothing.
+  if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+    return SDValue();
+
+  EVT VSelVT = N->getValueType(0);
+  // Only handle vector types whose size in bits is a power of 2.
+  if (!isPowerOf2_64(VSelVT.getSizeInBits()))
+    return SDValue();
+
+  // Don't touch if this will be scalarized.
+  EVT FinalVT = VSelVT;
+  while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+    FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(),
+                               FinalVT.getVectorNumElements() / 2);
+  if (FinalVT.getVectorNumElements() == 1)
+    return SDValue();
+
+  // If there is support for an i1 vector mask, don't touch.
+  if (Cond.getOpcode() == ISD::SETCC) {
+    EVT SetCCOpVT = Cond->getOperand(0).getValueType();
+    while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+      SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+    EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+    if (SetCCResVT.getScalarSizeInBits() == 1)
+      return SDValue();
+  }
+
+  // Get the VT and operands for VSELECT, and widen if needed.
+  SDValue VSelOp1 = N->getOperand(1);
+  SDValue VSelOp2 = N->getOperand(2);
+  if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+    VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+    VSelOp1 = GetWidenedVector(VSelOp1);
+    VSelOp2 = GetWidenedVector(VSelOp2);
+  }
+
+  // The mask of the VSELECT should have integer elements.
+  EVT ToMaskVT = VSelVT; +  if (!ToMaskVT.getScalarType().isInteger()) +    ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger(); + +  SDValue Mask; +  if (Cond->getOpcode() == ISD::SETCC) { +    EVT MaskVT = getSETCCWidenedResultTy(Cond); +    Mask = convertMask(Cond, MaskVT, ToMaskVT); +  } else if (isLogicalMaskOp(Cond->getOpcode()) && +             Cond->getOperand(0).getOpcode() == ISD::SETCC && +             Cond->getOperand(1).getOpcode() == ISD::SETCC) { +    // Cond is (AND/OR/XOR (SETCC, SETCC)) +    SDValue SETCC0 = Cond->getOperand(0); +    SDValue SETCC1 = Cond->getOperand(1); +    EVT VT0 = getSETCCWidenedResultTy(SETCC0); +    EVT VT1 = getSETCCWidenedResultTy(SETCC1); +    unsigned ScalarBits0 = VT0.getScalarSizeInBits(); +    unsigned ScalarBits1 = VT1.getScalarSizeInBits(); +    unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits(); +    EVT MaskVT; +    // If the two SETCCs have different VTs, either extend/truncate one of +    // them to the other "towards" ToMaskVT, or truncate one and extend the +    // other to ToMaskVT. +    if (ScalarBits0 != ScalarBits1) { +      EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1); +      EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0); +      if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits()) +        MaskVT = WideVT; +      else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits()) +        MaskVT = NarrowVT; +      else +        MaskVT = ToMaskVT; +    } else +      // If the two SETCCs have the same VT, don't change it. +      MaskVT = VT0; + +    // Make new SETCCs and logical nodes. +    SETCC0 = convertMask(SETCC0, VT0, MaskVT); +    SETCC1 = convertMask(SETCC1, VT1, MaskVT); +    Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1); + +    // Convert the logical op for VSELECT if needed. 
+    Mask = convertMask(Cond, MaskVT, ToMaskVT); +  } else +    return SDValue(); + +  return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2); +} +  SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {    EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));    unsigned WidenNumElts = WidenVT.getVectorNumElements(); @@ -2821,6 +3093,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {    SDValue Cond1 = N->getOperand(0);    EVT CondVT = Cond1.getValueType();    if (CondVT.isVector()) { +    if (SDValue Res = WidenVSELECTAndMask(N)) +      return Res; +      EVT CondEltVT = CondVT.getVectorElementType();      EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),                                          CondEltVT, WidenNumElts); @@ -3093,7 +3368,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {              ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,              DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); -  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +  return DAG.getBuildVector(VT, dl, Ops);  }  SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { @@ -3144,7 +3419,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {            ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,            DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));    } -  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +  return DAG.getBuildVector(VT, dl, Ops);  }  SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { @@ -3565,10 +3840,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,    for (; i != WidenNumElts; ++i)      Ops[i] = UndefVal; -  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); +  return DAG.getBuildVector(WidenVT, dl, Ops);  } -  void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,                                              StoreSDNode *ST) {    // The strategy assumes that we can efficiently store power-of-two widths. 
@@ -3737,5 +4011,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,      DAG.getUNDEF(EltVT);    for ( ; Idx < WidenNumElts; ++Idx)      Ops[Idx] = FillVal; -  return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); +  return DAG.getBuildVector(NVT, dl, Ops);  } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ded8e68fcbce..a1d70ab6f036 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)    RegPressure.resize(NumRC);    std::fill(RegLimit.begin(), RegLimit.end(), 0);    std::fill(RegPressure.begin(), RegPressure.end(), 0); -  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), -                                             E = TRI->regclass_end(); -       I != E; ++I) -    RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); +  for (const TargetRegisterClass *RC : TRI->regclasses()) +    RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF);    ParallelLiveRanges = 0;    HorizontalVerticalBalance = 0; @@ -364,16 +362,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {      return RegBalance;    if (RawPressure) { -    for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), -             E = TRI->regclass_end(); I != E; ++I) { -      const TargetRegisterClass *RC = *I; +    for (const TargetRegisterClass *RC : TRI->regclasses())        RegBalance += rawRegPressureDelta(SU, RC->getID()); -    }    }    else { -    for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), -         E = TRI->regclass_end(); I != E; ++I) { -      const TargetRegisterClass *RC = *I; +    for (const TargetRegisterClass *RC : TRI->regclasses()) {        if ((RegPressure[RC->getID()] +             rawRegPressureDelta(SU, RC->getID()) > 0) &&            (RegPressure[RC->getID()] + diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 3549ccd9e345..e923e30e5037 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -422,11 +422,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner,      }      // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.      if (N->isMachineOpcode()) { -      if (N->getMachineOpcode() == -          (unsigned)TII->getCallFrameDestroyOpcode()) { +      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {          ++NestLevel; -      } else if (N->getMachineOpcode() == -                 (unsigned)TII->getCallFrameSetupOpcode()) { +      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {          if (NestLevel == 0)            return false;          --NestLevel; @@ -480,12 +478,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,      }      // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.      
if (N->isMachineOpcode()) { -      if (N->getMachineOpcode() == -          (unsigned)TII->getCallFrameDestroyOpcode()) { +      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {          ++NestLevel;          MaxNest = std::max(MaxNest, NestLevel); -      } else if (N->getMachineOpcode() == -                 (unsigned)TII->getCallFrameSetupOpcode()) { +      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {          assert(NestLevel != 0);          --NestLevel;          if (NestLevel == 0) @@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {    if (!LiveRegDefs[CallResource])      for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())        if (Node->isMachineOpcode() && -          Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { +          Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {          unsigned NestLevel = 0;          unsigned MaxNest = 0;          SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII); @@ -755,7 +751,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {      for (const SDNode *SUNode = SU->getNode(); SUNode;           SUNode = SUNode->getGluedNode()) {        if (SUNode->isMachineOpcode() && -          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { +          SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {          assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");          --NumLiveRegs;          LiveRegDefs[CallResource] = nullptr; @@ -826,7 +822,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {    for (const SDNode *SUNode = SU->getNode(); SUNode;         SUNode = SUNode->getGluedNode()) {      if (SUNode->isMachineOpcode() && -        SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { +        SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {        ++NumLiveRegs;        LiveRegDefs[CallResource] = SU;        LiveRegGens[CallResource] = CallSeqEndForStart[SU]; @@ -839,7 +835,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {      for (const SDNode *SUNode = SU->getNode(); SUNode;           SUNode = SUNode->getGluedNode()) {        if (SUNode->isMachineOpcode() && -          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { +          SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {          assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");          --NumLiveRegs;          LiveRegDefs[CallResource] = nullptr; @@ -1305,7 +1301,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {      // If we're in the middle of scheduling a call, don't begin scheduling      // another call. Also, don't allow any physical registers to be live across      // the call. -    if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { +    if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) || +        (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) {        // Check the special calling-sequence resource.        
unsigned CallResource = TRI->getNumRegs();        if (LiveRegDefs[CallResource]) { @@ -1659,9 +1656,8 @@ public:        RegPressure.resize(NumRC);        std::fill(RegLimit.begin(), RegLimit.end(), 0);        std::fill(RegPressure.begin(), RegPressure.end(), 0); -      for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), -             E = TRI->regclass_end(); I != E; ++I) -        RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF); +      for (const TargetRegisterClass *RC : TRI->regclasses()) +        RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF);      }    } @@ -1788,7 +1784,7 @@ public:    }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -  void dump(ScheduleDAG *DAG) const override { +  LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override {      // Emulate pop() without clobbering NodeQueueIds.      std::vector<SUnit*> DumpQueue = Queue;      SF DumpPicker = Picker; @@ -1924,19 +1920,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {  //                     Register Pressure Tracking  //===----------------------------------------------------------------------===// -void RegReductionPQBase::dumpRegPressure() const {  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), -         E = TRI->regclass_end(); I != E; ++I) { -    const TargetRegisterClass *RC = *I; +LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const { +  for (const TargetRegisterClass *RC : TRI->regclasses()) {      unsigned Id = RC->getID();      unsigned RP = RegPressure[Id];      if (!RP) continue;      DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "            << RegLimit[Id] << '\n');    } -#endif  } +#endif  bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {    if (!TLI) @@ -2092,7 +2086,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {        RegPressure[RCId] -= Cost;      }    } -  dumpRegPressure(); +  DEBUG(dumpRegPressure());  }  void RegReductionPQBase::unscheduledNode(SUnit *SU) { @@ -2172,7 +2166,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {      }    } -  dumpRegPressure(); +  DEBUG(dumpRegPressure());  }  //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3be622f8c179..3c8526ebb702 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,  }  void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { +  // Cannot completely remove virtual function even in release mode.  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)    if (!SU->getNode()) {      dbgs() << "PHYS REG COPY\n"; @@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,    if (!N->getHasDebugValue())      return; -  // Opportunistically insert immediate dbg_value uses, i.e. those with source -  // order number right after the N. +  // Opportunistically insert immediate dbg_value uses, i.e. those with the same +  // source order number as N.    
MachineBasicBlock *BB = Emitter.getBlock();    MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();    ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N); @@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,      if (DVs[i]->isInvalidated())        continue;      unsigned DVOrder = DVs[i]->getOrder(); -    if (!Order || DVOrder == ++Order) { +    if (!Order || DVOrder == Order) {        MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);        if (DbgMI) {          Orders.push_back(std::make_pair(DVOrder, DbgMI)); @@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {        GluedNodes.push_back(N);      while (!GluedNodes.empty()) {        SDNode *N = GluedNodes.back(); -      Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned, -                       VRBaseMap); +      Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);        // Remember the source order of the inserted instruction.        if (HasDbg)          ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e225ba8703b7..003ea5030bfc 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -289,28 +289,28 @@ static int isSignedOp(ISD::CondCode Opcode) {  }  ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, -                                       bool isInteger) { -  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) +                                       bool IsInteger) { +  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)      // Cannot fold a signed integer setcc with an unsigned integer setcc.      return ISD::SETCC_INVALID;    unsigned Op = Op1 | Op2;  // Combine all of the condition bits. -  // If the N and U bits get set then the resultant comparison DOES suddenly -  // care about orderedness, and is true when ordered. +  // If the N and U bits get set, then the resultant comparison DOES suddenly +  // care about orderedness, and it is true when ordered.    if (Op > ISD::SETTRUE2)      Op &= ~16;     // Clear the U bit if the N bit is set.    // Canonicalize illegal integer setcc's. -  if (isInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT +  if (IsInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT      Op = ISD::SETNE;    return ISD::CondCode(Op);  }  ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, -                                        bool isInteger) { -  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) +                                        bool IsInteger) { +  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)      // Cannot fold a signed setcc with an unsigned setcc.      return ISD::SETCC_INVALID; @@ -318,7 +318,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,    ISD::CondCode Result = ISD::CondCode(Op1 & Op2);    // Canonicalize illegal integer setcc's. 
-  if (isInteger) { +  if (IsInteger) {      switch (Result) {      default: break;      case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT @@ -871,11 +871,13 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)    DbgInfo = new SDDbgInfo();  } -void SelectionDAG::init(MachineFunction &mf) { -  MF = &mf; +void SelectionDAG::init(MachineFunction &NewMF, +                        OptimizationRemarkEmitter &NewORE) { +  MF = &NewMF; +  ORE = &NewORE;    TLI = getSubtarget().getTargetLowering();    TSI = getSubtarget().getSelectionDAGInfo(); -  Context = &mf.getFunction()->getContext(); +  Context = &MF->getFunction()->getContext();  }  SelectionDAG::~SelectionDAG() { @@ -1994,8 +1996,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,  /// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows  /// us to only collect the known bits that are shared by the requested vector  /// elements. -/// TODO: We only support DemandedElts on a few opcodes so far, the remainder -/// should be added when they become necessary.  void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,                                      APInt &KnownOne, const APInt &DemandedElts,                                      unsigned Depth) const { @@ -2251,10 +2251,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,                                 KnownZero2.countLeadingOnes(),                                 BitWidth) - BitWidth; -    TrailZ = std::min(TrailZ, BitWidth); -    LeadZ = std::min(LeadZ, BitWidth); -    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | -                APInt::getHighBitsSet(BitWidth, LeadZ); +    KnownZero.clearAllBits(); +    KnownZero.setLowBits(std::min(TrailZ, BitWidth)); +    KnownZero.setHighBits(std::min(LeadZ, BitWidth));      break;    }    case ISD::UDIV: { @@ -2272,7 +2271,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,        LeadZ = std::min(BitWidth,                         LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); -    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); +    KnownZero.setHighBits(LeadZ);      break;    }    case ISD::SELECT: @@ -2297,10 +2296,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      KnownOne &= KnownOne2;      KnownZero &= KnownZero2;      break; -  case ISD::SADDO: -  case ISD::UADDO: -  case ISD::SSUBO: -  case ISD::USUBO:    case ISD::SMULO:    case ISD::UMULO:      if (Op.getResNo() != 1) @@ -2312,14 +2307,14 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==              TargetLowering::ZeroOrOneBooleanContent &&          BitWidth > 1) -      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); +      KnownZero.setBitsFrom(1);      break;    case ISD::SETCC:      // If we know the result of a setcc has the top bits zero, use this info.      if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==              TargetLowering::ZeroOrOneBooleanContent &&          BitWidth > 1) -      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); +      KnownZero.setBitsFrom(1);      break;    case ISD::SHL:      if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { @@ -2328,7 +2323,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,        KnownZero = KnownZero << *ShAmt;        KnownOne = KnownOne << *ShAmt;        // Low bits are known zero. 
-      KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue()); +      KnownZero.setLowBits(ShAmt->getZExtValue());      }      break;    case ISD::SRL: @@ -2338,8 +2333,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,        KnownZero = KnownZero.lshr(*ShAmt);        KnownOne  = KnownOne.lshr(*ShAmt);        // High bits are known zero. -      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); -      KnownZero |= HighBits; +      KnownZero.setHighBits(ShAmt->getZExtValue());      }      break;    case ISD::SRA: @@ -2350,13 +2344,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,        KnownOne  = KnownOne.lshr(*ShAmt);        // If we know the value of the sign bit, then we know it is copied across        // the high bits by the shift amount. -      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());        APInt SignBit = APInt::getSignBit(BitWidth);        SignBit = SignBit.lshr(*ShAmt);  // Adjust to where it is now in the mask.        if (KnownZero.intersects(SignBit)) { -        KnownZero |= HighBits;  // New bits are known zero. +        KnownZero.setHighBits(ShAmt->getZExtValue());// New bits are known zero.        } else if (KnownOne.intersects(SignBit)) { -        KnownOne  |= HighBits;  // New bits are known one. +        KnownOne.setHighBits(ShAmt->getZExtValue()); // New bits are known one.        }      }      break; @@ -2401,9 +2394,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,    case ISD::CTLZ:    case ISD::CTLZ_ZERO_UNDEF:    case ISD::CTPOP: { -    unsigned LowBits = Log2_32(BitWidth)+1; -    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); -    KnownOne.clearAllBits(); +    KnownZero.setBitsFrom(Log2_32(BitWidth)+1);      break;    }    case ISD::LOAD: { @@ -2412,26 +2403,39 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {        EVT VT = LD->getMemoryVT();        unsigned MemBits = VT.getScalarSizeInBits(); -      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); +      KnownZero.setBitsFrom(MemBits);      } else if (const MDNode *Ranges = LD->getRanges()) {        if (LD->getExtensionType() == ISD::NON_EXTLOAD)          computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne);      }      break;    } +  case ISD::ZERO_EXTEND_VECTOR_INREG: { +    EVT InVT = Op.getOperand(0).getValueType(); +    unsigned InBits = InVT.getScalarSizeInBits(); +    KnownZero = KnownZero.trunc(InBits); +    KnownOne = KnownOne.trunc(InBits); +    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, +                     DemandedElts.zext(InVT.getVectorNumElements()), +                     Depth + 1); +    KnownZero = KnownZero.zext(BitWidth); +    KnownOne = KnownOne.zext(BitWidth); +    KnownZero.setBitsFrom(InBits); +    break; +  }    case ISD::ZERO_EXTEND: {      EVT InVT = Op.getOperand(0).getValueType();      unsigned InBits = InVT.getScalarSizeInBits(); -    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);      KnownZero = KnownZero.trunc(InBits);      KnownOne = KnownOne.trunc(InBits);      computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,                       Depth + 1);      KnownZero = KnownZero.zext(BitWidth);      KnownOne = KnownOne.zext(BitWidth); -    KnownZero |= NewBits; +    KnownZero.setBitsFrom(InBits);      break;    } +  // TODO ISD::SIGN_EXTEND_VECTOR_INREG    case 
ISD::SIGN_EXTEND: {      EVT InVT = Op.getOperand(0).getValueType();      unsigned InBits = InVT.getScalarSizeInBits(); @@ -2478,10 +2482,21 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,    }    case ISD::FGETSIGN:      // All bits are zero except the low bit. -    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); +    KnownZero.setBitsFrom(1);      break; - -  case ISD::SUB: { +  case ISD::USUBO: +  case ISD::SSUBO: +    if (Op.getResNo() == 1) { +      // If we know the result of a setcc has the top bits zero, use this info. +      if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == +              TargetLowering::ZeroOrOneBooleanContent && +          BitWidth > 1) +        KnownZero.setBitsFrom(1); +      break; +    } +    LLVM_FALLTHROUGH; +  case ISD::SUB: +  case ISD::SUBC: {      if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {        // We know that the top bits of C-X are clear if X contains less bits        // than C (i.e. no wrap-around can happen).  For example, 20-X is @@ -2499,13 +2514,40 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,          if ((KnownZero2 & MaskV) == MaskV) {            unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();            // Top bits known zero. -          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); +          KnownZero.setHighBits(NLZ2);          }        }      } -    LLVM_FALLTHROUGH; + +    // If low bits are know to be zero in both operands, then we know they are +    // going to be 0 in the result. Both addition and complement operations +    // preserve the low zero bits. +    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); +    unsigned KnownZeroLow = KnownZero2.countTrailingOnes(); +    if (KnownZeroLow == 0) +      break; + +    computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); +    KnownZeroLow = std::min(KnownZeroLow, +                            KnownZero2.countTrailingOnes()); +    KnownZero.setBits(0, KnownZeroLow); +    break;    } +  case ISD::UADDO: +  case ISD::SADDO: +    if (Op.getResNo() == 1) { +      // If we know the result of a setcc has the top bits zero, use this info. +      if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == +              TargetLowering::ZeroOrOneBooleanContent && +          BitWidth > 1) +        KnownZero.setBitsFrom(1); +      break; +    } +    LLVM_FALLTHROUGH;    case ISD::ADD: +  case ISD::ADDC:    case ISD::ADDE: {      // Output known-0 bits are known if clear or set in both the low clear bits      // common to both LHS & RHS.  For example, 8+(X<<3) is known to have the @@ -2526,19 +2568,19 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      KnownZeroLow = std::min(KnownZeroLow,                              KnownZero2.countTrailingOnes()); -    if (Opcode == ISD::ADD) { -      KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow); -      if (KnownZeroHigh > 1) -        KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1); +    if (Opcode == ISD::ADDE) { +      // With ADDE, a carry bit may be added in, so we can only use this +      // information if we know (at least) that the low two bits are clear. +      // We then return to the caller that the low bit is unknown but that +      // other bits are known zero. 
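A standalone illustration of the known-bits rule this hunk restructures (the constants and helper below are made up, not patch code): when both addends have their low k bits clear, a plain ADD keeps those k bits clear, while ADDE's possible carry-in leaves only bits 1..k-1 provably clear:

#include <cassert>
#include <cstdint>

// Counts trailing zero bits of a 32-bit value.
static unsigned trailingZeros(uint32_t V) {
  unsigned N = 0;
  while (N < 32 && !((V >> N) & 1))
    ++N;
  return N;
}

int main() {
  uint32_t A = 0xABCD0000, B = 0x12340000;      // low 16 bits clear in both
  assert(trailingZeros(A + B) >= 16);           // ADD: low 16 bits stay clear
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) // ADDE: carry-in is unknown
    assert(((A + B + Carry) & 0xFFFEu) == 0);   // bits 1..15 still provably clear
  return 0;
}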
+      if (KnownZeroLow >= 2) +        KnownZero.setBits(1, KnownZeroLow);        break;      } -    // With ADDE, a carry bit may be added in, so we can only use this -    // information if we know (at least) that the low two bits are clear.  We -    // then return to the caller that the low bit is unknown but that other bits -    // are known zero. -    if (KnownZeroLow >= 2) // ADDE -      KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow); +    KnownZero.setLowBits(KnownZeroLow); +    if (KnownZeroHigh > 1) +      KnownZero.setHighBits(KnownZeroHigh - 1);      break;    }    case ISD::SREM: @@ -2591,7 +2633,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),                                  KnownZero2.countLeadingOnes());      KnownOne.clearAllBits(); -    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); +    KnownZero.clearAllBits(); +    KnownZero.setHighBits(Leaders);      break;    }    case ISD::EXTRACT_ELEMENT: { @@ -2673,6 +2716,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      }      break;    } +  case ISD::BITREVERSE: { +    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); +    KnownZero = KnownZero2.reverseBits(); +    KnownOne = KnownOne2.reverseBits(); +    break; +  }    case ISD::BSWAP: {      computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,                       Depth + 1); @@ -2680,12 +2730,62 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,      KnownOne = KnownOne2.byteSwap();      break;    } -  case ISD::SMIN: -  case ISD::SMAX: -  case ISD::UMIN: +  case ISD::ABS: { +    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); + +    // If the source's MSB is zero then we know the rest of the bits already. +    if (KnownZero2[BitWidth - 1]) { +      KnownZero = KnownZero2; +      KnownOne = KnownOne2; +      break; +    } + +    // We only know that the absolute values's MSB will be zero iff there is +    // a set bit that isn't the sign bit (otherwise it could be INT_MIN). +    KnownOne2.clearBit(BitWidth - 1); +    if (KnownOne2.getBoolValue()) { +      KnownZero = APInt::getSignBit(BitWidth); +      break; +    } +    break; +  } +  case ISD::UMIN: { +    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, +                     Depth + 1); +    computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); + +    // UMIN - we know that the result will have the maximum of the +    // known zero leading bits of the inputs. +    unsigned LeadZero = KnownZero.countLeadingOnes(); +    LeadZero = std::max(LeadZero, KnownZero2.countLeadingOnes()); + +    KnownZero &= KnownZero2; +    KnownOne &= KnownOne2; +    KnownZero.setHighBits(LeadZero); +    break; +  }    case ISD::UMAX: {      computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,                       Depth + 1); +    computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, +                     Depth + 1); + +    // UMAX - we know that the result will have the maximum of the +    // known one leading bits of the inputs. 
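Similarly for UMAX (an illustrative sketch with assumed concrete values, not patch code): the unsigned maximum is never smaller than either operand, so it retains at least the larger of the operands' leading-one counts:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Counts leading one bits of an 8-bit value.
static unsigned leadingOnes(uint8_t V) {
  unsigned N = 0;
  while (N < 8 && ((V >> (7 - N)) & 1))
    ++N;
  return N;
}

int main() {
  uint8_t A = 0xE5, B = 0x9F; // 0b11100101 (3 leading ones), 0b10011111 (1)
  uint8_t M = std::max(A, B);
  assert(leadingOnes(M) >= std::max(leadingOnes(A), leadingOnes(B)));
  return 0;
}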
+    unsigned LeadOne = KnownOne.countLeadingOnes(); +    LeadOne = std::max(LeadOne, KnownOne2.countLeadingOnes()); + +    KnownZero &= KnownZero2; +    KnownOne &= KnownOne2; +    KnownOne.setHighBits(LeadOne); +    break; +  } +  case ISD::SMIN: +  case ISD::SMAX: { +    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, +                     Depth + 1);      // If we don't know any bits, early out.      if (!KnownOne && !KnownZero)        break; @@ -2699,7 +2799,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,    case ISD::TargetFrameIndex:      if (unsigned Align = InferPtrAlignment(Op)) {        // The low bits are known zero if the pointer is aligned. -      KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); +      KnownZero.setLowBits(Log2_32(Align));        break;      }      break; @@ -2712,13 +2812,48 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,    case ISD::INTRINSIC_W_CHAIN:    case ISD::INTRINSIC_VOID:      // Allow the target to implement this method for its nodes. -    TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); +    TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, DemandedElts, +                                       *this, Depth);      break;    }    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");  } +SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, +                                                             SDValue N1) const { +  // X + 0 never overflow +  if (isNullConstant(N1)) +    return OFK_Never; + +  APInt N1Zero, N1One; +  computeKnownBits(N1, N1Zero, N1One); +  if (N1Zero.getBoolValue()) { +    APInt N0Zero, N0One; +    computeKnownBits(N0, N0Zero, N0One); + +    bool overflow; +    (~N0Zero).uadd_ov(~N1Zero, overflow); +    if (!overflow) +      return OFK_Never; +  } + +  // mulhi + 1 never overflow +  if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && +      (~N1Zero & 0x01) == ~N1Zero) +    return OFK_Never; + +  if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { +    APInt N0Zero, N0One; +    computeKnownBits(N0, N0Zero, N0One); + +    if ((~N0Zero & 0x01) == ~N0Zero) +      return OFK_Never; +  } + +  return OFK_Sometime; +} +  bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {    EVT OpVT = Val.getValueType();    unsigned BitWidth = OpVT.getScalarSizeInBits(); @@ -2745,7 +2880,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {    // Are all operands of a build vector constant powers of two?    if (Val.getOpcode() == ISD::BUILD_VECTOR) -    if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) { +    if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) {            if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E))              return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();            return false; @@ -2764,6 +2899,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {  unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {    EVT VT = Op.getValueType(); +  APInt DemandedElts = VT.isVector() +                           ? 
APInt::getAllOnesValue(VT.getVectorNumElements()) +                           : APInt(1, 1); +  return ComputeNumSignBits(Op, DemandedElts, Depth); +} + +unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, +                                          unsigned Depth) const { +  EVT VT = Op.getValueType();    assert(VT.isInteger() && "Invalid VT!");    unsigned VTBits = VT.getScalarSizeInBits();    unsigned Tmp, Tmp2; @@ -2772,6 +2916,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {    if (Depth == 6)      return 1;  // Limit search depth. +  if (!DemandedElts) +    return 1;  // No demanded elts, better to assume we don't know anything. +    switch (Op.getOpcode()) {    default: break;    case ISD::AssertSext: @@ -2786,7 +2933,28 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {      return Val.getNumSignBits();    } +  case ISD::BUILD_VECTOR: +    Tmp = VTBits; +    for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) { +      if (!DemandedElts[i]) +        continue; + +      SDValue SrcOp = Op.getOperand(i); +      Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1); + +      // BUILD_VECTOR can implicitly truncate sources, we must handle this. +      if (SrcOp.getValueSizeInBits() != VTBits) { +        assert(SrcOp.getValueSizeInBits() > VTBits && +               "Expected BUILD_VECTOR implicit truncation"); +        unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; +        Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); +      } +      Tmp = std::min(Tmp, Tmp2); +    } +    return Tmp; +    case ISD::SIGN_EXTEND: +  case ISD::SIGN_EXTEND_VECTOR_INREG:      Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();      return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; @@ -2799,7 +2967,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {      return std::max(Tmp, Tmp2);    case ISD::SRA: -    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); +    Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);      // SRA X, C   -> adds C sign bits.      if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {        APInt ShiftVal = C->getAPIntValue(); @@ -2887,6 +3055,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {      }      break;    case ISD::ADD: +  case ISD::ADDC:      // Add can have at most one carry bit.  Thus we know that the output      // is, at worst, one more bit than the inputs.      Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); @@ -2961,19 +3130,63 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {      // result. Otherwise it gives either negative or > bitwidth result      return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);    } +  case ISD::INSERT_VECTOR_ELT: { +    SDValue InVec = Op.getOperand(0); +    SDValue InVal = Op.getOperand(1); +    SDValue EltNo = Op.getOperand(2); +    unsigned NumElts = InVec.getValueType().getVectorNumElements(); + +    ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo); +    if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { +      // If we know the element index, split the demand between the +      // source vector and the inserted element. +      unsigned EltIdx = CEltNo->getZExtValue(); + +      // If we demand the inserted element then get its sign bits. 
+      Tmp = UINT_MAX; +      if (DemandedElts[EltIdx]) +        Tmp = ComputeNumSignBits(InVal, Depth + 1); + +      // If we demand the source vector then get its sign bits, and determine +      // the minimum. +      APInt VectorElts = DemandedElts; +      VectorElts.clearBit(EltIdx); +      if (!!VectorElts) { +        Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1); +        Tmp = std::min(Tmp, Tmp2); +      } +    } else { +      // Unknown element index, so ignore DemandedElts and demand them all. +      Tmp = ComputeNumSignBits(InVec, Depth + 1); +      Tmp2 = ComputeNumSignBits(InVal, Depth + 1); +      Tmp = std::min(Tmp, Tmp2); +    } +    assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); +    return Tmp; +  }    case ISD::EXTRACT_VECTOR_ELT: { -    // At the moment we keep this simple and skip tracking the specific -    // element. This way we get the lowest common denominator for all elements -    // of the vector. -    // TODO: get information for given vector element +    SDValue InVec = Op.getOperand(0); +    SDValue EltNo = Op.getOperand(1); +    EVT VecVT = InVec.getValueType();      const unsigned BitWidth = Op.getValueSizeInBits();      const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); +    const unsigned NumSrcElts = VecVT.getVectorNumElements(); +      // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know      // anything about sign bits. But if the sizes match we can derive knowledge      // about sign bits from the vector operand. -    if (BitWidth == EltBitWidth) -      return ComputeNumSignBits(Op.getOperand(0), Depth+1); -    break; +    if (BitWidth != EltBitWidth) +      break; + +    // If we know the element index, just demand that vector element, else for +    // an unknown element index, ignore DemandedElts and demand them all. +    APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); +    ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); +    if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) +      DemandedSrcElts = +          APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + +    return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);    }    case ISD::EXTRACT_SUBVECTOR:      return ComputeNumSignBits(Op.getOperand(0), Depth + 1); @@ -3008,14 +3221,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {        Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||        Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||        Op.getOpcode() == ISD::INTRINSIC_VOID) { -    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); -    if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); +    unsigned NumBits = +        TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth); +    if (NumBits > 1) +      FirstAnswer = std::max(FirstAnswer, NumBits);    }    // Finally, if we can prove that the top bits of the result are 0's or 1's,    // use this information.    
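As a scalar analogue of that fallback (illustrative only; the helper below is not the DAG code), ComputeNumSignBits reports how many copies of the sign bit occupy the top of the value, which is exactly what known-zero or known-one top bits imply:

#include <cassert>
#include <cstdint>

// Counts how many leading bits of an int8_t equal its sign bit.
static unsigned numSignBits(int8_t V) {
  uint8_t U = static_cast<uint8_t>(V);
  unsigned Sign = (U >> 7) & 1, N = 0;
  while (N < 8 && ((U >> (7 - N)) & 1) == Sign)
    ++N;
  return N;
}

int main() {
  assert(numSignBits(-3) == 7);   // 0b11111101
  assert(numSignBits(1) == 7);    // 0b00000001
  assert(numSignBits(-1) == 8);   // 0b11111111
  assert(numSignBits(-128) == 1); // 0b10000000
  return 0;
}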
APInt KnownZero, KnownOne; -  computeKnownBits(Op, KnownZero, KnownOne, Depth); +  computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);    APInt Mask;    if (KnownZero.isNegative()) {        // sign bit is 0 @@ -3054,6 +3269,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {    if (getTarget().Options.NoNaNsFPMath)      return true; +  if (const BinaryWithFlagsSDNode *BF = dyn_cast<BinaryWithFlagsSDNode>(Op)) +    return BF->Flags.hasNoNaNs(); +    // If the value is a constant, we can obviously see if it is a NaN or not.    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))      return !C->getValueAPF().isNaN(); @@ -3206,6 +3424,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,        if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)          return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);        break; +    case ISD::ABS: +      return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(), +                         C->isOpaque()); +    case ISD::BITREVERSE: +      return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(), +                         C->isOpaque());      case ISD::BSWAP:        return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),                           C->isOpaque()); @@ -3220,6 +3444,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,      case ISD::CTTZ_ZERO_UNDEF:        return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),                           C->isOpaque()); +    case ISD::FP16_TO_FP: { +      bool Ignored; +      APFloat FPV(APFloat::IEEEhalf(), +                  (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); + +      // This can return overflow, underflow, or inexact; we don't care. +      // FIXME need to be more flexible about rounding mode. +      (void)FPV.convert(EVTToAPFloatSemantics(VT), +                        APFloat::rmNearestTiesToEven, &Ignored); +      return getConstantFP(FPV, DL, VT); +    }      }    } @@ -3261,17 +3496,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,      }      case ISD::FP_TO_SINT:      case ISD::FP_TO_UINT: { -      integerPart x[2];        bool ignored; -      static_assert(integerPartWidth >= 64, "APFloat parts too small!"); +      APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);        // FIXME need to be more flexible about rounding mode. -      APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), -                            Opcode==ISD::FP_TO_SINT, -                            APFloat::rmTowardZero, &ignored); -      if (s==APFloat::opInvalidOp)     // inexact is OK, in fact usual +      APFloat::opStatus s = +          V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored); +      if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual          break; -      APInt api(VT.getSizeInBits(), x); -      return getConstant(api, DL, VT); +      return getConstant(IntVal, DL, VT);      }      case ISD::BITCAST:        if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) @@ -3281,6 +3513,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,        else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)          return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);        break; +    case ISD::FP_TO_FP16: { +      bool Ignored; +      // This can return overflow, underflow, or inexact; we don't care. +      // FIXME need to be more flexible about rounding mode. 
+      (void)V.convert(APFloat::IEEEhalf(), +                      APFloat::rmNearestTiesToEven, &Ignored); +      return getConstant(V.bitcastToAPInt(), DL, VT); +    }      }    } @@ -3303,6 +3543,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,        case ISD::TRUNCATE:        case ISD::UINT_TO_FP:        case ISD::SINT_TO_FP: +      case ISD::ABS: +      case ISD::BITREVERSE:        case ISD::BSWAP:        case ISD::CTLZ:        case ISD::CTLZ_ZERO_UNDEF: @@ -3420,6 +3662,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,      if (OpOpcode == ISD::UNDEF)        return getUNDEF(VT);      break; +  case ISD::ABS: +    assert(VT.isInteger() && VT == Operand.getValueType() && +           "Invalid ABS!"); +    if (OpOpcode == ISD::UNDEF) +      return getUNDEF(VT); +    break;    case ISD::BSWAP:      assert(VT.isInteger() && VT == Operand.getValueType() &&             "Invalid BSWAP!"); @@ -3569,6 +3817,30 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,                            GA->getOffset() + uint64_t(Offset));  } +bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { +  switch (Opcode) { +  case ISD::SDIV: +  case ISD::UDIV: +  case ISD::SREM: +  case ISD::UREM: { +    // If a divisor is zero/undef or any element of a divisor vector is +    // zero/undef, the whole op is undef. +    assert(Ops.size() == 2 && "Div/rem should have 2 operands"); +    SDValue Divisor = Ops[1]; +    if (Divisor.isUndef() || isNullConstant(Divisor)) +      return true; + +    return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && +           any_of(Divisor->op_values(), +                  [](SDValue V) { return V.isUndef() || isNullConstant(V); }); +    // TODO: Handle signed overflow. +  } +  // TODO: Handle oversized shifts. +  default: +    return false; +  } +} +  SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,                                               EVT VT, SDNode *Cst1,                                               SDNode *Cst2) { @@ -3578,6 +3850,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,    if (Opcode >= ISD::BUILTIN_OP_END)      return SDValue(); +  if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)})) +    return getUNDEF(VT); +    // Handle the case of two scalars.    if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {      if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) { @@ -3645,6 +3920,9 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,    if (Opcode >= ISD::BUILTIN_OP_END)      return SDValue(); +  if (isUndef(Opcode, Ops)) +    return getUNDEF(VT); +    // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?    if (!VT.isVector())      return SDValue(); @@ -3676,7 +3954,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,    // Find legal integer scalar type for constant promotion and    // ensure that its scalar size is at least as large as source.    
EVT LegalSVT = VT.getScalarType(); -  if (LegalSVT.isInteger()) { +  if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {      LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);      if (LegalSVT.bitsLT(VT.getScalarType()))        return SDValue(); @@ -3910,35 +4188,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,      assert(EVT.bitsLE(VT) && "Not extending!");      if (EVT == VT) return N1;  // Not actually extending -    auto SignExtendInReg = [&](APInt Val) { +    auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {        unsigned FromBits = EVT.getScalarSizeInBits();        Val <<= Val.getBitWidth() - FromBits;        Val = Val.ashr(Val.getBitWidth() - FromBits); -      return getConstant(Val, DL, VT.getScalarType()); +      return getConstant(Val, DL, ConstantVT);      };      if (N1C) {        const APInt &Val = N1C->getAPIntValue(); -      return SignExtendInReg(Val); +      return SignExtendInReg(Val, VT);      }      if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {        SmallVector<SDValue, 8> Ops; +      llvm::EVT OpVT = N1.getOperand(0).getValueType();        for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {          SDValue Op = N1.getOperand(i);          if (Op.isUndef()) { -          Ops.push_back(getUNDEF(VT.getScalarType())); +          Ops.push_back(getUNDEF(OpVT));            continue;          } -        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { -          APInt Val = C->getAPIntValue(); -          Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); -          Ops.push_back(SignExtendInReg(Val)); -          continue; -        } -        break; +        ConstantSDNode *C = cast<ConstantSDNode>(Op); +        APInt Val = C->getAPIntValue(); +        Ops.push_back(SignExtendInReg(Val, OpVT));        } -      if (Ops.size() == VT.getVectorNumElements()) -        return getBuildVector(VT, DL, Ops); +      return getBuildVector(VT, DL, Ops);      }      break;    } @@ -4040,6 +4314,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,        if (VT.getSimpleVT() == N1.getSimpleValueType())          return N1; +      // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF. +      if (N1.isUndef()) +        return getUNDEF(VT); + +      // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of +      // the concat have the same type as the extract. +      if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && +          N1.getNumOperands() > 0 && +          VT == N1.getOperand(0).getValueType()) { +        unsigned Factor = VT.getVectorNumElements(); +        return N1.getOperand(N2C->getZExtValue() / Factor); +      } +        // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created        // during shuffle legalization.        
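// [Illustrative sketch, not part of the patch] The SignExtendInReg lambda folds
// SIGN_EXTEND_INREG on constants by shifting the value up so the source sign bit
// becomes the MSB, then arithmetic-shifting it back down. A 64-bit C++ analogue
// (hypothetical names, no LLVM APIs; assumes the usual arithmetic right shift):
#include <cassert>
#include <cstdint>

// Sign-extend the low FromBits bits of Val into a full 64-bit value.
static int64_t signExtendInReg(uint64_t Val, unsigned FromBits) {
  unsigned Shift = 64 - FromBits;
  return static_cast<int64_t>(Val << Shift) >> Shift; // ashr on the way down
}

int main() {
  assert(signExtendInReg(0x00FF, 8) == -1);             // 0xFF as i8 is -1
  assert(signExtendInReg(0x007F, 8) == 127);            // sign bit clear: unchanged
  assert(signExtendInReg(0xFFFF8000ULL, 16) == -32768); // only the low 16 bits matter
  return 0;
}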
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) && @@ -4943,11 +5230,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,    TargetLowering::CallLoweringInfo CLI(*this);    CLI.setDebugLoc(dl)        .setChain(Chain) -      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), -                 Dst.getValueType().getTypeForEVT(*getContext()), -                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), -                                   TLI->getPointerTy(getDataLayout())), -                 std::move(Args)) +      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), +                    Dst.getValueType().getTypeForEVT(*getContext()), +                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), +                                      TLI->getPointerTy(getDataLayout())), +                    std::move(Args))        .setDiscardResult()        .setTailCall(isTailCall); @@ -5004,11 +5291,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,    TargetLowering::CallLoweringInfo CLI(*this);    CLI.setDebugLoc(dl)        .setChain(Chain) -      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), -                 Dst.getValueType().getTypeForEVT(*getContext()), -                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), -                                   TLI->getPointerTy(getDataLayout())), -                 std::move(Args)) +      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), +                    Dst.getValueType().getTypeForEVT(*getContext()), +                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), +                                      TLI->getPointerTy(getDataLayout())), +                    std::move(Args))        .setDiscardResult()        .setTailCall(isTailCall); @@ -5066,11 +5353,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,    TargetLowering::CallLoweringInfo CLI(*this);    CLI.setDebugLoc(dl)        .setChain(Chain) -      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), -                 Dst.getValueType().getTypeForEVT(*getContext()), -                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), -                                   TLI->getPointerTy(getDataLayout())), -                 std::move(Args)) +      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), +                    Dst.getValueType().getTypeForEVT(*getContext()), +                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), +                                      TLI->getPointerTy(getDataLayout())), +                    std::move(Args))        .setDiscardResult()        .setTailCall(isTailCall); @@ -7049,6 +7336,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {    return Seen;  } +/// Return true if the only users of N are contained in Nodes. +bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) { +  bool Seen = false; +  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { +    SDNode *User = *I; +    if (llvm::any_of(Nodes, +                     [&User](const SDNode *Node) { return User == Node; })) +      Seen = true; +    else +      return false; +  } + +  return Seen; +} +  /// isOperand - Return true if this node is an operand of N.  ///  bool SDValue::isOperandOf(const SDNode *N) const { @@ -7070,21 +7372,39 @@ bool SDNode::isOperandOf(const SDNode *N) const {  /// side-effecting instructions on any chain path.  
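// [Illustrative sketch, not part of the patch] SDNode::areOnlyUsersOf answers:
// does N have at least one user, and is every user contained in Nodes? A plain
// C++ analogue over integer IDs (hypothetical names, no LLVM APIs):
#include <algorithm>
#include <cassert>
#include <vector>

static bool areOnlyUsersOf(const std::vector<int> &Nodes,
                           const std::vector<int> &UsersOfN) {
  bool Seen = false;
  for (int User : UsersOfN) {
    if (std::find(Nodes.begin(), Nodes.end(), User) != Nodes.end())
      Seen = true;  // this user is one of the allowed nodes
    else
      return false; // a user outside Nodes: fail immediately
  }
  return Seen;      // false if N had no users at all
}

int main() {
  assert(areOnlyUsersOf({1, 2}, {1, 2, 2})); // every user is allowed
  assert(!areOnlyUsersOf({1, 2}, {1, 3}));   // 3 is an outside user
  assert(!areOnlyUsersOf({1, 2}, {}));       // no users at all
  return 0;
}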
In practice, this looks  /// through token factors and non-volatile loads.  In order to remain efficient,  /// this only looks a couple of nodes in, it does not do an exhaustive search. +/// +/// Note that we only need to examine chains when we're searching for +/// side-effects; SelectionDAG requires that all side-effects are represented +/// by chains, even if another operand would force a specific ordering. This +/// constraint is necessary to allow transformations like splitting loads.  bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, -                                               unsigned Depth) const { +                                             unsigned Depth) const {    if (*this == Dest) return true;    // Don't search too deeply, we just want to be able to see through    // TokenFactor's etc.    if (Depth == 0) return false; -  // If this is a token factor, all inputs to the TF happen in parallel.  If any -  // of the operands of the TF does not reach dest, then we cannot do the xform. +  // If this is a token factor, all inputs to the TF happen in parallel.    if (getOpcode() == ISD::TokenFactor) { -    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) -      if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) -        return false; -    return true; +    // First, try a shallow search. +    if (is_contained((*this)->ops(), Dest)) { +      // We found the chain we want as an operand of this TokenFactor. +      // Essentially, we reach the chain without side-effects if we could +      // serialize the TokenFactor into a simple chain of operations with +      // Dest as the last operation. This is automatically true if the +      // chain has one use: there are no other ordering constraints. +      // If the chain has more than one use, we give up: some other +      // use of Dest might force a side-effect between Dest and the current +      // node. +      if (Dest.hasOneUse()) +        return true; +    } +    // Next, try a deep search: check whether every operand of the TokenFactor +    // reaches Dest. +    return all_of((*this)->ops(), [=](SDValue Op) { +      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); +    });    }    // Loads don't have side effects, look through them. @@ -7102,11 +7422,6 @@ bool SDNode::hasPredecessor(const SDNode *N) const {    return hasPredecessorHelper(N, Visited, Worklist);  } -uint64_t SDNode::getConstantOperandVal(unsigned Num) const { -  assert(Num < NumOperands && "Invalid child # of SDNode!"); -  return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); -} -  const SDNodeFlags *SDNode::getFlags() const {    if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))      return &FlagsNode->Flags; @@ -7377,13 +7692,13 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,      unsigned BitPos = j * EltBitSize;      if (OpVal.isUndef()) -      SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); +      SplatUndef.setBits(BitPos, BitPos + EltBitSize);      else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) -      SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). 
-                    zextOrTrunc(sz) << BitPos; +      SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltBitSize), +                            BitPos);      else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) -      SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos; -     else +      SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); +    else        return false;    } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 996c95bd5f07..8708f58f1e63 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -84,10 +84,6 @@ LimitFPPrecision("limit-float-precision",                   cl::location(LimitFloatPrecision),                   cl::init(0)); -static cl::opt<bool> -EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden, -                cl::desc("Enable fast-math-flags for DAG nodes")); -  /// Minimum jump table density for normal functions.  static cl::opt<unsigned>  JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden, @@ -634,10 +630,6 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,    }  } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value.  This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used.  SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,                                        FunctionLoweringInfo &FuncInfo,                                        const SDLoc &dl, SDValue &Chain, @@ -739,10 +731,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);  } -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object.  This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used.  void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,                                   const SDLoc &dl, SDValue &Chain, SDValue *Flag,                                   const Value *V, @@ -796,9 +784,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);  } -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list.  This adds the code marker and includes the number of -/// values added into it.  void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,                                          unsigned MatchingIdx, const SDLoc &dl,                                          SelectionDAG &DAG, @@ -850,12 +835,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,    LPadToCallSiteMap.clear();  } -/// clear - Clear out the current SelectionDAG and the associated -/// state and prepare this SelectionDAGBuilder object to be used -/// for a new block. This doesn't clear out information about -/// additional blocks that are needed to complete switch lowering -/// or PHI node updating; that information is cleared out as it is -/// consumed.  
void SelectionDAGBuilder::clear() {    NodeMap.clear();    UnusedArgNodeMap.clear(); @@ -867,21 +846,10 @@ void SelectionDAGBuilder::clear() {    StatepointLowering.clear();  } -/// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is separated from the clear so that debug -/// information that is dangling in a basic block can be properly -/// resolved in a different basic block. This allows the -/// SelectionDAG to resolve dangling debug information attached -/// to PHI nodes.  void SelectionDAGBuilder::clearDanglingDebugInfo() {    DanglingDebugInfoMap.clear();  } -/// getRoot - Return the current virtual root of the Selection DAG, -/// flushing any PendingLoad items. This must be done before emitting -/// a store or any other node that may need to be ordered after any -/// prior load instructions. -///  SDValue SelectionDAGBuilder::getRoot() {    if (PendingLoads.empty())      return DAG.getRoot(); @@ -901,10 +869,6 @@ SDValue SelectionDAGBuilder::getRoot() {    return Root;  } -/// getControlRoot - Similar to getRoot, but instead of flushing all the -/// PendingLoad items, flush all the PendingExports items. It is necessary -/// to do this before emitting a terminator instruction. -///  SDValue SelectionDAGBuilder::getControlRoot() {    SDValue Root = DAG.getRoot(); @@ -937,7 +901,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) {      HandlePHINodesInSuccessorBlocks(I.getParent());    } -  ++SDNodeOrder; +  // Increase the SDNodeOrder if dealing with a non-debug instruction. +  if (!isa<DbgInfoIntrinsic>(I)) +    ++SDNodeOrder;    CurInst = &I; @@ -1403,16 +1369,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {        const Function *F = I.getParent()->getParent();        ISD::NodeType ExtendKind = ISD::ANY_EXTEND; -      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, +      if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,                                            Attribute::SExt))          ExtendKind = ISD::SIGN_EXTEND; -      else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, +      else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,                                                 Attribute::ZExt))          ExtendKind = ISD::ZERO_EXTEND;        LLVMContext &Context = F->getContext(); -      bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, -                                                      Attribute::InReg); +      bool RetInReg = F->getAttributes().hasAttribute( +          AttributeList::ReturnIndex, Attribute::InReg);        for (unsigned j = 0; j != NumValues; ++j) {          EVT VT = ValueVTs[j]; @@ -1582,7 +1548,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,                                                    MachineBasicBlock *CurBB,                                                    MachineBasicBlock *SwitchBB,                                                    BranchProbability TProb, -                                                  BranchProbability FProb) { +                                                  BranchProbability FProb, +                                                  bool InvertCond) {    const BasicBlock *BB = CurBB->getBasicBlock();    // If the leaf of the tree is a comparison, merge the condition into @@ -1596,10 +1563,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,           isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {        
ISD::CondCode Condition;        if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { -        Condition = getICmpCondCode(IC->getPredicate()); +        ICmpInst::Predicate Pred = +            InvertCond ? IC->getInversePredicate() : IC->getPredicate(); +        Condition = getICmpCondCode(Pred);        } else {          const FCmpInst *FC = cast<FCmpInst>(Cond); -        Condition = getFCmpCondCode(FC->getPredicate()); +        FCmpInst::Predicate Pred = +            InvertCond ? FC->getInversePredicate() : FC->getPredicate(); +        Condition = getFCmpCondCode(Pred);          if (TM.Options.NoNaNsFPMath)            Condition = getFCmpCodeWithoutNaN(Condition);        } @@ -1612,7 +1583,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,    }    // Create a CaseBlock record representing this branch. -  CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), +  ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ; +  CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),                 nullptr, TBB, FBB, CurBB, TProb, FProb);    SwitchCases.push_back(CB);  } @@ -1625,16 +1597,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,                                                 MachineBasicBlock *SwitchBB,                                                 Instruction::BinaryOps Opc,                                                 BranchProbability TProb, -                                               BranchProbability FProb) { -  // If this node is not part of the or/and tree, emit it as a branch. +                                               BranchProbability FProb, +                                               bool InvertCond) { +  // Skip over not part of the tree and remember to invert op and operands at +  // next level. +  if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) { +    const Value *CondOp = BinaryOperator::getNotArgument(Cond); +    if (InBlock(CondOp, CurBB->getBasicBlock())) { +      FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, +                           !InvertCond); +      return; +    } +  } +    const Instruction *BOp = dyn_cast<Instruction>(Cond); +  // Compute the effective opcode for Cond, taking into account whether it needs +  // to be inverted, e.g. +  //   and (not (or A, B)), C +  // gets lowered as +  //   and (and (not A, not B), C) +  unsigned BOpc = 0; +  if (BOp) { +    BOpc = BOp->getOpcode(); +    if (InvertCond) { +      if (BOpc == Instruction::And) +        BOpc = Instruction::Or; +      else if (BOpc == Instruction::Or) +        BOpc = Instruction::And; +    } +  } + +  // If this node is not part of the or/and tree, emit it as a branch.    if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || -      (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() || +      BOpc != Opc || !BOp->hasOneUse() ||        BOp->getParent() != CurBB->getBasicBlock() ||        !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||        !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {      EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, -                                 TProb, FProb); +                                 TProb, FProb, InvertCond);      return;    } @@ -1669,14 +1669,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,      auto NewFalseProb = TProb / 2 + FProb;      // Emit the LHS condition.      
FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, -                         NewTrueProb, NewFalseProb); +                         NewTrueProb, NewFalseProb, InvertCond);      // Normalize A/2 and B to get A/(1+B) and 2B/(1+B).      SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};      BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());      // Emit the RHS condition into TmpBB.      FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, -                         Probs[0], Probs[1]); +                         Probs[0], Probs[1], InvertCond);    } else {      assert(Opc == Instruction::And && "Unknown merge op!");      // Codegen X & Y as: @@ -1702,14 +1702,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,      auto NewFalseProb = FProb / 2;      // Emit the LHS condition.      FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, -                         NewTrueProb, NewFalseProb); +                         NewTrueProb, NewFalseProb, InvertCond);      // Normalize A and B/2 to get 2A/(1+A) and B/(1+A).      SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};      BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());      // Emit the RHS condition into TmpBB.      FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, -                         Probs[0], Probs[1]); +                         Probs[0], Probs[1], InvertCond);    }  } @@ -1793,7 +1793,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {        FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,                             Opcode,                             getEdgeProbability(BrMBB, Succ0MBB), -                           getEdgeProbability(BrMBB, Succ1MBB)); +                           getEdgeProbability(BrMBB, Succ1MBB), +                           /*InvertCond=*/false);        // If the compares in later blocks need to use values not currently        // exported from this block, export them now.  This block should always        // be the first entry. @@ -2027,7 +2028,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,      Entry.Node = StackSlot;      Entry.Ty = FnTy->getParamType(0);      if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) -      Entry.isInReg = true; +      Entry.IsInReg = true;      Args.push_back(Entry);      TargetLowering::CallLoweringInfo CLI(DAG); @@ -2581,13 +2582,13 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {    Flags.setNoSignedWrap(nsw);    Flags.setNoUnsignedWrap(nuw);    Flags.setVectorReduction(vec_redux); -  if (EnableFMFInDAG) { -    Flags.setAllowReciprocal(FMF.allowReciprocal()); -    Flags.setNoInfs(FMF.noInfs()); -    Flags.setNoNaNs(FMF.noNaNs()); -    Flags.setNoSignedZeros(FMF.noSignedZeros()); -    Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); -  } +  Flags.setAllowReciprocal(FMF.allowReciprocal()); +  Flags.setAllowContract(FMF.allowContract()); +  Flags.setNoInfs(FMF.noInfs()); +  Flags.setNoNaNs(FMF.noNaNs()); +  Flags.setNoSignedZeros(FMF.noSignedZeros()); +  Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); +    SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),                                       Op1, Op2, &Flags);    setValue(&I, BinNodeValue); @@ -2914,7 +2915,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {                               DestVT, N)); // convert types.    
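// [Illustrative sketch, not part of the patch] The InvertCond flag threaded
// through FindMergedConditions relies on De Morgan's laws: a NOT over an and/or
// tree is absorbed by swapping the opcode and inverting the leaf predicates,
// instead of emitting an explicit xor-with-true. A quick exhaustive check in
// plain C++ (no LLVM APIs):
#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      assert((!(A || B)) == (!A && !B)); // not(or)  == and(not, not)
      assert((!(A && B)) == (!A || !B)); // not(and) == or(not, not)
    }
  // Leaf compares invert their predicate instead of materializing a NOT:
  for (int X = -2; X <= 2; ++X)
    for (int Y = -2; Y <= 2; ++Y)
      assert((!(X < Y)) == (X >= Y));
  return 0;
}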
// Check if the original LLVM IR Operand was a ConstantInt, because getValue()    // might fold any kind of constant expression to an integer constant and that -  // is not what we are looking for. Only regcognize a bitcast of a genuine +  // is not what we are looking for. Only recognize a bitcast of a genuine    // constant integer as an opaque constant.    else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))      setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false, @@ -3067,14 +3068,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {    if (SrcNumElts > MaskNumElts) {      // Analyze the access pattern of the vector to see if we can extract -    // two subvectors and do the shuffle. The analysis is done by calculating -    // the range of elements the mask access on both vectors. -    int MinRange[2] = { static_cast<int>(SrcNumElts), -                        static_cast<int>(SrcNumElts)}; -    int MaxRange[2] = {-1, -1}; - -    for (unsigned i = 0; i != MaskNumElts; ++i) { -      int Idx = Mask[i]; +    // two subvectors and do the shuffle. +    int StartIdx[2] = { -1, -1 };  // StartIdx to extract from +    bool CanExtract = true; +    for (int Idx : Mask) {        unsigned Input = 0;        if (Idx < 0)          continue; @@ -3083,41 +3080,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {          Input = 1;          Idx -= SrcNumElts;        } -      if (Idx > MaxRange[Input]) -        MaxRange[Input] = Idx; -      if (Idx < MinRange[Input]) -        MinRange[Input] = Idx; -    } - -    // Check if the access is smaller than the vector size and can we find -    // a reasonable extract index. -    int RangeUse[2] = { -1, -1 };  // 0 = Unused, 1 = Extract, -1 = Can not -                                   // Extract. -    int StartIdx[2];  // StartIdx to extract from -    for (unsigned Input = 0; Input < 2; ++Input) { -      if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { -        RangeUse[Input] = 0; // Unused -        StartIdx[Input] = 0; -        continue; -      } -      // Find a good start index that is a multiple of the mask length. Then -      // see if the rest of the elements are in range. -      StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; -      if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && -          StartIdx[Input] + MaskNumElts <= SrcNumElts) -        RangeUse[Input] = 1; // Extract from a multiple of the mask length. +      // If all the indices come from the same MaskNumElts sized portion of +      // the sources we can use extract. Also make sure the extract wouldn't +      // extract past the end of the source. +      int NewStartIdx = alignDown(Idx, MaskNumElts); +      if (NewStartIdx + MaskNumElts > SrcNumElts || +          (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx)) +        CanExtract = false; +      // Make sure we always update StartIdx as we use it to track if all +      // elements are undef. +      StartIdx[Input] = NewStartIdx;      } -    if (RangeUse[0] == 0 && RangeUse[1] == 0) { +    if (StartIdx[0] < 0 && StartIdx[1] < 0) {        setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.        return;      } -    if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { +    if (CanExtract) {        // Extract appropriate subvector and generate a vector shuffle        for (unsigned Input = 0; Input < 2; ++Input) {          SDValue &Src = Input == 0 ? 
Src1 : Src2; -        if (RangeUse[Input] == 0) +        if (StartIdx[Input] < 0)            Src = DAG.getUNDEF(VT);          else {            Src = DAG.getNode( @@ -3128,16 +3112,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {        }        // Calculate new mask. -      SmallVector<int, 8> MappedOps; -      for (unsigned i = 0; i != MaskNumElts; ++i) { -        int Idx = Mask[i]; -        if (Idx >= 0) { -          if (Idx < (int)SrcNumElts) -            Idx -= StartIdx[0]; -          else -            Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; -        } -        MappedOps.push_back(Idx); +      SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end()); +      for (int &Idx : MappedOps) { +        if (Idx >= (int)SrcNumElts) +          Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; +        else if (Idx >= 0) +          Idx -= StartIdx[0];        }        setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps)); @@ -3151,8 +3131,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {    EVT EltVT = VT.getVectorElementType();    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());    SmallVector<SDValue,8> Ops; -  for (unsigned i = 0; i != MaskNumElts; ++i) { -    int Idx = Mask[i]; +  for (int Idx : Mask) {      SDValue Res;      if (Idx < 0) { @@ -3281,7 +3260,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {          // N = N + Offset          uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); -        // In an inbouds GEP with an offset that is nonnegative even when +        // In an inbounds GEP with an offset that is nonnegative even when          // interpreted as signed, assume there is no unsigned overflow.          SDNodeFlags Flags;          if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) @@ -4752,7 +4731,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(    else      FuncInfo.ArgDbgValues.push_back(          BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE)) -            .addOperand(*Op) +            .add(*Op)              .addImm(Offset)              .addMetadata(Variable)              .addMetadata(Expr)); @@ -4764,7 +4743,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(  SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,                                               DILocalVariable *Variable,                                               DIExpression *Expr, int64_t Offset, -                                             DebugLoc dl, +                                             const DebugLoc &dl,                                               unsigned DbgSDNodeOrder) {    SDDbgValue *SDV;    auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode()); @@ -4794,9 +4773,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,  #  define setjmp_undefined_for_msvc  #endif -/// visitIntrinsicCall - Lower the call to the specified intrinsic function.  If -/// we want to emit this as a call to a named external function, return the name -/// otherwise lower it and return null. +/// Lower the call to the specified intrinsic function. If we want to emit this +/// as a call to a named external function, return the name. Otherwise, lower it +/// and return null.  
const char *  SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -4929,14 +4908,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {        report_fatal_error("Unsupported element size");      TargetLowering::CallLoweringInfo CLI(DAG); -    CLI.setDebugLoc(sdl) -        .setChain(getRoot()) -        .setCallee(TLI.getLibcallCallingConv(LibraryCall), -                   Type::getVoidTy(*DAG.getContext()), -                   DAG.getExternalSymbol( -                       TLI.getLibcallName(LibraryCall), -                       TLI.getPointerTy(DAG.getDataLayout())), -                   std::move(Args)); +    CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( +        TLI.getLibcallCallingConv(LibraryCall), +        Type::getVoidTy(*DAG.getContext()), +        DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), +                              TLI.getPointerTy(DAG.getDataLayout())), +        std::move(Args));      std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);      DAG.setRoot(CallResult.second); @@ -5301,6 +5278,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {                               getValue(I.getArgOperand(1)),                               getValue(I.getArgOperand(2))));      return nullptr; +  case Intrinsic::experimental_constrained_fadd: +  case Intrinsic::experimental_constrained_fsub: +  case Intrinsic::experimental_constrained_fmul: +  case Intrinsic::experimental_constrained_fdiv: +  case Intrinsic::experimental_constrained_frem: +    visitConstrainedFPIntrinsic(I, Intrinsic); +    return nullptr;    case Intrinsic::fmuladd: {      EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());      if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -5537,7 +5521,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {    case Intrinsic::trap: {      StringRef TrapFuncName =          I.getAttributes() -            .getAttribute(AttributeSet::FunctionIndex, "trap-func-name") +            .getAttribute(AttributeList::FunctionIndex, "trap-func-name")              .getValueAsString();      if (TrapFuncName.empty()) {        ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? @@ -5548,7 +5532,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {      TargetLowering::ArgListTy Args;      TargetLowering::CallLoweringInfo CLI(DAG); -    CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee( +    CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(          CallingConv::C, I.getType(),          DAG.getExternalSymbol(TrapFuncName.data(),                                TLI.getPointerTy(DAG.getDataLayout())), @@ -5749,6 +5733,46 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {    }  } +void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, +                                                      unsigned Intrinsic) { +  SDLoc sdl = getCurSDLoc(); +  unsigned Opcode; +  switch (Intrinsic) { +  default: llvm_unreachable("Impossible intrinsic");  // Can't reach here. 
+  case Intrinsic::experimental_constrained_fadd:  +    Opcode = ISD::STRICT_FADD; +    break; +  case Intrinsic::experimental_constrained_fsub: +    Opcode = ISD::STRICT_FSUB; +    break; +  case Intrinsic::experimental_constrained_fmul: +    Opcode = ISD::STRICT_FMUL; +    break; +  case Intrinsic::experimental_constrained_fdiv: +    Opcode = ISD::STRICT_FDIV; +    break; +  case Intrinsic::experimental_constrained_frem: +    Opcode = ISD::STRICT_FREM; +    break; +  } +  const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +  SDValue Chain = getRoot(); +  SDValue Ops[3] = { Chain, getValue(I.getArgOperand(0)), +                     getValue(I.getArgOperand(1)) }; +  SmallVector<EVT, 4> ValueVTs; +  ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); +  ValueVTs.push_back(MVT::Other); // Out chain + +  SDVTList VTs = DAG.getVTList(ValueVTs); +  SDValue Result = DAG.getNode(Opcode, sdl, VTs, Ops); + +  assert(Result.getNode()->getNumValues() == 2); +  SDValue OutChain = Result.getValue(1); +  DAG.setRoot(OutChain); +  SDValue FPResult = Result.getValue(0); +  setValue(&I, FPResult); +} +  std::pair<SDValue, SDValue>  SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,                                      const BasicBlock *EHPadBB) { @@ -5827,7 +5851,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,    Type *RetTy = CS.getType();    TargetLowering::ArgListTy Args; -  TargetLowering::ArgListEntry Entry;    Args.reserve(CS.arg_size());    const Value *SwiftErrorVal = nullptr; @@ -5843,6 +5866,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,    for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();         i != e; ++i) { +    TargetLowering::ArgListEntry Entry;      const Value *V = *i;      // Skip empty types @@ -5852,11 +5876,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,      SDValue ArgNode = getValue(V);      Entry.Node = ArgNode; Entry.Ty = V->getType(); -    // Skip the first return-type Attribute to get to params. -    Entry.setAttributes(&CS, i - CS.arg_begin() + 1); +    Entry.setAttributes(&CS, i - CS.arg_begin());      // Use swifterror virtual register as input to the call. -    if (Entry.isSwiftError && TLI.supportSwiftError()) { +    if (Entry.IsSwiftError && TLI.supportSwiftError()) {        SwiftErrorVal = V;        // We find the virtual register for the actual swifterror argument.        // Instead of using the Value, we use the virtual register instead. @@ -5869,7 +5892,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,      // If we have an explicit sret argument that is an Instruction, (i.e., it      // might point to function-local memory), we can't meaningfully tail-call. -    if (Entry.isSRet && isa<Instruction>(V)) +    if (Entry.IsSRet && isa<Instruction>(V))        isTailCall = false;    } @@ -5912,8 +5935,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,    }  } -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. +/// Return true if it only matters that the value is equal or not-equal to zero.  
static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {    for (const User *U : V->users()) {      if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -5928,13 +5950,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {  }  static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, -                             Type *LoadTy,                               SelectionDAGBuilder &Builder) {    // Check to see if this load can be trivially constant folded, e.g. if the    // input is from a string literal.    if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {      // Cast pointer to the type we really want to load. +    Type *LoadTy = +        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); +    if (LoadVT.isVector()) +      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements()); +      LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),                                           PointerType::getUnqual(LoadTy)); @@ -5967,8 +5993,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,    return LoadVal;  } -/// processIntegerCallValue - Record the value for an instruction that -/// produces an integer result, converting the type where necessary. +/// Record the value for an instruction that produces an integer result, +/// converting the type where necessary.  void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,                                                    SDValue Value,                                                    bool IsSigned) { @@ -5981,20 +6007,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,    setValue(&I, Value);  } -/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a memcmp call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { -  // Verify that the prototype makes sense.  
int memcmp(void*,void*,size_t) -  if (I.getNumArgOperands() != 3) -    return false; -    const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); -  if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || -      !I.getArgOperand(2)->getType()->isIntegerTy() || -      !I.getType()->isIntegerTy()) -    return false; -    const Value *Size = I.getArgOperand(2);    const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);    if (CSize && CSize->getZExtValue() == 0) { @@ -6005,11 +6024,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {    }    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); -  std::pair<SDValue, SDValue> Res = -    TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), -                                getValue(LHS), getValue(RHS), getValue(Size), -                                MachinePointerInfo(LHS), -                                MachinePointerInfo(RHS)); +  std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( +      DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), +      getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));    if (Res.first.getNode()) {      processIntegerCallValue(I, Res.first, true);      PendingLoads.push_back(Res.second); @@ -6018,88 +6035,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {    // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0    // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0 -  if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) { -    bool ActuallyDoIt = true; -    MVT LoadVT; -    Type *LoadTy; -    switch (CSize->getZExtValue()) { -    default: -      LoadVT = MVT::Other; -      LoadTy = nullptr; -      ActuallyDoIt = false; -      break; -    case 2: -      LoadVT = MVT::i16; -      LoadTy = Type::getInt16Ty(CSize->getContext()); -      break; -    case 4: -      LoadVT = MVT::i32; -      LoadTy = Type::getInt32Ty(CSize->getContext()); -      break; -    case 8: -      LoadVT = MVT::i64; -      LoadTy = Type::getInt64Ty(CSize->getContext()); -      break; -        /* -    case 16: -      LoadVT = MVT::v4i32; -      LoadTy = Type::getInt32Ty(CSize->getContext()); -      LoadTy = VectorType::get(LoadTy, 4); -      break; -         */ -    } - -    // This turns into unaligned loads.  We only do this if the target natively -    // supports the MVT we'll be loading or if it is small enough (<= 4) that -    // we'll only produce a small number of byte loads. +  if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I)) +    return false; -    // Require that we can find a legal MVT, and only do this if the target -    // supports unaligned loads of that type.  Expanding into byte loads would -    // bloat the code. +  // If the target has a fast compare for the given size, it will return a +  // preferred load type for that size. Require that the load VT is legal and +  // that the target supports unaligned loads of that type. Otherwise, return +  // INVALID. +  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {      const TargetLowering &TLI = DAG.getTargetLoweringInfo(); -    if (ActuallyDoIt && CSize->getZExtValue() > 4) { -      unsigned DstAS = LHS->getType()->getPointerAddressSpace(); -      unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); +    MVT LVT = TLI.hasFastEqualityCompare(NumBits); +    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {        // TODO: Handle 5 byte compare as 4-byte + 1 byte.        // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.        
// TODO: Check alignment of src and dest ptrs. -      if (!TLI.isTypeLegal(LoadVT) || -          !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || -          !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) -        ActuallyDoIt = false; +      unsigned DstAS = LHS->getType()->getPointerAddressSpace(); +      unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); +      if (!TLI.isTypeLegal(LVT) || +          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) || +          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS)) +        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;      } -    if (ActuallyDoIt) { -      SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); -      SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); +    return LVT; +  }; -      SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, -                                 ISD::SETNE); -      processIntegerCallValue(I, Res, false); -      return true; -    } +  // This turns into unaligned loads. We only do this if the target natively +  // supports the MVT we'll be loading or if it is small enough (<= 4) that +  // we'll only produce a small number of byte loads. +  MVT LoadVT; +  unsigned NumBitsToCompare = CSize->getZExtValue() * 8; +  switch (NumBitsToCompare) { +  default: +    return false; +  case 16: +    LoadVT = MVT::i16; +    break; +  case 32: +    LoadVT = MVT::i32; +    break; +  case 64: +  case 128: +  case 256: +    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare); +    break;    } +  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE) +    return false; -  return false; +  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this); +  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this); + +  // Bitcast to a wide integer type if the loads are vectors. +  if (LoadVT.isVector()) { +    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits()); +    LoadL = DAG.getBitcast(CmpVT, LoadL); +    LoadR = DAG.getBitcast(CmpVT, LoadR); +  } + +  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE); +  processIntegerCallValue(I, Cmp, false); +  return true;  } -/// visitMemChrCall -- See if we can lower a memchr call into an optimized -/// form.  If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a memchr call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { -  // Verify that the prototype makes sense.  void *memchr(void *, int, size_t) -  if (I.getNumArgOperands() != 3) -    return false; -    const Value *Src = I.getArgOperand(0);    const Value *Char = I.getArgOperand(1);    const Value *Length = I.getArgOperand(2); -  if (!Src->getType()->isPointerTy() || -      !Char->getType()->isIntegerTy() || -      !Length->getType()->isIntegerTy() || -      !I.getType()->isPointerTy()) -    return false;    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();    std::pair<SDValue, SDValue> Res = @@ -6115,15 +6123,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {    return false;  } -/// -/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to -/// to adjust the dst pointer by the size of the copied memory. +/// See if we can lower a mempcpy call into an optimized form. 
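// [Illustrative sketch, not part of the patch] For an equality-only memcmp of a
// small constant size, the lowering loads each side as one wide integer and
// compares once, rather than calling the library routine. A C++ model of the
// 8-byte case (memcpy stands in for the unaligned load; names are hypothetical):
#include <cassert>
#include <cstdint>
#include <cstring>

// Equivalent to (memcmp(L, R, 8) != 0) when only equality matters.
static bool memNe8(const void *L, const void *R) {
  uint64_t A, B;
  std::memcpy(&A, L, 8); // one unaligned 64-bit load per side
  std::memcpy(&B, R, 8);
  return A != B;         // single integer compare, no byte loop
}

int main() {
  char X[] = "abcdefgh";
  char Y[] = "abcdefgh";
  char Z[] = "abcdefgX";
  assert(!memNe8(X, Y));
  assert(memNe8(X, Z));
  return 0;
}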
If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { - -  // Verify argument count: void *mempcpy(void *, const void *, size_t) -  if (I.getNumArgOperands() != 3) -    return false; -    SDValue Dst = getValue(I.getArgOperand(0));    SDValue Src = getValue(I.getArgOperand(1));    SDValue Size = getValue(I.getArgOperand(2)); @@ -6158,19 +6163,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {    return true;  } -/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an -/// optimized form.  If so, return true and lower it, otherwise return false -/// and it will be lowered like a normal call. +/// See if we can lower a strcpy call into an optimized form.  If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { -  // Verify that the prototype makes sense.  char *strcpy(char *, char *) -  if (I.getNumArgOperands() != 2) -    return false; -    const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); -  if (!Arg0->getType()->isPointerTy() || -      !Arg1->getType()->isPointerTy() || -      !I.getType()->isPointerTy()) -    return false;    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();    std::pair<SDValue, SDValue> Res = @@ -6187,19 +6186,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {    return false;  } -/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a strcmp call into an optimized form.  If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { -  // Verify that the prototype makes sense.  int strcmp(void*,void*) -  if (I.getNumArgOperands() != 2) -    return false; -    const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); -  if (!Arg0->getType()->isPointerTy() || -      !Arg1->getType()->isPointerTy() || -      !I.getType()->isIntegerTy()) -    return false;    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();    std::pair<SDValue, SDValue> Res = @@ -6216,17 +6209,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {    return false;  } -/// visitStrLenCall -- See if we can lower a strlen call into an optimized -/// form.  If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strlen call into an optimized form.  If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { -  // Verify that the prototype makes sense.  
size_t strlen(char *) -  if (I.getNumArgOperands() != 1) -    return false; -    const Value *Arg0 = I.getArgOperand(0); -  if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy()) -    return false;    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();    std::pair<SDValue, SDValue> Res = @@ -6241,19 +6230,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {    return false;  } -/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized -/// form.  If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strnlen call into an optimized form.  If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { -  // Verify that the prototype makes sense.  size_t strnlen(char *, size_t) -  if (I.getNumArgOperands() != 2) -    return false; -    const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); -  if (!Arg0->getType()->isPointerTy() || -      !Arg1->getType()->isIntegerTy() || -      !I.getType()->isIntegerTy()) -    return false;    const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();    std::pair<SDValue, SDValue> Res = @@ -6269,16 +6252,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {    return false;  } -/// visitUnaryFloatCall - If a call instruction is a unary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a unary floating-point operation into an SDNode with +/// the specified Opcode.  If so, return true and lower it, otherwise return +/// false and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,                                                unsigned Opcode) { -  // Sanity check that it really is a unary floating-point call. -  if (I.getNumArgOperands() != 1 || -      !I.getArgOperand(0)->getType()->isFloatingPointTy() || -      I.getType() != I.getArgOperand(0)->getType() || -      !I.onlyReadsMemory()) +  // We already checked this call's prototype; verify it doesn't modify errno. +  if (!I.onlyReadsMemory())      return false;    SDValue Tmp = getValue(I.getArgOperand(0)); @@ -6286,17 +6268,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,    return true;  } -/// visitBinaryFloatCall - If a call instruction is a binary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a binary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it. Otherwise return +/// false, and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype.  bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,                                                 unsigned Opcode) { -  // Sanity check that it really is a binary floating-point call. 
-  if (I.getNumArgOperands() != 2 || -      !I.getArgOperand(0)->getType()->isFloatingPointTy() || -      I.getType() != I.getArgOperand(0)->getType() || -      I.getType() != I.getArgOperand(1)->getType() || -      !I.onlyReadsMemory()) +  // We already checked this call's prototype; verify it doesn't modify errno. +  if (!I.onlyReadsMemory())      return false;    SDValue Tmp0 = getValue(I.getArgOperand(0)); @@ -6336,20 +6316,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {      // Check for well-known libc/libm calls.  If the function is internal, it      // can't be a library call.  Don't do the check if marked as nobuiltin for      // some reason. -    LibFunc::Func Func; +    LibFunc Func;      if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && -        LibInfo->getLibFunc(F->getName(), Func) && +        LibInfo->getLibFunc(*F, Func) &&          LibInfo->hasOptimizedCodeGen(Func)) {        switch (Func) {        default: break; -      case LibFunc::copysign: -      case LibFunc::copysignf: -      case LibFunc::copysignl: -        if (I.getNumArgOperands() == 2 &&   // Basic sanity checks. -            I.getArgOperand(0)->getType()->isFloatingPointTy() && -            I.getType() == I.getArgOperand(0)->getType() && -            I.getType() == I.getArgOperand(1)->getType() && -            I.onlyReadsMemory()) { +      case LibFunc_copysign: +      case LibFunc_copysignf: +      case LibFunc_copysignl: +        // We already checked this call's prototype; verify it doesn't modify +        // errno. +        if (I.onlyReadsMemory()) {            SDValue LHS = getValue(I.getArgOperand(0));            SDValue RHS = getValue(I.getArgOperand(1));            setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), @@ -6357,122 +6335,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {            return;          }          break; -      case LibFunc::fabs: -      case LibFunc::fabsf: -      case LibFunc::fabsl: +      case LibFunc_fabs: +      case LibFunc_fabsf: +      case LibFunc_fabsl:          if (visitUnaryFloatCall(I, ISD::FABS))            return;          break; -      case LibFunc::fmin: -      case LibFunc::fminf: -      case LibFunc::fminl: +      case LibFunc_fmin: +      case LibFunc_fminf: +      case LibFunc_fminl:          if (visitBinaryFloatCall(I, ISD::FMINNUM))            return;          break; -      case LibFunc::fmax: -      case LibFunc::fmaxf: -      case LibFunc::fmaxl: +      case LibFunc_fmax: +      case LibFunc_fmaxf: +      case LibFunc_fmaxl:          if (visitBinaryFloatCall(I, ISD::FMAXNUM))            return;          break; -      case LibFunc::sin: -      case LibFunc::sinf: -      case LibFunc::sinl: +      case LibFunc_sin: +      case LibFunc_sinf: +      case LibFunc_sinl:          if (visitUnaryFloatCall(I, ISD::FSIN))            return;          break; -      case LibFunc::cos: -      case LibFunc::cosf: -      case LibFunc::cosl: +      case LibFunc_cos: +      case LibFunc_cosf: +      case LibFunc_cosl:          if (visitUnaryFloatCall(I, ISD::FCOS))            return;          break; -      case LibFunc::sqrt: -      case LibFunc::sqrtf: -      case LibFunc::sqrtl: -      case LibFunc::sqrt_finite: -      case LibFunc::sqrtf_finite: -      case LibFunc::sqrtl_finite: +      case LibFunc_sqrt: +      case LibFunc_sqrtf: +      case LibFunc_sqrtl: +      case LibFunc_sqrt_finite: +      case LibFunc_sqrtf_finite: +      case LibFunc_sqrtl_finite:          if (visitUnaryFloatCall(I, ISD::FSQRT))            
return;          break; -      case LibFunc::floor: -      case LibFunc::floorf: -      case LibFunc::floorl: +      case LibFunc_floor: +      case LibFunc_floorf: +      case LibFunc_floorl:          if (visitUnaryFloatCall(I, ISD::FFLOOR))            return;          break; -      case LibFunc::nearbyint: -      case LibFunc::nearbyintf: -      case LibFunc::nearbyintl: +      case LibFunc_nearbyint: +      case LibFunc_nearbyintf: +      case LibFunc_nearbyintl:          if (visitUnaryFloatCall(I, ISD::FNEARBYINT))            return;          break; -      case LibFunc::ceil: -      case LibFunc::ceilf: -      case LibFunc::ceill: +      case LibFunc_ceil: +      case LibFunc_ceilf: +      case LibFunc_ceill:          if (visitUnaryFloatCall(I, ISD::FCEIL))            return;          break; -      case LibFunc::rint: -      case LibFunc::rintf: -      case LibFunc::rintl: +      case LibFunc_rint: +      case LibFunc_rintf: +      case LibFunc_rintl:          if (visitUnaryFloatCall(I, ISD::FRINT))            return;          break; -      case LibFunc::round: -      case LibFunc::roundf: -      case LibFunc::roundl: +      case LibFunc_round: +      case LibFunc_roundf: +      case LibFunc_roundl:          if (visitUnaryFloatCall(I, ISD::FROUND))            return;          break; -      case LibFunc::trunc: -      case LibFunc::truncf: -      case LibFunc::truncl: +      case LibFunc_trunc: +      case LibFunc_truncf: +      case LibFunc_truncl:          if (visitUnaryFloatCall(I, ISD::FTRUNC))            return;          break; -      case LibFunc::log2: -      case LibFunc::log2f: -      case LibFunc::log2l: +      case LibFunc_log2: +      case LibFunc_log2f: +      case LibFunc_log2l:          if (visitUnaryFloatCall(I, ISD::FLOG2))            return;          break; -      case LibFunc::exp2: -      case LibFunc::exp2f: -      case LibFunc::exp2l: +      case LibFunc_exp2: +      case LibFunc_exp2f: +      case LibFunc_exp2l:          if (visitUnaryFloatCall(I, ISD::FEXP2))            return;          break; -      case LibFunc::memcmp: +      case LibFunc_memcmp:          if (visitMemCmpCall(I))            return;          break; -      case LibFunc::mempcpy: +      case LibFunc_mempcpy:          if (visitMemPCpyCall(I))            return;          break; -      case LibFunc::memchr: +      case LibFunc_memchr:          if (visitMemChrCall(I))            return;          break; -      case LibFunc::strcpy: +      case LibFunc_strcpy:          if (visitStrCpyCall(I, false))            return;          break; -      case LibFunc::stpcpy: +      case LibFunc_stpcpy:          if (visitStrCpyCall(I, true))            return;          break; -      case LibFunc::strcmp: +      case LibFunc_strcmp:          if (visitStrCmpCall(I))            return;          break; -      case LibFunc::strlen: +      case LibFunc_strlen:          if (visitStrLenCall(I))            return;          break; -      case LibFunc::strnlen: +      case LibFunc_strnlen:          if (visitStrNLenCall(I))            return;          break; @@ -7361,7 +7339,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(    // Populate the argument list.    // Attributes for args start at offset 1, after the return attribute. 
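A side note on the LibFunc_* switch above: the per-call prototype checks were hoisted into the caller, so the only guard left before emitting FABS/FMINNUM/FCOPYSIGN/etc. is I.onlyReadsMemory(), i.e. the libm call must be known not to write memory (in particular, not to set errno). A rough source-level sketch of the kind of call that qualifies (the function below is illustrative and not taken from the patch; front ends may also emit the llvm.fabs intrinsic directly, in which case this path is not involved):

  #include <cmath>
  #include <cstdio>

  // fabs() has no error cases, so the call can be marked as not writing
  // memory and lowered to a single ISD::FABS node instead of a libcall.
  double keep_positive(double x) { return std::fabs(x); }

  int main() { std::printf("%f\n", keep_positive(-2.5)); return 0; }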
-  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; +  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;         ArgI != ArgE; ++ArgI) {      const Value *V = CS->getOperand(ArgI); @@ -7370,7 +7348,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(      TargetLowering::ArgListEntry Entry;      Entry.Node = getValue(V);      Entry.Ty = V->getType(); -    Entry.setAttributes(&CS, AttrI); +    Entry.setAttributes(&CS, ArgIdx);      Args.push_back(Entry);    } @@ -7631,9 +7609,9 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,    FuncInfo.MF->getFrameInfo().setHasPatchPoint();  } -/// Returns an AttributeSet representing the attributes applied to the return +/// Returns an AttributeList representing the attributes applied to the return  /// value of the given call. -static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {    SmallVector<Attribute::AttrKind, 2> Attrs;    if (CLI.RetSExt)      Attrs.push_back(Attribute::SExt); @@ -7642,8 +7620,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {    if (CLI.IsInReg)      Attrs.push_back(Attribute::InReg); -  return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, -                           Attrs); +  return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, +                            Attrs);  }  /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -7683,15 +7661,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {      ArgListEntry Entry;      Entry.Node = DemoteStackSlot;      Entry.Ty = StackSlotPtrType; -    Entry.isSExt = false; -    Entry.isZExt = false; -    Entry.isInReg = false; -    Entry.isSRet = true; -    Entry.isNest = false; -    Entry.isByVal = false; -    Entry.isReturned = false; -    Entry.isSwiftSelf = false; -    Entry.isSwiftError = false; +    Entry.IsSExt = false; +    Entry.IsZExt = false; +    Entry.IsInReg = false; +    Entry.IsSRet = true; +    Entry.IsNest = false; +    Entry.IsByVal = false; +    Entry.IsReturned = false; +    Entry.IsSwiftSelf = false; +    Entry.IsSwiftError = false;      Entry.Alignment = Align;      CLI.getArgs().insert(CLI.getArgs().begin(), Entry);      CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); @@ -7724,7 +7702,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {    ArgListTy &Args = CLI.getArgs();    if (supportSwiftError()) {      for (unsigned i = 0, e = Args.size(); i != e; ++i) { -      if (Args[i].isSwiftError) { +      if (Args[i].IsSwiftError) {          ISD::InputArg MyFlags;          MyFlags.VT = getPointerTy(DL);          MyFlags.ArgVT = EVT(getPointerTy(DL)); @@ -7741,7 +7719,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {      SmallVector<EVT, 4> ValueVTs;      ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);      Type *FinalType = Args[i].Ty; -    if (Args[i].isByVal) +    if (Args[i].IsByVal)        FinalType = cast<PointerType>(Args[i].Ty)->getElementType();      bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(          FinalType, CLI.CallConv, CLI.IsVarArg); @@ -7754,11 +7732,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {        ISD::ArgFlagsTy Flags;        unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); -      if (Args[i].isZExt) +      if (Args[i].IsZExt)          Flags.setZExt(); -  
    if (Args[i].isSExt) +      if (Args[i].IsSExt)          Flags.setSExt(); -      if (Args[i].isInReg) { +      if (Args[i].IsInReg) {          // If we are using vectorcall calling convention, a structure that is          // passed InReg - is surely an HVA          if (CLI.CallConv == CallingConv::X86_VectorCall && @@ -7771,15 +7749,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {          // Set InReg Flag          Flags.setInReg();        } -      if (Args[i].isSRet) +      if (Args[i].IsSRet)          Flags.setSRet(); -      if (Args[i].isSwiftSelf) +      if (Args[i].IsSwiftSelf)          Flags.setSwiftSelf(); -      if (Args[i].isSwiftError) +      if (Args[i].IsSwiftError)          Flags.setSwiftError(); -      if (Args[i].isByVal) +      if (Args[i].IsByVal)          Flags.setByVal(); -      if (Args[i].isInAlloca) { +      if (Args[i].IsInAlloca) {          Flags.setInAlloca();          // Set the byval flag for CCAssignFn callbacks that don't know about          // inalloca.  This way we can know how many bytes we should've allocated @@ -7788,7 +7766,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {          // in the various CC lowering callbacks.          Flags.setByVal();        } -      if (Args[i].isByVal || Args[i].isInAlloca) { +      if (Args[i].IsByVal || Args[i].IsInAlloca) {          PointerType *Ty = cast<PointerType>(Args[i].Ty);          Type *ElementTy = Ty->getElementType();          Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); @@ -7801,7 +7779,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {            FrameAlign = getByValTypeAlignment(ElementTy, DL);          Flags.setByValAlign(FrameAlign);        } -      if (Args[i].isNest) +      if (Args[i].IsNest)          Flags.setNest();        if (NeedsRegBlock)          Flags.setInConsecutiveRegs(); @@ -7812,13 +7790,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {        SmallVector<SDValue, 4> Parts(NumParts);        ISD::NodeType ExtendKind = ISD::ANY_EXTEND; -      if (Args[i].isSExt) +      if (Args[i].IsSExt)          ExtendKind = ISD::SIGN_EXTEND; -      else if (Args[i].isZExt) +      else if (Args[i].IsZExt)          ExtendKind = ISD::ZERO_EXTEND;        // Conservatively only handle 'returned' on non-vectors for now -      if (Args[i].isReturned && !Op.getValueType().isVector()) { +      if (Args[i].IsReturned && !Op.getValueType().isVector()) {          assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&                 "unexpected use of 'returned'");          // Before passing 'returned' to the target lowering code, ensure that @@ -7832,9 +7810,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {          // parameter extension method is not compatible with the return          // extension method          if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || -            (ExtendKind != ISD::ANY_EXTEND && -             CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt)) -        Flags.setReturned(); +            (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt && +             CLI.RetZExt == Args[i].IsZExt)) +          Flags.setReturned();        }        getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, @@ -8010,6 +7988,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {    return true;  } +typedef DenseMap<const Argument *, +                 
std::pair<const AllocaInst *, const StoreInst *>> +    ArgCopyElisionMapTy; + +/// Scan the entry block of the function in FuncInfo for arguments that look +/// like copies into a local alloca. Record any copied arguments in +/// ArgCopyElisionCandidates. +static void +findArgumentCopyElisionCandidates(const DataLayout &DL, +                                  FunctionLoweringInfo *FuncInfo, +                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) { +  // Record the state of every static alloca used in the entry block. Argument +  // allocas are all used in the entry block, so we need approximately as many +  // entries as we have arguments. +  enum StaticAllocaInfo { Unknown, Clobbered, Elidable }; +  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas; +  unsigned NumArgs = FuncInfo->Fn->arg_size(); +  StaticAllocas.reserve(NumArgs * 2); + +  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * { +    if (!V) +      return nullptr; +    V = V->stripPointerCasts(); +    const auto *AI = dyn_cast<AllocaInst>(V); +    if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI)) +      return nullptr; +    auto Iter = StaticAllocas.insert({AI, Unknown}); +    return &Iter.first->second; +  }; + +  // Look for stores of arguments to static allocas. Look through bitcasts and +  // GEPs to handle type coercions, as long as the alloca is fully initialized +  // by the store. Any non-store use of an alloca escapes it and any subsequent +  // unanalyzed store might write it. +  // FIXME: Handle structs initialized with multiple stores. +  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) { +    // Look for stores, and handle non-store uses conservatively. +    const auto *SI = dyn_cast<StoreInst>(&I); +    if (!SI) { +      // We will look through cast uses, so ignore them completely. +      if (I.isCast()) +        continue; +      // Ignore debug info intrinsics, they don't escape or store to allocas. +      if (isa<DbgInfoIntrinsic>(I)) +        continue; +      // This is an unknown instruction. Assume it escapes or writes to all +      // static alloca operands. +      for (const Use &U : I.operands()) { +        if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U)) +          *Info = StaticAllocaInfo::Clobbered; +      } +      continue; +    } + +    // If the stored value is a static alloca, mark it as escaped. +    if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand())) +      *Info = StaticAllocaInfo::Clobbered; + +    // Check if the destination is a static alloca. +    const Value *Dst = SI->getPointerOperand()->stripPointerCasts(); +    StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst); +    if (!Info) +      continue; +    const AllocaInst *AI = cast<AllocaInst>(Dst); + +    // Skip allocas that have been initialized or clobbered. +    if (*Info != StaticAllocaInfo::Unknown) +      continue; + +    // Check if the stored value is an argument, and that this store fully +    // initializes the alloca. Don't elide copies from the same argument twice. 
+    const Value *Val = SI->getValueOperand()->stripPointerCasts(); +    const auto *Arg = dyn_cast<Argument>(Val); +    if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || +        Arg->getType()->isEmptyTy() || +        DL.getTypeStoreSize(Arg->getType()) != +            DL.getTypeAllocSize(AI->getAllocatedType()) || +        ArgCopyElisionCandidates.count(Arg)) { +      *Info = StaticAllocaInfo::Clobbered; +      continue; +    } + +    DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n'); + +    // Mark this alloca and store for argument copy elision. +    *Info = StaticAllocaInfo::Elidable; +    ArgCopyElisionCandidates.insert({Arg, {AI, SI}}); + +    // Stop scanning if we've seen all arguments. This will happen early in -O0 +    // builds, which is useful, because -O0 builds have large entry blocks and +    // many allocas. +    if (ArgCopyElisionCandidates.size() == NumArgs) +      break; +  } +} + +/// Try to elide argument copies from memory into a local alloca. Succeeds if +/// ArgVal is a load from a suitable fixed stack object. +static void tryToElideArgumentCopy( +    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains, +    DenseMap<int, int> &ArgCopyElisionFrameIndexMap, +    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs, +    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg, +    SDValue ArgVal, bool &ArgHasUses) { +  // Check if this is a load from a fixed stack object. +  auto *LNode = dyn_cast<LoadSDNode>(ArgVal); +  if (!LNode) +    return; +  auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()); +  if (!FINode) +    return; + +  // Check that the fixed stack object is the right size and alignment. +  // Look at the alignment that the user wrote on the alloca instead of looking +  // at the stack object. +  auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg); +  assert(ArgCopyIter != ArgCopyElisionCandidates.end()); +  const AllocaInst *AI = ArgCopyIter->second.first; +  int FixedIndex = FINode->getIndex(); +  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI]; +  int OldIndex = AllocaIndex; +  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo(); +  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) { +    DEBUG(dbgs() << "  argument copy elision failed due to bad fixed stack " +                    "object size\n"); +    return; +  } +  unsigned RequiredAlignment = AI->getAlignment(); +  if (!RequiredAlignment) { +    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment( +        AI->getAllocatedType()); +  } +  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) { +    DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca " +                    "greater than stack argument alignment (" +                 << RequiredAlignment << " vs " +                 << MFI.getObjectAlignment(FixedIndex) << ")\n"); +    return; +  } + +  // Perform the elision. Delete the old stack object and replace its only use +  // in the variable info map. Mark the stack object as mutable. 
+  DEBUG({ +    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n' +           << "  Replacing frame index " << OldIndex << " with " << FixedIndex +           << '\n'; +  }); +  MFI.RemoveStackObject(OldIndex); +  MFI.setIsImmutableObjectIndex(FixedIndex, false); +  AllocaIndex = FixedIndex; +  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex}); +  Chains.push_back(ArgVal.getValue(1)); + +  // Avoid emitting code for the store implementing the copy. +  const StoreInst *SI = ArgCopyIter->second.second; +  ElidedArgCopyInstrs.insert(SI); + +  // Check for uses of the argument again so that we can avoid exporting ArgVal +  // if it isn't used by anything other than the store. +  for (const Value *U : Arg.users()) { +    if (U != SI) { +      ArgHasUses = true; +      break; +    } +  } +} +  void SelectionDAGISel::LowerArguments(const Function &F) {    SelectionDAG &DAG = SDB->DAG;    SDLoc dl = SDB->getCurSDLoc(); @@ -8032,15 +8177,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {      Ins.push_back(RetArg);    } +  // Look for stores of arguments to static allocas. Mark such arguments with a +  // flag to ask the target to give us the memory location of that argument if +  // available. +  ArgCopyElisionMapTy ArgCopyElisionCandidates; +  findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates); +    // Set up the incoming argument description vector. -  unsigned Idx = 1; -  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); -       I != E; ++I, ++Idx) { +  unsigned Idx = 0; +  for (const Argument &Arg : F.args()) { +    ++Idx;      SmallVector<EVT, 4> ValueVTs; -    ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); -    bool isArgValueUsed = !I->use_empty(); +    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); +    bool isArgValueUsed = !Arg.use_empty();      unsigned PartBase = 0; -    Type *FinalType = I->getType(); +    Type *FinalType = Arg.getType();      if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))        FinalType = cast<PointerType>(FinalType)->getElementType();      bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( @@ -8060,7 +8211,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {          // If we are using vectorcall calling convention, a structure that is          // passed InReg - is surely an HVA          if (F.getCallingConv() == CallingConv::X86_VectorCall && -            isa<StructType>(I->getType())) { +            isa<StructType>(Arg.getType())) {            // The first value of a structure is marked            if (0 == Value)              Flags.setHvaStart(); @@ -8092,7 +8243,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {        if (Flags.isByVal() || Flags.isInAlloca()) { -        PointerType *Ty = cast<PointerType>(I->getType()); +        PointerType *Ty = cast<PointerType>(Arg.getType());          Type *ElementTy = Ty->getElementType();          Flags.setByValSize(DL.getTypeAllocSize(ElementTy));          // For ByVal, alignment should be passed from FE.  
BE will guess if @@ -8109,6 +8260,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {        if (NeedsRegBlock)          Flags.setInConsecutiveRegs();        Flags.setOrigAlign(OriginalAlignment); +      if (ArgCopyElisionCandidates.count(&Arg)) +        Flags.setCopyElisionCandidate();        MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);        unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT); @@ -8155,7 +8308,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {    // Set up the argument values.    unsigned i = 0; -  Idx = 1; +  Idx = 0;    if (!FuncInfo->CanLowerReturn) {      // Create a virtual register for the sret pointer, and put in a copy      // from the sret argument into it. @@ -8181,25 +8334,39 @@ void SelectionDAGISel::LowerArguments(const Function &F) {      ++i;    } -  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; -      ++I, ++Idx) { +  SmallVector<SDValue, 4> Chains; +  DenseMap<int, int> ArgCopyElisionFrameIndexMap; +  for (const Argument &Arg : F.args()) { +    ++Idx;      SmallVector<SDValue, 4> ArgValues;      SmallVector<EVT, 4> ValueVTs; -    ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); +    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);      unsigned NumValues = ValueVTs.size(); +    if (NumValues == 0) +      continue; + +    bool ArgHasUses = !Arg.use_empty(); + +    // Elide the copying store if the target loaded this argument from a +    // suitable fixed stack object. +    if (Ins[i].Flags.isCopyElisionCandidate()) { +      tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap, +                             ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg, +                             InVals[i], ArgHasUses); +    }      // If this argument is unused then remember its value. It is used to generate      // debugging information.      bool isSwiftErrorArg =          TLI->supportSwiftError() &&          F.getAttributes().hasAttribute(Idx, Attribute::SwiftError); -    if (I->use_empty() && NumValues && !isSwiftErrorArg) { -      SDB->setUnusedArgValue(&*I, InVals[i]); +    if (!ArgHasUses && !isSwiftErrorArg) { +      SDB->setUnusedArgValue(&Arg, InVals[i]);        // Also remember any frame index for use in FastISel.        if (FrameIndexSDNode *FI =            dyn_cast<FrameIndexSDNode>(InVals[i].getNode())) -        FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); +        FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());      }      for (unsigned Val = 0; Val != NumValues; ++Val) { @@ -8210,16 +8377,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {        // Even an apparant 'unused' swifterror argument needs to be returned. So        // we do generate a copy for it that can be used on return from the        // function. 
-      if (!I->use_empty() || isSwiftErrorArg) { +      if (ArgHasUses || isSwiftErrorArg) {          Optional<ISD::NodeType> AssertOp;          if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))            AssertOp = ISD::AssertSext;          else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))            AssertOp = ISD::AssertZext; -        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], -                                             NumParts, PartVT, VT, -                                             nullptr, AssertOp)); +        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, +                                             PartVT, VT, nullptr, AssertOp));        }        i += NumParts; @@ -8232,18 +8398,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {      // Note down frame index.      if (FrameIndexSDNode *FI =          dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) -      FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); +      FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());      SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),                                       SDB->getCurSDLoc()); -    SDB->setValue(&*I, Res); +    SDB->setValue(&Arg, Res);      if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {        if (LoadSDNode *LNode =            dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))          if (FrameIndexSDNode *FI =              dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) -        FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); +        FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());      }      // Update the SwiftErrorVRegDefMap. @@ -8263,18 +8429,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) {        // uses with vregs.        unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();        if (TargetRegisterInfo::isVirtualRegister(Reg)) { -        FuncInfo->ValueMap[&*I] = Reg; +        FuncInfo->ValueMap[&Arg] = Reg;          continue;        }      } -    if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { -      FuncInfo->InitializeRegForValue(&*I); -      SDB->CopyToExportRegsIfNeeded(&*I); +    if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) { +      FuncInfo->InitializeRegForValue(&Arg); +      SDB->CopyToExportRegsIfNeeded(&Arg);      }    } +  if (!Chains.empty()) { +    Chains.push_back(NewRoot); +    NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); +  } + +  DAG.setRoot(NewRoot); +    assert(i == InVals.size() && "Argument register count mismatch!"); +  // If any argument copy elisions occurred and we have debug info, update the +  // stale frame indices used in the dbg.declare variable info table. +  MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); +  if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { +    for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { +      auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); +      if (I != ArgCopyElisionFrameIndexMap.end()) +        VI.Slot = I->second; +    } +  } +    // Finally, if the target has anything special to do, allow it to do so.    
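To make the LowerArguments/copy-elision changes above easier to follow: the candidate scan targets the common unoptimized pattern in which a parameter is immediately spilled to its own stack slot in the entry block; when the incoming argument already lives in a fixed stack object of matching size and alignment, that slot is reused and the spill store is dropped. A hedged source-level illustration (function and variable names are invented for the example):

  #include <cstdio>

  // Compiled without optimization, 'x' is typically stored to a dedicated
  // stack slot (an entry-block alloca) as soon as the function is entered.
  // If 'x' is passed on the stack, that store is exactly the copy that the
  // new findArgumentCopyElisionCandidates()/tryToElideArgumentCopy() code
  // elides by pointing the alloca at the argument's fixed stack object.
  int plus_one(int x) { return x + 1; }

  int main() { std::printf("%d\n", plus_one(41)); return 0; }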
EmitFunctionEntryCode();  } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index abde8a89befc..c6acc09b6602 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -616,33 +616,27 @@ public:    void init(GCFunctionInfo *gfi, AliasAnalysis &aa,              const TargetLibraryInfo *li); -  /// clear - Clear out the current SelectionDAG and the associated -  /// state and prepare this SelectionDAGBuilder object to be used -  /// for a new block. This doesn't clear out information about -  /// additional blocks that are needed to complete switch lowering -  /// or PHI node updating; that information is cleared out as it is -  /// consumed. +  /// Clear out the current SelectionDAG and the associated state and prepare +  /// this SelectionDAGBuilder object to be used for a new block. This doesn't +  /// clear out information about additional blocks that are needed to complete +  /// switch lowering or PHI node updating; that information is cleared out as +  /// it is consumed.    void clear(); -  /// clearDanglingDebugInfo - Clear the dangling debug information -  /// map. This function is separated from the clear so that debug -  /// information that is dangling in a basic block can be properly -  /// resolved in a different basic block. This allows the -  /// SelectionDAG to resolve dangling debug information attached -  /// to PHI nodes. +  /// Clear the dangling debug information map. This function is separated from +  /// the clear so that debug information that is dangling in a basic block can +  /// be properly resolved in a different basic block. This allows the +  /// SelectionDAG to resolve dangling debug information attached to PHI nodes.    void clearDanglingDebugInfo(); -  /// getRoot - Return the current virtual root of the Selection DAG, -  /// flushing any PendingLoad items. This must be done before emitting -  /// a store or any other node that may need to be ordered after any -  /// prior load instructions. -  /// +  /// Return the current virtual root of the Selection DAG, flushing any +  /// PendingLoad items. This must be done before emitting a store or any other +  /// node that may need to be ordered after any prior load instructions.    SDValue getRoot(); -  /// getControlRoot - Similar to getRoot, but instead of flushing all the -  /// PendingLoad items, flush all the PendingExports items. It is necessary -  /// to do this before emitting a terminator instruction. -  /// +  /// Similar to getRoot, but instead of flushing all the PendingLoad items, +  /// flush all the PendingExports items. It is necessary to do this before +  /// emitting a terminator instruction.    
SDValue getControlRoot();    SDLoc getCurSDLoc() const { @@ -688,12 +682,13 @@ public:                              MachineBasicBlock *FBB, MachineBasicBlock *CurBB,                              MachineBasicBlock *SwitchBB,                              Instruction::BinaryOps Opc, BranchProbability TW, -                            BranchProbability FW); +                            BranchProbability FW, bool InvertCond);    void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,                                      MachineBasicBlock *FBB,                                      MachineBasicBlock *CurBB,                                      MachineBasicBlock *SwitchBB, -                                    BranchProbability TW, BranchProbability FW); +                                    BranchProbability TW, BranchProbability FW, +                                    bool InvertCond);    bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);    bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);    void CopyToExportRegsIfNeeded(const Value *V); @@ -900,6 +895,7 @@ private:    void visitInlineAsm(ImmutableCallSite CS);    const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);    void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); +  void visitConstrainedFPIntrinsic(const CallInst &I, unsigned Intrinsic);    void visitVAStart(const CallInst &I);    void visitVAArg(const VAArgInst &I); @@ -944,8 +940,8 @@ private:    /// Return the appropriate SDDbgValue based on N.    SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable, -                          DIExpression *Expr, int64_t Offset, DebugLoc dl, -                          unsigned DbgSDNodeOrder); +                          DIExpression *Expr, int64_t Offset, +                          const DebugLoc &dl, unsigned DbgSDNodeOrder);  };  /// RegsForValue - This struct represents the registers (physical or virtual) @@ -958,26 +954,23 @@ private:  /// type.  ///  struct RegsForValue { -  /// ValueVTs - The value types of the values, which may not be legal, and +  /// The value types of the values, which may not be legal, and    /// may need be promoted or synthesized from one or more registers. -  ///    SmallVector<EVT, 4> ValueVTs; -  /// RegVTs - The value types of the registers. This is the same size as -  /// ValueVTs and it records, for each value, what the type of the assigned -  /// register or registers are. (Individual values are never synthesized -  /// from more than one type of register.) +  /// The value types of the registers. This is the same size as ValueVTs and it +  /// records, for each value, what the type of the assigned register or +  /// registers are. (Individual values are never synthesized from more than one +  /// type of register.)    ///    /// With virtual registers, the contents of RegVTs is redundant with TLI's    /// getRegisterType member function, however when with physical registers    /// it is necessary to have a separate record of the types. -  ///    SmallVector<MVT, 4> RegVTs; -  /// Regs - This list holds the registers assigned to the values. +  /// This list holds the registers assigned to the values.    /// Each legal or promoted value requires one register, and each    /// expanded value requires multiple registers. 
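For readers new to this area, the "expanded value requires multiple registers" case mentioned above is easiest to see with an integer wider than the target's register size, e.g. an i64 split into two i32 parts on a 32-bit target. A stand-alone sketch of that decomposition (plain C++, independent of the LLVM classes; the helper name is made up):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Split a 64-bit value into 32-bit parts, low part first, the way a value
  // of an illegal type is expanded across several registers of a legal type.
  static std::vector<uint32_t> splitIntoParts(uint64_t V) {
    return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
  }

  int main() {
    for (uint32_t Part : splitIntoParts(0x1122334455667788ULL))
      std::printf("0x%08x\n", Part);
    return 0;
  }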
-  ///    SmallVector<unsigned, 4> Regs;    RegsForValue(); @@ -987,33 +980,33 @@ struct RegsForValue {    RegsForValue(LLVMContext &Context, const TargetLowering &TLI,                 const DataLayout &DL, unsigned Reg, Type *Ty); -  /// append - Add the specified values to this one. +  /// Add the specified values to this one.    void append(const RegsForValue &RHS) {      ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());      RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());      Regs.append(RHS.Regs.begin(), RHS.Regs.end());    } -  /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -  /// this value and returns the result as a ValueVTs value.  This uses -  /// Chain/Flag as the input and updates them for the output Chain/Flag. -  /// If the Flag pointer is NULL, no flag is used. +  /// Emit a series of CopyFromReg nodes that copies from this value and returns +  /// the result as a ValueVTs value. This uses Chain/Flag as the input and +  /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no +  /// flag is used.    SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,                            const SDLoc &dl, SDValue &Chain, SDValue *Flag,                            const Value *V = nullptr) const; -  /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified -  /// value into the registers specified by this object.  This uses Chain/Flag -  /// as the input and updates them for the output Chain/Flag.  If the Flag -  /// pointer is nullptr, no flag is used.  If V is not nullptr, then it is used -  /// in printing better diagnostic messages on error. +  /// Emit a series of CopyToReg nodes that copies the specified value into the +  /// registers specified by this object. This uses Chain/Flag as the input and +  /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no +  /// flag is used. If V is not nullptr, then it is used in printing better +  /// diagnostic messages on error.    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,                       SDValue &Chain, SDValue *Flag, const Value *V = nullptr,                       ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; -  /// AddInlineAsmOperands - Add this value to the specified inlineasm node -  /// operand list.  This adds the code marker, matching input operand index -  /// (if applicable), and includes the number of values added into it. +  /// Add this value to the specified inlineasm node operand list. This adds the +  /// code marker, matching input operand index (if applicable), and includes +  /// the number of values added into it.    
void AddInlineAsmOperands(unsigned Kind, bool HasMatching,                              unsigned MatchingIdx, const SDLoc &dl,                              SelectionDAG &DAG, std::vector<SDValue> &Ops) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0faaad8a21b7..488c60a28ffb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -300,6 +300,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {    case ISD::GET_DYNAMIC_AREA_OFFSET:    return "get.dynamic.area.offset";    // Bit manipulation +  case ISD::ABS:                        return "abs";    case ISD::BITREVERSE:                 return "bitreverse";    case ISD::BSWAP:                      return "bswap";    case ISD::CTPOP:                      return "ctpop"; @@ -366,11 +367,13 @@ static Printable PrintNodeId(const SDNode &Node) {    });  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); } -void SDNode::dump(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const {    print(dbgs(), G);    dbgs() << '\n';  } +#endif  void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {    for (unsigned i = 0, e = getNumValues(); i != e; ++i) { @@ -416,7 +419,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {        OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';      else {        OS << "<APFloat("; -      CSDN->getValueAPF().bitcastToAPInt().dump(); +      CSDN->getValueAPF().bitcastToAPInt().print(OS, false);        OS << ")>";      }    } else if (const GlobalAddressSDNode *GADN = @@ -566,6 +569,7 @@ static bool shouldPrintInline(const SDNode &Node) {    return Node.getNumOperands() == 0;  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {    for (const SDValue &Op : N->op_values()) {      if (shouldPrintInline(*Op.getNode())) @@ -592,6 +596,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const {    if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);    dbgs() << "\n\n";  } +#endif  void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {    OS << PrintNodeId(*this) << ": "; @@ -618,6 +623,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G,    }  } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet;  static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,                         const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -646,15 +652,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,      DumpNodesr(OS, Op.getNode(), indent+2, G, once);  } -void SDNode::dumpr() const { +LLVM_DUMP_METHOD void SDNode::dumpr() const {    VisitedSDNodeSet once;    DumpNodesr(dbgs(), this, 0, nullptr, once);  } -void SDNode::dumpr(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const {    VisitedSDNodeSet once;    DumpNodesr(dbgs(), this, 0, G, once);  } +#endif  static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,                                    const SelectionDAG *G, unsigned depth, @@ -688,14 +695,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {    printrWithDepth(OS, G, 10);  } +#if !defined(NDEBUG) || 
defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {    printrWithDepth(dbgs(), G, depth);  } -void SDNode::dumprFull(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const {    // Don't print impossibly deep things.    dumprWithDepth(G, 10);  } +#endif  void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {    printr(OS, G); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 64e6c221229b..e21204dbb966 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -11,40 +11,65 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h"  #include "ScheduleDAGSDNodes.h"  #include "SelectionDAGBuilder.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h"  #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/BranchProbabilityInfo.h"  #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/CodeGen/FastISel.h"  #include "llvm/CodeGen/FunctionLoweringInfo.h"  #include "llvm/CodeGen/GCMetadata.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/MachineValueType.h"  #include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/SelectionDAG.h"  #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SelectionDAGNodes.h"  #include "llvm/CodeGen/StackProtector.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h"  #include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h" @@ -59,6 +84,13 
@@  #include "llvm/Target/TargetSubtargetInfo.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <memory> +#include <string> +#include <utility> +#include <vector>  using namespace llvm; @@ -73,104 +105,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");  STATISTIC(NumFastIselFailLowerArguments,            "Number of entry blocks where fast isel failed to lower arguments"); -#ifndef NDEBUG -static cl::opt<bool> -EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden, -          cl::desc("Enable extra verbose messages in the \"fast\" " -                   "instruction selector")); - -  // Terminators -STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret"); -STATISTIC(NumFastIselFailBr,"Fast isel fails on Br"); -STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch"); -STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr"); -STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke"); -STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume"); -STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable"); - -  // Standard binary operators... -STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add"); -STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd"); -STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub"); -STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub"); -STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul"); -STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul"); -STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv"); -STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv"); -STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv"); -STATISTIC(NumFastIselFailURem,"Fast isel fails on URem"); -STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem"); -STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem"); - -  // Logical operators... -STATISTIC(NumFastIselFailAnd,"Fast isel fails on And"); -STATISTIC(NumFastIselFailOr,"Fast isel fails on Or"); -STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor"); - -  // Memory instructions... -STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca"); -STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load"); -STATISTIC(NumFastIselFailStore,"Fast isel fails on Store"); -STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg"); -STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM"); -STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence"); -STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr"); - -  // Convert instructions... -STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc"); -STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt"); -STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt"); -STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc"); -STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt"); -STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI"); -STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI"); -STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP"); -STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP"); -STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr"); -STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt"); -STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast"); - -  // Other instructions... 
-STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp"); -STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp"); -STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI"); -STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select"); -STATISTIC(NumFastIselFailCall,"Fast isel fails on Call"); -STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl"); -STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr"); -STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr"); -STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg"); -STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement"); -STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement"); -STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector"); -STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue"); -STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue"); -STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad"); - -// Intrinsic instructions... -STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call"); -STATISTIC(NumFastIselFailSAddWithOverflow, -          "Fast isel fails on sadd.with.overflow"); -STATISTIC(NumFastIselFailUAddWithOverflow, -          "Fast isel fails on uadd.with.overflow"); -STATISTIC(NumFastIselFailSSubWithOverflow, -          "Fast isel fails on ssub.with.overflow"); -STATISTIC(NumFastIselFailUSubWithOverflow, -          "Fast isel fails on usub.with.overflow"); -STATISTIC(NumFastIselFailSMulWithOverflow, -          "Fast isel fails on smul.with.overflow"); -STATISTIC(NumFastIselFailUMulWithOverflow, -          "Fast isel fails on umul.with.overflow"); -STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress"); -STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call"); -STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call"); -STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call"); -#endif - -static cl::opt<bool> -EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, -          cl::desc("Enable verbose messages in the \"fast\" " -                   "instruction selector"));  static cl::opt<int> EnableFastISelAbort(      "fast-isel-abort", cl::Hidden,      cl::desc("Enable abort calls when \"fast\" instruction selection " @@ -179,6 +113,11 @@ static cl::opt<int> EnableFastISelAbort(               "abort for argument lowering, and 3 will never fallback "               "to SelectionDAG.")); +static cl::opt<bool> EnableFastISelFallbackReport( +    "fast-isel-report-on-fallback", cl::Hidden, +    cl::desc("Emit a diagnostic when \"fast\" instruction selection " +             "falls back to SelectionDAG.")); +  static cl::opt<bool>  UseMBPI("use-mbpi",          cl::desc("use Machine Branch Probability Info"), @@ -238,7 +177,7 @@ MachinePassRegistry RegisterScheduler::Registry;  ///  //===---------------------------------------------------------------------===//  static cl::opt<RegisterScheduler::FunctionPassCtor, false, -               RegisterPassParser<RegisterScheduler> > +               RegisterPassParser<RegisterScheduler>>  ISHeuristic("pre-RA-sched",              cl::init(&createDefaultScheduler), cl::Hidden,              cl::desc("Instruction schedulers available (before register" @@ -249,6 +188,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target",                          createDefaultScheduler);  namespace llvm { +    //===--------------------------------------------------------------------===//    /// \brief 
This class is used by SelectionDAGISel to temporarily override    /// the optimization level on a per-function basis. @@ -318,6 +258,7 @@ namespace llvm {             "Unknown sched type!");      return createILPListDAGScheduler(IS, OptLevel);    } +  } // end namespace llvm  // EmitInstrWithCustomInserter - This method should be implemented by targets @@ -431,8 +372,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {            MachineFunctionProperties::Property::Selected))      return false;    // Do some sanity-checking on the command-line options. -  assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) && -         "-fast-isel-verbose requires -fast-isel");    assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&           "-fast-isel-abort > 0 requires -fast-isel"); @@ -457,12 +396,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; +  ORE = make_unique<OptimizationRemarkEmitter>(&Fn);    DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");    SplitCriticalSideEffectEdges(const_cast<Function &>(Fn)); -  CurDAG->init(*MF); +  CurDAG->init(*MF, *ORE);    FuncInfo->set(Fn, *MF, CurDAG);    if (UseMBPI && OptLevel != CodeGenOpt::None) @@ -502,6 +442,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {      TLI->initializeSplitCSR(EntryMBB);    SelectAllBasicBlocks(Fn); +  if (FastISelFailed && EnableFastISelFallbackReport) { +    DiagnosticInfoISelFallback DiagFallback(Fn); +    Fn.getContext().diagnose(DiagFallback); +  }    // If the first basic block in the function has live ins that need to be    // copied into vregs, emit the copies into the top of the block before @@ -628,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {      unsigned To = I->second;      // If To is also scheduled to be replaced, find what its ultimate      // replacement is. -    for (;;) { +    while (true) {        DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);        if (J == E) break;        To = J->second; @@ -666,13 +610,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {    return true;  } +static void reportFastISelFailure(MachineFunction &MF, +                                  OptimizationRemarkEmitter &ORE, +                                  OptimizationRemarkMissed &R, +                                  bool ShouldAbort) { +  // Print the function name explicitly if we don't have a debug location (which +  // makes the diagnostic less useful) or if we're going to emit a raw error. +  if (!R.getLocation().isValid() || ShouldAbort) +    R << (" (in function: " + MF.getName() + ")").str(); + +  if (ShouldAbort) +    report_fatal_error(R.getMsg()); + +  ORE.emit(R); +} +  void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,                                          BasicBlock::const_iterator End,                                          bool &HadTailCall) {    // Lower the instructions. If a call is emitted as a tail call, cease emitting    // nodes for this block. 
-  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) -    SDB->visit(*I); +  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) { +    if (!ElidedArgCopyInstrs.count(&*I)) +      SDB->visit(*I); +  }    // Make sure the root of the DAG is up-to-date.    CurDAG->setRoot(SDB->getControlRoot()); @@ -731,6 +692,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {    int BlockNumber = -1;    (void)BlockNumber;    bool MatchFilterBB = false; (void)MatchFilterBB; + +  // Pre-type legalization allow creation of any node types. +  CurDAG->NewNodesMustHaveLegalTypes = false; +  #ifndef NDEBUG    MatchFilterBB = (FilterDAGBasicBlockName.empty() ||                     FilterDAGBasicBlockName == @@ -777,6 +742,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {    DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber          << " '" << BlockName << "'\n"; CurDAG->dump()); +  // Only allow creation of legal node types.    CurDAG->NewNodesMustHaveLegalTypes = true;    if (Changed) { @@ -802,12 +768,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {    }    if (Changed) { +    DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber +          << " '" << BlockName << "'\n"; CurDAG->dump()); +      {        NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,                           GroupDescription, TimePassesIsEnabled);        CurDAG->LegalizeTypes();      } +    DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber +          << " '" << BlockName << "'\n"; CurDAG->dump()); +      if (ViewDAGCombineLT && MatchFilterBB)        CurDAG->viewGraph("dag-combine-lv input for " + BlockName); @@ -907,10 +879,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {  }  namespace { +  /// ISelUpdater - helper class to handle updates of the instruction selection  /// graph.  class ISelUpdater : public SelectionDAG::DAGUpdateListener {    SelectionDAG::allnodes_iterator &ISelPosition; +  public:    ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)      : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} @@ -923,8 +897,53 @@ public:        ++ISelPosition;    }  }; +  } // end anonymous namespace +static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) { +  unsigned OrigOpc = Node->getOpcode(); +  switch (OrigOpc) { +    case ISD::STRICT_FADD: NewOpc = ISD::FADD; return true; +    case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; return true; +    case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; return true; +    case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; return true; +    case ISD::STRICT_FREM: NewOpc = ISD::FREM; return true; +    default: return false; +  } +} + +SDNode* SelectionDAGISel::MutateStrictFPToFP(SDNode *Node, unsigned NewOpc) { +  assert(((Node->getOpcode() == ISD::STRICT_FADD && NewOpc == ISD::FADD) || +          (Node->getOpcode() == ISD::STRICT_FSUB && NewOpc == ISD::FSUB) || +          (Node->getOpcode() == ISD::STRICT_FMUL && NewOpc == ISD::FMUL) || +          (Node->getOpcode() == ISD::STRICT_FDIV && NewOpc == ISD::FDIV) || +          (Node->getOpcode() == ISD::STRICT_FREM && NewOpc == ISD::FREM)) && +          "Unexpected StrictFP opcode!"); + +  // We're taking this node out of the chain, so we need to re-link things. 
+  SDValue InputChain = Node->getOperand(0); +  SDValue OutputChain = SDValue(Node, 1); +  CurDAG->ReplaceAllUsesOfValueWith(OutputChain, InputChain); + +  SDVTList VTs = CurDAG->getVTList(Node->getOperand(1).getValueType()); +  SDValue Ops[2] = { Node->getOperand(1), Node->getOperand(2) }; +  SDNode *Res = CurDAG->MorphNodeTo(Node, NewOpc, VTs, Ops); +   +  // MorphNodeTo can operate in two ways: if an existing node with the +  // specified operands exists, it can just return it.  Otherwise, it +  // updates the node in place to have the requested operands. +  if (Res == Node) { +    // If we updated the node in place, reset the node ID.  To the isel, +    // this should be just like a newly allocated machine node. +    Res->setNodeId(-1); +  } else { +    CurDAG->ReplaceAllUsesWith(Node, Res); +    CurDAG->RemoveDeadNode(Node); +  } + +  return Res;  +} +  void SelectionDAGISel::DoInstructionSelection() {    DEBUG(dbgs() << "===== Instruction selection begins: BB#"          << FuncInfo->MBB->getNumber() @@ -960,7 +979,23 @@ void SelectionDAGISel::DoInstructionSelection() {        if (Node->use_empty())          continue; +      // When we are using non-default rounding modes or FP exception behavior +      // FP operations are represented by StrictFP pseudo-operations.  They +      // need to be simplified here so that the target-specific instruction +      // selectors know how to handle them. +      // +      // If the current node is a strict FP pseudo-op, the isStrictFPOp() +      // function will provide the corresponding normal FP opcode to which the +      // node should be mutated. +      unsigned NormalFPOpc = ISD::UNDEF; +      bool IsStrictFPOp = isStrictFPOp(Node, NormalFPOpc); +      if (IsStrictFPOp) +        Node = MutateStrictFPToFP(Node, NormalFPOpc); +        Select(Node); + +      // FIXME: Add code here to attach an implicit def and use of +      // target-specific FP environment registers.      }      CurDAG->setRoot(Dummy.getValue()); @@ -1046,116 +1081,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,           !FuncInfo->isExportedInst(I); // Exported instrs must be computed.  } -#ifndef NDEBUG -// Collect per Instruction statistics for fast-isel misses.  Only those -// instructions that cause the bail are accounted for.  It does not account for -// instructions higher in the block.  Thus, summing the per instructions stats -// will not add up to what is reported by NumFastIselFailures. -static void collectFailStats(const Instruction *I) { -  switch (I->getOpcode()) { -  default: assert (0 && "<Invalid operator> "); - -  // Terminators -  case Instruction::Ret:         NumFastIselFailRet++; return; -  case Instruction::Br:          NumFastIselFailBr++; return; -  case Instruction::Switch:      NumFastIselFailSwitch++; return; -  case Instruction::IndirectBr:  NumFastIselFailIndirectBr++; return; -  case Instruction::Invoke:      NumFastIselFailInvoke++; return; -  case Instruction::Resume:      NumFastIselFailResume++; return; -  case Instruction::Unreachable: NumFastIselFailUnreachable++; return; - -  // Standard binary operators... 
-  case Instruction::Add:  NumFastIselFailAdd++; return; -  case Instruction::FAdd: NumFastIselFailFAdd++; return; -  case Instruction::Sub:  NumFastIselFailSub++; return; -  case Instruction::FSub: NumFastIselFailFSub++; return; -  case Instruction::Mul:  NumFastIselFailMul++; return; -  case Instruction::FMul: NumFastIselFailFMul++; return; -  case Instruction::UDiv: NumFastIselFailUDiv++; return; -  case Instruction::SDiv: NumFastIselFailSDiv++; return; -  case Instruction::FDiv: NumFastIselFailFDiv++; return; -  case Instruction::URem: NumFastIselFailURem++; return; -  case Instruction::SRem: NumFastIselFailSRem++; return; -  case Instruction::FRem: NumFastIselFailFRem++; return; - -  // Logical operators... -  case Instruction::And: NumFastIselFailAnd++; return; -  case Instruction::Or:  NumFastIselFailOr++; return; -  case Instruction::Xor: NumFastIselFailXor++; return; - -  // Memory instructions... -  case Instruction::Alloca:        NumFastIselFailAlloca++; return; -  case Instruction::Load:          NumFastIselFailLoad++; return; -  case Instruction::Store:         NumFastIselFailStore++; return; -  case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return; -  case Instruction::AtomicRMW:     NumFastIselFailAtomicRMW++; return; -  case Instruction::Fence:         NumFastIselFailFence++; return; -  case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return; - -  // Convert instructions... -  case Instruction::Trunc:    NumFastIselFailTrunc++; return; -  case Instruction::ZExt:     NumFastIselFailZExt++; return; -  case Instruction::SExt:     NumFastIselFailSExt++; return; -  case Instruction::FPTrunc:  NumFastIselFailFPTrunc++; return; -  case Instruction::FPExt:    NumFastIselFailFPExt++; return; -  case Instruction::FPToUI:   NumFastIselFailFPToUI++; return; -  case Instruction::FPToSI:   NumFastIselFailFPToSI++; return; -  case Instruction::UIToFP:   NumFastIselFailUIToFP++; return; -  case Instruction::SIToFP:   NumFastIselFailSIToFP++; return; -  case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; -  case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return; -  case Instruction::BitCast:  NumFastIselFailBitCast++; return; - -  // Other instructions... 
-  case Instruction::ICmp:           NumFastIselFailICmp++; return; -  case Instruction::FCmp:           NumFastIselFailFCmp++; return; -  case Instruction::PHI:            NumFastIselFailPHI++; return; -  case Instruction::Select:         NumFastIselFailSelect++; return; -  case Instruction::Call: { -    if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) { -      switch (Intrinsic->getIntrinsicID()) { -      default: -        NumFastIselFailIntrinsicCall++; return; -      case Intrinsic::sadd_with_overflow: -        NumFastIselFailSAddWithOverflow++; return; -      case Intrinsic::uadd_with_overflow: -        NumFastIselFailUAddWithOverflow++; return; -      case Intrinsic::ssub_with_overflow: -        NumFastIselFailSSubWithOverflow++; return; -      case Intrinsic::usub_with_overflow: -        NumFastIselFailUSubWithOverflow++; return; -      case Intrinsic::smul_with_overflow: -        NumFastIselFailSMulWithOverflow++; return; -      case Intrinsic::umul_with_overflow: -        NumFastIselFailUMulWithOverflow++; return; -      case Intrinsic::frameaddress: -        NumFastIselFailFrameaddress++; return; -      case Intrinsic::sqrt: -          NumFastIselFailSqrt++; return; -      case Intrinsic::experimental_stackmap: -        NumFastIselFailStackMap++; return; -      case Intrinsic::experimental_patchpoint_void: // fall-through -      case Intrinsic::experimental_patchpoint_i64: -        NumFastIselFailPatchPoint++; return; -      } -    } -    NumFastIselFailCall++; -    return; -  } -  case Instruction::Shl:            NumFastIselFailShl++; return; -  case Instruction::LShr:           NumFastIselFailLShr++; return; -  case Instruction::AShr:           NumFastIselFailAShr++; return; -  case Instruction::VAArg:          NumFastIselFailVAArg++; return; -  case Instruction::ExtractElement: NumFastIselFailExtractElement++; return; -  case Instruction::InsertElement:  NumFastIselFailInsertElement++; return; -  case Instruction::ShuffleVector:  NumFastIselFailShuffleVector++; return; -  case Instruction::ExtractValue:   NumFastIselFailExtractValue++; return; -  case Instruction::InsertValue:    NumFastIselFailInsertValue++; return; -  case Instruction::LandingPad:     NumFastIselFailLandingPad++; return; -  } -} -#endif // NDEBUG -  /// Set up SwiftErrorVals by going through the function. If the function has  /// swifterror argument, it will be the first entry.  static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, @@ -1190,9 +1115,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,  }  static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, +                                                FastISel *FastIS,                                                  const TargetLowering *TLI,                                                  const TargetInstrInfo *TII, -                                                const BasicBlock *LLVMBB,                                                  SelectionDAGBuilder *SDB) {    if (!TLI->supportSwiftError())      return; @@ -1202,22 +1127,27 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,    if (FuncInfo->SwiftErrorVals.empty())      return; -  if (pred_begin(LLVMBB) == pred_end(LLVMBB)) { -    auto &DL = FuncInfo->MF->getDataLayout(); -    auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); -    for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { -      // We will always generate a copy from the argument. 
It is always used at -      // least by the 'return' of the swifterror. -      if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) -        continue; -      unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); -      // Assign Undef to Vreg. We construct MI directly to make sure it works -      // with FastISel. -      BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), -              SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), -              VReg); -      FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); -    } +  assert(FuncInfo->MBB == &*FuncInfo->MF->begin() && +         "expected to insert into entry block"); +  auto &DL = FuncInfo->MF->getDataLayout(); +  auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); +  for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { +    // We will always generate a copy from the argument. It is always used at +    // least by the 'return' of the swifterror. +    if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) +      continue; +    unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); +    // Assign Undef to Vreg. We construct MI directly to make sure it works +    // with FastISel. +    BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), +            SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), +            VReg); + +    // Keep FastIS informed about the value we just inserted. +    if (FastIS) +      FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + +    FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);    }  } @@ -1340,6 +1270,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {  }  void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { +  FastISelFailed = false;    // Initialize the Fast-ISel state, if needed.    FastISel *FastIS = nullptr;    if (TM.Options.EnableFastISel) @@ -1347,12 +1278,53 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {    setupSwiftErrorVals(Fn, TLI, FuncInfo); -  // Iterate over all basic blocks in the function.    ReversePostOrderTraversal<const Function*> RPOT(&Fn); -  for (ReversePostOrderTraversal<const Function*>::rpo_iterator -       I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { -    const BasicBlock *LLVMBB = *I; +  // Lower arguments up front. An RPO iteration always visits the entry block +  // first. +  assert(*RPOT.begin() == &Fn.getEntryBlock()); +  ++NumEntryBlocks; + +  // Set up FuncInfo for ISel. Entry blocks never have PHIs. +  FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; +  FuncInfo->InsertPt = FuncInfo->MBB->begin(); + +  if (!FastIS) { +    LowerArguments(Fn); +  } else { +    // See if fast isel can lower the arguments. 
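+    // If it cannot, report the failure below and fall back to SelectionDAG
+    // lowering for the entry block's arguments.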
+    FastIS->startNewBlock(); +    if (!FastIS->lowerArguments()) { +      FastISelFailed = true; +      // Fast isel failed to lower these arguments +      ++NumFastIselFailLowerArguments; + +      OptimizationRemarkMissed R("sdagisel", "FastISelFailure", +                                 Fn.getSubprogram(), +                                 &Fn.getEntryBlock()); +      R << "FastISel didn't lower all arguments: " +        << ore::NV("Prototype", Fn.getType()); +      reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); + +      // Use SelectionDAG argument lowering +      LowerArguments(Fn); +      CurDAG->setRoot(SDB->getControlRoot()); +      SDB->clear(); +      CodeGenAndEmitDAG(); +    } + +    // If we inserted any instructions at the beginning, make a note of +    // where they are, so we can be sure to emit subsequent instructions +    // after them. +    if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) +      FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); +    else +      FastIS->setLastLocalValue(nullptr); +  } +  createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + +  // Iterate over all basic blocks in the function. +  for (const BasicBlock *LLVMBB : RPOT) {      if (OptLevel != CodeGenOpt::None) {        bool AllPredsVisited = true;        for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); @@ -1384,8 +1356,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {      FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];      if (!FuncInfo->MBB)        continue; // Some blocks like catchpads have no code or MBB. -    FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); -    createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB); + +    // Insert new instructions after any phi or argument setup code. +    FuncInfo->InsertPt = FuncInfo->MBB->end();      // Setup an EH landing-pad block.      FuncInfo->ExceptionPointerVirtReg = 0; @@ -1396,35 +1369,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {      // Before doing SelectionDAG ISel, see if FastISel has been requested.      if (FastIS) { -      FastIS->startNewBlock(); - -      // Emit code for any incoming arguments. This must happen before -      // beginning FastISel on the entry block. -      if (LLVMBB == &Fn.getEntryBlock()) { -        ++NumEntryBlocks; - -        // Lower any arguments needed in this block if this is the entry block. -        if (!FastIS->lowerArguments()) { -          // Fast isel failed to lower these arguments -          ++NumFastIselFailLowerArguments; -          if (EnableFastISelAbort > 1) -            report_fatal_error("FastISel didn't lower all arguments"); - -          // Use SelectionDAG argument lowering -          LowerArguments(Fn); -          CurDAG->setRoot(SDB->getControlRoot()); -          SDB->clear(); -          CodeGenAndEmitDAG(); -        } - -        // If we inserted any instructions at the beginning, make a note of -        // where they are, so we can be sure to emit subsequent instructions -        // after them. -        if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) -          FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); -        else -          FastIS->setLastLocalValue(nullptr); -      } +      if (LLVMBB != &Fn.getEntryBlock()) +        FastIS->startNewBlock();        unsigned NumFastIselRemaining = std::distance(Begin, End);        // Do FastISel on as many instructions as possible. 
@@ -1432,7 +1378,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {          const Instruction *Inst = &*std::prev(BI);          // If we no longer require this instruction, skip it. -        if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { +        if (isFoldedOrDeadInstruction(Inst, FuncInfo) || +            ElidedArgCopyInstrs.count(Inst)) {            --NumFastIselRemaining;            continue;          } @@ -1443,6 +1390,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {          // Try to select the instruction with FastISel.          if (FastIS->selectInstruction(Inst)) { +          FastISelFailed = true;            --NumFastIselRemaining;            ++NumFastIselSuccess;            // If fast isel succeeded, skip over all the folded instructions, and @@ -1465,22 +1413,22 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {            continue;          } -#ifndef NDEBUG -        if (EnableFastISelVerbose2) -          collectFailStats(Inst); -#endif -          // Then handle certain instructions as single-LLVM-Instruction blocks.          if (isa<CallInst>(Inst)) { +          OptimizationRemarkMissed R("sdagisel", "FastISelFailure", +                                     Inst->getDebugLoc(), LLVMBB); + +          R << "FastISel missed call"; + +          if (R.isEnabled() || EnableFastISelAbort) { +            std::string InstStrStorage; +            raw_string_ostream InstStr(InstStrStorage); +            InstStr << *Inst; -          if (EnableFastISelVerbose || EnableFastISelAbort) { -            dbgs() << "FastISel missed call: "; -            Inst->dump(); +            R << ": " << InstStr.str();            } -          if (EnableFastISelAbort > 2) -            // FastISel selector couldn't handle something and bailed. -            // For the purpose of debugging, just abort. -            report_fatal_error("FastISel didn't select the entire block"); + +          reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2);            if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&                !Inst->use_empty()) { @@ -1509,35 +1457,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {            continue;          } +        OptimizationRemarkMissed R("sdagisel", "FastISelFailure", +                                   Inst->getDebugLoc(), LLVMBB); +          bool ShouldAbort = EnableFastISelAbort; -        if (EnableFastISelVerbose || EnableFastISelAbort) { -          if (isa<TerminatorInst>(Inst)) { -            // Use a different message for terminator misses. -            dbgs() << "FastISel missed terminator: "; -            // Don't abort unless for terminator unless the level is really high -            ShouldAbort = (EnableFastISelAbort > 2); -          } else { -            dbgs() << "FastISel miss: "; -          } -          Inst->dump(); +        if (isa<TerminatorInst>(Inst)) { +          // Use a different message for terminator misses. 
+          R << "FastISel missed terminator"; +          // Don't abort for terminator unless the level is really high +          ShouldAbort = (EnableFastISelAbort > 2); +        } else { +          R << "FastISel missed"; +        } + +        if (R.isEnabled() || EnableFastISelAbort) { +          std::string InstStrStorage; +          raw_string_ostream InstStr(InstStrStorage); +          InstStr << *Inst; +          R << ": " << InstStr.str();          } -        if (ShouldAbort) -          // FastISel selector couldn't handle something and bailed. -          // For the purpose of debugging, just abort. -          report_fatal_error("FastISel didn't select the entire block"); + +        reportFastISelFailure(*MF, *ORE, R, ShouldAbort);          NumFastIselFailures += NumFastIselRemaining;          break;        }        FastIS->recomputeInsertPt(); -    } else { -      // Lower any arguments needed in this block if this is the entry block. -      if (LLVMBB == &Fn.getEntryBlock()) { -        ++NumEntryBlocks; -        LowerArguments(Fn); -      }      } +      if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {        bool FunctionBasedInstrumentation =            TLI->getSSPStackGuardCheck(*Fn.getParent()); @@ -1556,10 +1504,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {        // block.        bool HadTailCall;        SelectBasicBlock(Begin, BI, HadTailCall); + +      // But if FastISel was run, we already selected some of the block. +      // If we emitted a tail-call, we need to delete any previously emitted +      // instruction that follows it. +      if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end()) +        FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());      }      FinishBasicBlock();      FuncInfo->PHINodesToUpdate.clear(); +    ElidedArgCopyInstrs.clear();    }    propagateSwiftErrorVRegs(FuncInfo); @@ -2177,7 +2132,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,      IgnoreChains = false;    } -    SmallPtrSet<SDNode*, 16> Visited;    return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);  } @@ -2554,7 +2508,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,  LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool  CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,            SDValue N, -          const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { +          const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {    // Accept if it is exactly the same as a previously recorded node.    unsigned RecNo = MatcherTable[MatcherIndex++];    assert(RecNo < RecordedNodes.size() && "Invalid CheckSame"); @@ -2564,9 +2518,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,  /// CheckChildSame - Implements OP_CheckChildXSame.  LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool  CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, -             SDValue N, -             const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, -             unsigned ChildNo) { +              SDValue N, +              const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes, +              unsigned ChildNo) {    if (ChildNo >= N.getNumOperands())      return false;  // Match fails if out of range child #.    
return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo), @@ -2688,7 +2642,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,                                         unsigned Index, SDValue N,                                         bool &Result,                                         const SelectionDAGISel &SDISel, -                 SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { +                  SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {    switch (Table[Index++]) {    default:      Result = false; @@ -2756,6 +2710,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,  }  namespace { +  struct MatchScope {    /// FailIndex - If this match fails, this is the index to continue with.    unsigned FailIndex; @@ -2785,6 +2740,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener    SDNode **NodeToMatch;    SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;    SmallVectorImpl<MatchScope> &MatchScopes; +  public:    MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,                      SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN, @@ -2816,6 +2772,7 @@ public:            J.setNode(E);    }  }; +  } // end anonymous namespace  void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, @@ -2921,7 +2878,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,      // with an OPC_SwitchOpcode instruction.  Populate the table now, since this      // is the first time we're selecting an instruction.      unsigned Idx = 1; -    while (1) { +    while (true) {        // Get the size of this case.        unsigned CaseSize = MatcherTable[Idx++];        if (CaseSize & 128) @@ -2942,7 +2899,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        MatcherIndex = OpcodeOffset[N.getOpcode()];    } -  while (1) { +  while (true) {      assert(MatcherIndex < TableSize && "Invalid index");  #ifndef NDEBUG      unsigned CurrentOpcodeIndex = MatcherIndex; @@ -2957,7 +2914,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        // immediately fail, don't even bother pushing a scope for them.        unsigned FailIndex; -      while (1) { +      while (true) {          unsigned NumToSkip = MatcherTable[MatcherIndex++];          if (NumToSkip & 128)            NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); @@ -3118,7 +3075,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        unsigned CurNodeOpcode = N.getOpcode();        unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;        unsigned CaseSize; -      while (1) { +      while (true) {          // Get the size of this case.          CaseSize = MatcherTable[MatcherIndex++];          if (CaseSize & 128) @@ -3149,7 +3106,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        MVT CurNodeVT = N.getSimpleValueType();        unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;        unsigned CaseSize; -      while (1) { +      while (true) {          // Get the size of this case.          CaseSize = MatcherTable[MatcherIndex++];          if (CaseSize & 128) @@ -3215,7 +3172,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        // a single use.        
bool HasMultipleUses = false;        for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) -        if (!NodeStack[i].hasOneUse()) { +        if (!NodeStack[i].getNode()->hasOneUse()) {            HasMultipleUses = true;            break;          } @@ -3381,6 +3338,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,        RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));        continue;      } +    case OPC_Coverage: { +      // This is emitted right before MorphNode/EmitNode. +      // So it should be safe to assume that this node has been selected +      unsigned index = MatcherTable[MatcherIndex++]; +      index |= (MatcherTable[MatcherIndex++] << 8); +      dbgs() << "COVERED: " << getPatternForIndex(index) << "\n"; +      dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n"; +      continue; +    }      case OPC_EmitNode:     case OPC_MorphNodeTo:      case OPC_EmitNode0:    case OPC_EmitNode1:    case OPC_EmitNode2: @@ -3473,7 +3439,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,            RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),                                                               nullptr));          } -        } else {          assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&                 "NodeToMatch was removed partway through selection"); @@ -3610,7 +3575,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,      // find a case to check.      DEBUG(dbgs() << "  Match failed at index " << CurrentOpcodeIndex << "\n");      ++NumDAGIselRetries; -    while (1) { +    while (true) {        if (MatchScopes.empty()) {          CannotYetSelect(NodeToMatch);          return; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 690f0d2c8082..2756e276c6a9 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -55,14 +55,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,    // Conservatively require the attributes of the call to match those of    // the return. Ignore noalias because it doesn't affect the call sequence. -  AttributeSet CallerAttrs = F->getAttributes(); -  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex) -      .removeAttribute(Attribute::NoAlias).hasAttributes()) +  AttributeList CallerAttrs = F->getAttributes(); +  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) +          .removeAttribute(Attribute::NoAlias) +          .hasAttributes())      return false;    // It's not safe to eliminate the sign / zero extension of the return value. -  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || -      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) +  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || +      CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))      return false;    // Check if the only use is a function return node. @@ -96,19 +97,20 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,  /// \brief Set CallLoweringInfo attribute flags based on a call instruction  /// and called function attributes. 
-void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, -                                                 unsigned AttrIdx) { -  isSExt     = CS->paramHasAttr(AttrIdx, Attribute::SExt); -  isZExt     = CS->paramHasAttr(AttrIdx, Attribute::ZExt); -  isInReg    = CS->paramHasAttr(AttrIdx, Attribute::InReg); -  isSRet     = CS->paramHasAttr(AttrIdx, Attribute::StructRet); -  isNest     = CS->paramHasAttr(AttrIdx, Attribute::Nest); -  isByVal    = CS->paramHasAttr(AttrIdx, Attribute::ByVal); -  isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); -  isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); -  isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); -  isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); -  Alignment  = CS->getParamAlignment(AttrIdx); +void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS, +                                                     unsigned ArgIdx) { +  IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt); +  IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt); +  IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg); +  IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet); +  IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest); +  IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal); +  IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca); +  IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned); +  IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf); +  IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError); +  // FIXME: getParamAlignment is off by one from argument index. +  Alignment  = CS->getParamAlignment(ArgIdx + 1);  }  /// Generate a libcall taking the given operands as arguments and returning a @@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,    for (SDValue Op : Ops) {      Entry.Node = Op;      Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); -    Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); -    Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); +    Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); +    Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);      Args.push_back(Entry);    } @@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,    Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());    TargetLowering::CallLoweringInfo CLI(DAG);    bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned); -  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) -    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) -    .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed) -    .setSExtResult(signExtend).setZExtResult(!signExtend); +  CLI.setDebugLoc(dl) +      .setChain(DAG.getEntryNode()) +      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) +      .setNoReturn(doesNotReturn) +      .setDiscardResult(!isReturnValueUsed) +      .setSExtResult(signExtend) +      .setZExtResult(!signExtend);    return LowerCallTo(CLI);  } @@ -334,34 +339,35 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {  //  Optimization Methods  //===----------------------------------------------------------------------===// -/// Check to see if the specified operand of the specified instruction is a -/// constant integer. 
If so, check to see if there are any bits set in the -/// constant that are not demanded. If so, shrink the constant and return true. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, -                                                        const APInt &Demanded) { -  SDLoc dl(Op); +/// If the specified instruction has a constant integer operand and there are +/// bits set in that constant that are not demanded, then clear those bits and +/// return true. +bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant( +    SDValue Op, const APInt &Demanded) { +  SDLoc DL(Op); +  unsigned Opcode = Op.getOpcode();    // FIXME: ISD::SELECT, ISD::SELECT_CC -  switch (Op.getOpcode()) { -  default: break; +  switch (Opcode) { +  default: +    break;    case ISD::XOR:    case ISD::AND:    case ISD::OR: { -    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); -    if (!C) return false; +    auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); +    if (!Op1C) +      return false; -    if (Op.getOpcode() == ISD::XOR && -        (C->getAPIntValue() | (~Demanded)).isAllOnesValue()) +    // If this is a 'not' op, don't touch it because that's a canonical form. +    const APInt &C = Op1C->getAPIntValue(); +    if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue())        return false; -    // if we can expand it to have all bits set, do it -    if (C->getAPIntValue().intersects(~Demanded)) { +    if (C.intersects(~Demanded)) {        EVT VT = Op.getValueType(); -      SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0), -                                DAG.getConstant(Demanded & -                                                C->getAPIntValue(), -                                                dl, VT)); -      return CombineTo(Op, New); +      SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); +      SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); +      return CombineTo(Op, NewOp);      }      break; @@ -470,6 +476,21 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,    return true;  } +bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask, +                                          DAGCombinerInfo &DCI) const { + +  SelectionDAG &DAG = DCI.DAG; +  TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), +                        !DCI.isBeforeLegalizeOps()); +  APInt KnownZero, KnownOne; + +  bool Simplified = SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne, +                                         TLO); +  if (Simplified) +    DCI.CommitTargetLoweringOpt(TLO); +  return Simplified; +} +  /// Look at Op. At this point, we know that only the DemandedMask bits of the  /// result of Op are ever used downstream. 
If we can use this information to  /// simplify Op, create a new simplified DAG node and return true, returning the @@ -711,8 +732,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        }      } -    KnownZero = KnownZeroOut; -    KnownOne  = KnownOneOut; +    KnownZero = std::move(KnownZeroOut); +    KnownOne  = std::move(KnownOneOut);      break;    case ISD::SELECT:      if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero, @@ -750,6 +771,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,      KnownOne &= KnownOne2;      KnownZero &= KnownZero2;      break; +  case ISD::SETCC: { +    SDValue Op0 = Op.getOperand(0); +    SDValue Op1 = Op.getOperand(1); +    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); +    // If (1) we only need the sign-bit, (2) the setcc operands are the same +    // width as the setcc result, and (3) the result of a setcc conforms to 0 or +    // -1, we may be able to bypass the setcc. +    if (NewMask.isSignBit() && Op0.getScalarValueSizeInBits() == BitWidth && +        getBooleanContents(Op.getValueType()) == +            BooleanContent::ZeroOrNegativeOneBooleanContent) { +      // If we're testing X < 0, then this compare isn't needed - just use X! +      // FIXME: We're limiting to integer types here, but this should also work +      // if we don't care about FP signed-zero. The use of SETLT with FP means +      // that we don't care about NaNs. +      if (CC == ISD::SETLT && Op1.getValueType().isInteger() && +          (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode()))) +        return TLO.CombineTo(Op, Op0); + +      // TODO: Should we check for other forms of sign-bit comparisons? +      // Examples: X <= -1, X >= 0 +    } +    if (getBooleanContents(Op0.getValueType()) == +            TargetLowering::ZeroOrOneBooleanContent && +        BitWidth > 1) +      KnownZero.setBitsFrom(1); +    break; +  }    case ISD::SHL:      if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {        unsigned ShAmt = SA->getZExtValue(); @@ -834,7 +882,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        KnownZero <<= SA->getZExtValue();        KnownOne  <<= SA->getZExtValue();        // low bits known zero. -      KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue()); +      KnownZero.setLowBits(SA->getZExtValue());      }      break;    case ISD::SRL: @@ -853,7 +901,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        // If the shift is exact, then it does demand the low bits (and knows that        // they are zero).        if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) -        InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); +        InDemandedMask.setLowBits(ShAmt);        // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a        // single shift.  We can do this if the top bits (which are shifted out) @@ -884,8 +932,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        KnownZero = KnownZero.lshr(ShAmt);        KnownOne  = KnownOne.lshr(ShAmt); -      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); -      KnownZero |= HighBits;  // High bits known zero. +      KnownZero.setHighBits(ShAmt);  // High bits known zero.      }      break;    case ISD::SRA: @@ -911,7 +958,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        // If the shift is exact, then it does demand the low bits (and knows that        // they are zero).        
if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) -        InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); +        InDemandedMask.setLowBits(ShAmt);        // If any of the demanded bits are produced by the sign extension, we also        // demand the input sign bit. @@ -1075,7 +1122,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,      EVT InVT = Op.getOperand(0).getValueType();      unsigned InBits = InVT.getScalarSizeInBits();      APInt InMask    = APInt::getLowBitsSet(BitWidth, InBits); -    APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits); +    APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1);      APInt NewBits   = ~InMask & NewMask;      // If none of the top bits are demanded, convert this into an any_extend. @@ -1191,7 +1238,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,        return true;      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); -    KnownZero |= ~InMask & NewMask; +    KnownZero |= ~InMask;      break;    }    case ISD::BITCAST: @@ -1281,6 +1328,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,  void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,                                                     APInt &KnownZero,                                                     APInt &KnownOne, +                                                   const APInt &DemandedElts,                                                     const SelectionDAG &DAG,                                                     unsigned Depth) const {    assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1295,6 +1343,7 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,  /// This method can be implemented by targets that want to expose additional  /// information about sign bits to the DAG Combiner.  unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, +                                                         const APInt &,                                                           const SelectionDAG &,                                                           unsigned Depth) const {    assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -2050,6 +2099,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,      if (Cond == ISD::SETO || Cond == ISD::SETUO)        return DAG.getSetCC(dl, VT, N0, N0, Cond); +    // setcc (fneg x), C -> setcc swap(pred) x, -C +    if (N0.getOpcode() == ISD::FNEG) { +      ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond); +      if (DCI.isBeforeLegalizeOps() || +          isCondCodeLegal(SwapCond, N0.getSimpleValueType())) { +        SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1); +        return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond); +      } +    } +      // If the condition is not legal, see if we can find an equivalent one      // which is legal.      if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) { @@ -2470,10 +2529,7 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,      std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));    // Figure out which register class contains this reg. -  for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), -       E = RI->regclass_end(); RCI != E; ++RCI) { -    const TargetRegisterClass *RC = *RCI; - +  for (const TargetRegisterClass *RC : RI->regclasses()) {      // If none of the value types for this register class are valid, we      // can't use it.  
For example, 64-bit reg classes on 32-bit targets.      if (!isLegalRC(RC)) @@ -2933,7 +2989,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,  SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,                                        SelectionDAG &DAG,                                        std::vector<SDNode *> *Created) const { -  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); +  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();    const TargetLowering &TLI = DAG.getTargetLoweringInfo();    if (TLI.isIntDivCheap(N->getValueType(0), Attr))      return SDValue(N,0); // Lower SDIV as SDIV @@ -3808,7 +3864,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,    TargetLowering::CallLoweringInfo CLI(DAG);    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); -  CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); +  CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 209bbe54ea23..ab578df4069d 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -64,6 +64,7 @@ public:  private:    bool setupEntryBlockAndCallSites(Function &F); +  bool undoSwiftErrorSelect(Function &F);    void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);    Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);    void lowerIncomingArguments(Function &F); @@ -174,8 +175,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,    // because the value needs to be added to the global context list.    auto &DL = F.getParent()->getDataLayout();    unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); -  FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", -                           &EntryBB->front()); +  FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), +                           nullptr, Align, "fn_context", &EntryBB->front());    // Fill in the function context structure.    for (LandingPadInst *LPI : LPads) { @@ -458,14 +459,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {    return true;  } +bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) { +  // We have inserted dummy copies 'select true, arg, undef' in the entry block +  // for arguments to simplify this pass. +  // swifterror arguments cannot be used in this way. Undo the select for the +  // swifterror argument. 
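+  // Find the swifterror argument, replace its single select user with the
+  // argument itself, and delete the select.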
+  for (auto &AI : F.args()) { +    if (AI.isSwiftError()) { +      assert(AI.hasOneUse() && "Must have converted the argument to a select"); +      auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser()); +      assert(Select && "There must be single select user"); +      auto *OrigSwiftError = cast<Argument>(Select->getTrueValue()); +      Select->replaceAllUsesWith(OrigSwiftError); +      Select->eraseFromParent(); +      return true; +    } +  } +  return false; +} +  bool SjLjEHPrepare::runOnFunction(Function &F) {    Module &M = *F.getParent();    RegisterFn = M.getOrInsertFunction(        "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), -      PointerType::getUnqual(FunctionContextTy), nullptr); +      PointerType::getUnqual(FunctionContextTy));    UnregisterFn = M.getOrInsertFunction(        "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), -      PointerType::getUnqual(FunctionContextTy), nullptr); +      PointerType::getUnqual(FunctionContextTy));    FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);    StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);    StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); @@ -476,5 +496,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {    FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);    bool Res = setupEntryBlockAndCallSites(F); +  if (Res) +    Res |= undoSwiftErrorSelect(F);    return Res;  } diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index dba103e9bfb1..bc2a1d09056b 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {    return false;  } +void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) { +  assert(!MI.isBundledWithPred() && +         "Use removeSingleMachineInstrFromMaps() instread"); +  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); +  if (mi2iItr == mi2iMap.end()) +    return; + +  SlotIndex MIIndex = mi2iItr->second; +  IndexListEntry &MIEntry = *MIIndex.listEntry(); +  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); +  mi2iMap.erase(mi2iItr); +  // FIXME: Eventually we want to actually delete these indexes. +  MIEntry.setInstr(nullptr); +} + +void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) { +  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); +  if (mi2iItr == mi2iMap.end()) +    return; + +  SlotIndex MIIndex = mi2iItr->second; +  IndexListEntry &MIEntry = *MIIndex.listEntry(); +  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); +  mi2iMap.erase(mi2iItr); + +  // When removing the first instruction of a bundle update mapping to next +  // instruction. +  if (MI.isBundledWithSucc()) { +    // Only the first instruction of a bundle should have an index assigned. +    assert(!MI.isBundledWithPred() && "Should have first bundle isntruction"); + +    MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator()); +    MachineInstr &NextMI = *Next; +    MIEntry.setInstr(&NextMI); +    mi2iMap.insert(std::make_pair(&NextMI, MIIndex)); +    return; +  } else { +    // FIXME: Eventually we want to actually delete these indexes. +    MIEntry.setInstr(nullptr); +  } +} +  void SlotIndexes::renumberIndexes() {    // Renumber updates the index of every element of the index list.    
DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n"); diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 1c6a84e53944..3a50aaa69985 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -23,6 +23,7 @@  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/VirtRegMap.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetMachine.h" @@ -487,12 +488,126 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {    VFP = ValueForcePair(nullptr, true);  } +SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, +    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, +    unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { +  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); +  bool FirstCopy = !Def.isValid(); +  MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) +      .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) +              | getInternalReadRegState(!FirstCopy), SubIdx) +      .addReg(FromReg, 0, SubIdx); + +  BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); +  if (FirstCopy) { +    SlotIndexes &Indexes = *LIS.getSlotIndexes(); +    Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); +  } else { +    CopyMI->bundleWithPred(); +  } +  LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx); +  DestLI.refineSubRanges(Allocator, LaneMask, +                         [Def, &Allocator](LiveInterval::SubRange& SR) { +    SR.createDeadDef(Def, Allocator); +  }); +  return Def; +} + +SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg, +    LaneBitmask LaneMask, MachineBasicBlock &MBB, +    MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { +  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); +  if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { +    // The full vreg is copied. +    MachineInstr *CopyMI = +        BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg); +    SlotIndexes &Indexes = *LIS.getSlotIndexes(); +    return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); +  } + +  // Only a subset of lanes needs to be copied. The following is a simple +  // heuristic to construct a sequence of COPYs. We could add a target +  // specific callback if this turns out to be suboptimal. +  LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx)); + +  // First pass: Try to find a perfectly matching subregister index. If none +  // exists find the one covering the most lanemask bits. +  SmallVector<unsigned, 8> PossibleIndexes; +  unsigned BestIdx = 0; +  unsigned BestCover = 0; +  const TargetRegisterClass *RC = MRI.getRegClass(FromReg); +  assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class"); +  for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) { +    // Is this index even compatible with the given class? +    if (TRI.getSubClassWithSubReg(RC, Idx) != RC) +      continue; +    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); +    // Early exit if we found a perfect match. +    if (SubRegMask == LaneMask) { +      BestIdx = Idx; +      break; +    } + +    // The index must not cover any lanes outside \p LaneMask. 
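+    // Otherwise the COPY would write lanes that were never requested.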
+    if ((SubRegMask & ~LaneMask).any()) +      continue; + +    unsigned PopCount = countPopulation(SubRegMask.getAsInteger()); +    PossibleIndexes.push_back(Idx); +    if (PopCount > BestCover) { +      BestCover = PopCount; +      BestIdx = Idx; +    } +  } + +  // Abort if we cannot possibly implement the COPY with the given indexes. +  if (BestIdx == 0) +    report_fatal_error("Impossible to implement partial COPY"); + +  SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, +                                        BestIdx, DestLI, Late, SlotIndex()); + +  // Greedy heuristic: Keep iterating keeping the best covering subreg index +  // each time. +  LaneBitmask LanesLeft = +      LaneMask & ~(TRI.getSubRegIndexLaneMask(BestCover)); +  while (LanesLeft.any()) { +    unsigned BestIdx = 0; +    int BestCover = INT_MIN; +    for (unsigned Idx : PossibleIndexes) { +      LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); +      // Early exit if we found a perfect match. +      if (SubRegMask == LanesLeft) { +        BestIdx = Idx; +        break; +      } + +      // Try to cover as much of the remaining lanes as possible but +      // as few of the already covered lanes as possible. +      int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger()) +                - countPopulation((SubRegMask & ~LanesLeft).getAsInteger()); +      if (Cover > BestCover) { +        BestCover = Cover; +        BestIdx = Idx; +      } +    } + +    if (BestIdx == 0) +      report_fatal_error("Impossible to implement partial COPY"); + +    buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, +                          DestLI, Late, Def); +    LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx); +  } + +  return Def; +} +  VNInfo *SplitEditor::defFromParent(unsigned RegIdx,                                     VNInfo *ParentVNI,                                     SlotIndex UseIdx,                                     MachineBasicBlock &MBB,                                     MachineBasicBlock::iterator I) { -  MachineInstr *CopyMI = nullptr;    SlotIndex Def;    LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -505,24 +620,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,    LiveInterval &OrigLI = LIS.getInterval(Original);    VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); +  unsigned Reg = LI->reg;    bool DidRemat = false;    if (OrigVNI) {      LiveRangeEdit::Remat RM(ParentVNI);      RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);      if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) { -      Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late); +      Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);        ++NumRemats;        DidRemat = true;      }    }    if (!DidRemat) { -    // Can't remat, just insert a copy from parent. -    CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg) -               .addReg(Edit->getReg()); -    Def = LIS.getSlotIndexes() -              ->insertMachineInstrInMaps(*CopyMI, Late) -              .getRegSlot(); +    LaneBitmask LaneMask; +    if (LI->hasSubRanges()) { +      LaneMask = LaneBitmask::getNone(); +      for (LiveInterval::SubRange &S : LI->subranges()) +        LaneMask |= S.LaneMask; +    } else { +      LaneMask = LaneBitmask::getAll(); +    } +      ++NumCopies; +    Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);    }    // Define the value in Reg. 
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index a75738aaf446..9d409e924a3d 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -405,6 +405,17 @@ private:    /// deleteRematVictims - Delete defs that are dead after rematerializing.    void deleteRematVictims(); +  /// Add a copy instruction copying \p FromReg to \p ToReg before +  /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it +  /// necessary to construct a sequence of copies to cover it exactly. +  SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask, +      MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, +      bool Late, unsigned RegIdx); + +  SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, +      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, +      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy); +  public:    /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.    /// Newly created intervals will be appended to newIntervals. diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 89c4b574f17f..f51d959a089a 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -23,7 +23,6 @@  #include "llvm/ADT/BitVector.h"  #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" @@ -385,14 +384,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {    MachineFunctionPass::getAnalysisUsage(AU);  } -#ifndef NDEBUG - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)  LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,                                              const BitVector &BV) const { -  DEBUG(dbgs() << tag << " : { "); +  dbgs() << tag << " : { ";    for (unsigned I = 0, E = BV.size(); I != E; ++I) -    DEBUG(dbgs() << BV.test(I) << " "); -  DEBUG(dbgs() << "}\n"); +    dbgs() << BV.test(I) << " "; +  dbgs() << "}\n";  }  LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { @@ -408,20 +406,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {  LLVM_DUMP_METHOD void StackColoring::dump() const {    for (MachineBasicBlock *MBB : depth_first(MF)) { -    DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " [" -                 << MBB->getName() << "]\n"); -    DEBUG(dumpBB(MBB)); +    dbgs() << "Inspecting block #" << MBB->getNumber() << " [" +           << MBB->getName() << "]\n"; +    dumpBB(MBB);    }  }  LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {    for (unsigned I = 0, E = Intervals.size(); I != E; ++I) { -    DEBUG(dbgs() << "Interval[" << I << "]:\n"); -    DEBUG(Intervals[I]->dump()); +    dbgs() << "Interval[" << I << "]:\n"; +    Intervals[I]->dump();    }  } - -#endif // not NDEBUG +#endif  static inline int getStartOrEndSlot(const MachineInstr &MI)  { @@ -570,9 +567,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot)    // Step 2: compute begin/end sets for each block -  // NOTE: We use a reverse-post-order iteration to ensure that we obtain a -  // deterministic numbering, and because we'll need a post-order iteration -  // later for solving the liveness dataflow problem. +  // NOTE: We use a depth-first iteration to ensure that we obtain a +  // deterministic numbering.    
for (MachineBasicBlock *MBB : depth_first(MF)) {      // Assign a serial number to this basic block. diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp index 9b7dd400fc92..1a8ec5bff322 100644 --- a/lib/CodeGen/StackMaps.cpp +++ b/lib/CodeGen/StackMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- StackMaps.cpp ---------------------------===// +//===- StackMaps.cpp ------------------------------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -7,23 +7,34 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackMaps.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h"  #include "llvm/CodeGen/AsmPrinter.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/StackMaps.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCExpr.h"  #include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCRegisterInfo.h"  #include "llvm/MC/MCStreamer.h"  #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetOpcodes.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint>  #include <iterator> +#include <utility>  using namespace llvm; @@ -276,7 +287,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {    }    LiveOuts.erase( -      remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }), +      llvm::remove_if(LiveOuts, +                      [](const LiveOutReg &LO) { return LO.Reg == 0; }),        LiveOuts.end());    return LiveOuts; @@ -286,7 +298,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,                                      MachineInstr::const_mop_iterator MOI,                                      MachineInstr::const_mop_iterator MOE,                                      bool recordResult) { -    MCContext &OutContext = AP.OutStreamer->getContext();    MCSymbol *MILabel = OutContext.createTempSymbol();    AP.OutStreamer->EmitLabel(MILabel); @@ -378,6 +389,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {    }  #endif  } +  void StackMaps::recordStatepoint(const MachineInstr &MI) {    assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint"); diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index c2c010a29d44..a8aafe78748d 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -1,4 +1,4 @@ -//===-- StackProtector.cpp - Stack Protector Insertion --------------------===// +//===- StackProtector.cpp - Stack Protector Insertion ---------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,30 +14,38 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackProtector.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/BranchProbabilityInfo.h"  #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" +#include 
"llvm/Analysis/OptimizationDiagnosticInfo.h"  #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/StackProtector.h"  #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h"  #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Intrinsics.h"  #include "llvm/IR/MDBuilder.h"  #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h"  #include "llvm/Target/TargetSubtargetInfo.h" -#include <cstdlib> +#include <utility> +  using namespace llvm;  #define DEBUG_TYPE "stack-protector" @@ -51,7 +59,7 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",  char StackProtector::ID = 0;  INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors", -                false, true) +                   false, true)  FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {    return new StackProtector(TM); @@ -222,7 +230,16 @@ bool StackProtector::RequiresStackProtector() {    if (F->hasFnAttribute(Attribute::SafeStack))      return false; +  // We are constructing the OptimizationRemarkEmitter on the fly rather than +  // using the analysis pass to avoid building DominatorTree and LoopInfo which +  // are not available this late in the IR pipeline. +  OptimizationRemarkEmitter ORE(F); +    if (F->hasFnAttribute(Attribute::StackProtectReq)) { +    ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F) +             << "Stack protection applied to function " +             << ore::NV("Function", F) +             << " due to a function attribute or command-line switch");      NeedsProtector = true;      Strong = true; // Use the same heuristic as strong to determine SSPLayout    } else if (F->hasFnAttribute(Attribute::StackProtectStrong)) @@ -236,20 +253,29 @@ bool StackProtector::RequiresStackProtector() {      for (const Instruction &I : BB) {        if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {          if (AI->isArrayAllocation()) { +          OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray", +                                    &I); +          Remark +              << "Stack protection applied to function " +              << ore::NV("Function", F) +              << " due to a call to alloca or use of a variable length array";            if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {              if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {                // A call to alloca with size >= SSPBufferSize requires                // stack protectors.                Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); +              ORE.emit(Remark);                NeedsProtector = true;              } else if (Strong) {                // Require protectors for all alloca calls in strong mode.                
Layout.insert(std::make_pair(AI, SSPLK_SmallArray)); +              ORE.emit(Remark);                NeedsProtector = true;              }            } else {              // A call to alloca with a variable size requires protectors.              Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); +            ORE.emit(Remark);              NeedsProtector = true;            }            continue; @@ -259,6 +285,11 @@ bool StackProtector::RequiresStackProtector() {          if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {            Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray                                                     : SSPLK_SmallArray)); +          ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I) +                   << "Stack protection applied to function " +                   << ore::NV("Function", F) +                   << " due to a stack allocated buffer or struct containing a " +                      "buffer");            NeedsProtector = true;            continue;          } @@ -266,6 +297,11 @@ bool StackProtector::RequiresStackProtector() {          if (Strong && HasAddressTaken(AI)) {            ++NumAddrTaken;            Layout.insert(std::make_pair(AI, SSPLK_AddrOf)); +          ORE.emit( +              OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I) +              << "Stack protection applied to function " +              << ore::NV("Function", F) +              << " due to the address of a local variable being taken");            NeedsProtector = true;          }        } @@ -448,13 +484,13 @@ BasicBlock *StackProtector::CreateFailBB() {      Constant *StackChkFail =          M->getOrInsertFunction("__stack_smash_handler",                                 Type::getVoidTy(Context), -                               Type::getInt8PtrTy(Context), nullptr); +                               Type::getInt8PtrTy(Context));      B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));    } else {      Constant *StackChkFail = -        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context), -                               nullptr); +        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); +      B.CreateCall(StackChkFail, {});    }    B.CreateUnreachable(); diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp index 7709236bbaa8..d2414200e9d5 100644 --- a/lib/CodeGen/TailDuplicator.cpp +++ b/lib/CodeGen/TailDuplicator.cpp @@ -725,6 +725,7 @@ bool TailDuplicator::duplicateSimpleBB(      if (PredTBB == NextBB && PredFBB == nullptr)        PredTBB = nullptr; +    auto DL = PredBB->findBranchDebugLoc();      TII->removeBranch(*PredBB);      if (!PredBB->isSuccessor(NewTarget)) @@ -735,7 +736,7 @@ bool TailDuplicator::duplicateSimpleBB(      }      if (PredTBB) -      TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); +      TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);      TDBBs.push_back(PredBB);    } diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index f082add8c7dd..e5def6752e07 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -73,7 +73,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,      return;    // Get the callee saved register list... 
-  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); +  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();    // Early exit if there are no callee saved registers.    if (!CSRegs || CSRegs[0] == 0) diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 01f91b96b58a..711144a34743 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,    // No need to fold return, the meta data, and function arguments    for (unsigned i = 0; i < StartIdx; ++i) -    MIB.addOperand(MI.getOperand(i)); +    MIB.add(MI.getOperand(i));    for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) {      MachineOperand &MO = MI.getOperand(i); @@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,        MIB.addImm(SpillOffset);      }      else -      MIB.addOperand(MO); +      MIB.add(MO);    }    return NewMI;  } @@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {    unsigned FrameSetupOpcode = getCallFrameSetupOpcode();    unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); -  if (MI.getOpcode() != FrameSetupOpcode && -      MI.getOpcode() != FrameDestroyOpcode) +  if (!isFrameInstr(MI))      return 0; -  int SPAdj = MI.getOperand(0).getImm(); -  SPAdj = TFI->alignSPAdjust(SPAdj); +  int SPAdj = TFI->alignSPAdjust(getFrameSize(MI));    if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||        (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode)) diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 003311b157fc..27630a3055cb 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {    HasExtractBitsInsn = false;    JumpIsExpensive = JumpIsExpensiveOverride;    PredictableSelectIsExpensive = false; -  MaskAndBranchFoldingIsLegal = false;    EnableExtLdPromotion = false;    HasFloatingPointExceptions = true;    StackPointerRegisterToSaveRestore = 0; @@ -851,7 +850,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {    MinFunctionAlignment = 0;    PrefFunctionAlignment = 0;    PrefLoopAlignment = 0; -  GatherAllAliasesMaxDepth = 6; +  GatherAllAliasesMaxDepth = 18;    MinStackArgumentAlignment = 1;    // TODO: the default will be switched to 0 in the next commit, along    // with the Target-specific changes necessary. @@ -901,6 +900,7 @@ void TargetLoweringBase::initActions() {      setOperationAction(ISD::SMAX, VT, Expand);      setOperationAction(ISD::UMIN, VT, Expand);      setOperationAction(ISD::UMAX, VT, Expand); +    setOperationAction(ISD::ABS, VT, Expand);      // Overflow operations default to expand      setOperationAction(ISD::SADDO, VT, Expand); @@ -1227,7 +1227,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,      // Copy operands before the frame-index.      for (unsigned i = 0; i < OperIdx; ++i) -      MIB.addOperand(MI->getOperand(i)); +      MIB.add(MI->getOperand(i));      // Add frame index operands recognized by stackmaps.cpp      if (MFI.isStatepointSpillSlotObjectIndex(FI)) {        // indirect-mem-ref tag, size, #FI, offset. 
@@ -1237,18 +1237,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,        assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");        MIB.addImm(StackMaps::IndirectMemRefOp);        MIB.addImm(MFI.getObjectSize(FI)); -      MIB.addOperand(MI->getOperand(OperIdx)); +      MIB.add(MI->getOperand(OperIdx));        MIB.addImm(0);      } else {        // direct-mem-ref tag, #FI, offset.        // Used by patchpoint, and direct alloca arguments to statepoints        MIB.addImm(StackMaps::DirectMemRefOp); -      MIB.addOperand(MI->getOperand(OperIdx)); +      MIB.add(MI->getOperand(OperIdx));        MIB.addImm(0);      }      // Copy the operands after the frame index.      for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) -      MIB.addOperand(MI->getOperand(i)); +      MIB.add(MI->getOperand(i));      // Inherit previous memory operands.      MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -1589,7 +1589,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT  /// type of the given function.  This does not require a DAG or a return value,  /// and is suitable for use before any DAGs for the function are constructed.  /// TODO: Move this out of TargetLowering.cpp. -void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, +void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,                           SmallVectorImpl<ISD::OutputArg> &Outs,                           const TargetLowering &TLI, const DataLayout &DL) {    SmallVector<EVT, 4> ValueVTs; @@ -1601,9 +1601,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,      EVT VT = ValueVTs[j];      ISD::NodeType ExtendKind = ISD::ANY_EXTEND; -    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) +    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))        ExtendKind = ISD::SIGN_EXTEND; -    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) +    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))        ExtendKind = ISD::ZERO_EXTEND;      // FIXME: C calling convention requires the return type to be promoted to @@ -1621,13 +1621,13 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,      // 'inreg' on function refers to return value      ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); -    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) +    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))        Flags.setInReg();      // Propagate extension type if any -    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) +    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))        Flags.setSExt(); -    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) +    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))        Flags.setZExt();      for (unsigned i = 0; i < NumParts; ++i) @@ -1818,7 +1818,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {    Module *M = IRB.GetInsertBlock()->getParent()->getParent();    Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());    Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", -                                     StackPtrTy->getPointerTo(0), nullptr); +                                     StackPtrTy->getPointerTo(0));    return IRB.CreateCall(Fn);  } @@ -1918,11 +1918,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {  /// override the 
target defaults.  static StringRef getRecipEstimateForFunc(MachineFunction &MF) {    const Function *F = MF.getFunction(); -  StringRef RecipAttrName = "reciprocal-estimates"; -  if (!F->hasFnAttribute(RecipAttrName)) -    return StringRef(); - -  return F->getFnAttribute(RecipAttrName).getValueAsString(); +  return F->getFnAttribute("reciprocal-estimates").getValueAsString();  }  /// Construct a string for the given reciprocal operation of the given type. diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index eb2a28f574a5..34892680aceb 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===// +//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===//  //  //                     The LLVM Compiler Infrastructure  // @@ -12,36 +12,52 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"  #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h"  #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineModuleInfo.h"  #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/Comdat.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h"  #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCExpr.h"  #include "llvm/MC/MCSectionCOFF.h"  #include "llvm/MC/MCSectionELF.h"  #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionWasm.h"  #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h"  #include "llvm/MC/MCSymbolELF.h"  #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h"  #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h"  #include "llvm/Support/COFF.h"  #include "llvm/Support/Dwarf.h"  #include "llvm/Support/ELF.h"  #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h"  #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <string> +  using namespace llvm;  using namespace dwarf; @@ -53,10 +69,10 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(      const GlobalValue *GV, const TargetMachine &TM,      MachineModuleInfo *MMI) const {    unsigned Encoding = getPersonalityEncoding(); -  if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect) +  if ((Encoding & 0x80) == DW_EH_PE_indirect)      return getContext().getOrCreateSymbol(StringRef("DW.ref.") +                                            TM.getSymbol(GV)->getName()); -  if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr) +  if ((Encoding & 0x70) == DW_EH_PE_absptr)      return TM.getSymbol(GV);    report_fatal_error("We do not support this DWARF encoding yet!");  } @@ -86,8 +102,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(  const MCExpr 
*TargetLoweringObjectFileELF::getTTypeGlobalReference(      const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,      MachineModuleInfo *MMI, MCStreamer &Streamer) const { - -  if (Encoding & dwarf::DW_EH_PE_indirect) { +  if (Encoding & DW_EH_PE_indirect) {      MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();      MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM); @@ -102,7 +117,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(      return TargetLoweringObjectFile::        getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), -                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); +                        Encoding & ~DW_EH_PE_indirect, Streamer);    }    return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -117,8 +132,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {    // section(".eh_frame") gcc will produce:    //    //   .section   .eh_frame,"a",@progbits -   -  if (Name == getInstrProfCoverageSectionName(false)) + +  if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF, +                                      /*AddSegmentInfo=*/false))      return SectionKind::getMetadata();    if (Name.empty() || Name[0] != '.') return K; @@ -149,7 +165,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {    return K;  } -  static unsigned getELFSectionType(StringRef Name, SectionKind K) {    // Use SHT_NOTE for section whose name starts with ".note" to allow    // emitting ELF notes from C variable declaration. @@ -211,6 +226,20 @@ static const Comdat *getELFComdat(const GlobalValue *GV) {    return C;  } +static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, +                                              const TargetMachine &TM) { +  MDNode *MD = GO->getMetadata(LLVMContext::MD_associated); +  if (!MD) +    return nullptr; + +  auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0)); +  if (!VM) +    report_fatal_error("MD_associated operand is not ValueAsMetadata"); + +  GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue()); +  return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr; +} +  MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(      const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {    StringRef SectionName = GO->getSection(); @@ -224,9 +253,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(      Group = C->getName();      Flags |= ELF::SHF_GROUP;    } -  return getContext().getELFSection(SectionName, -                                    getELFSectionType(SectionName, Kind), Flags, -                                    /*EntrySize=*/0, Group); + +  // A section can have at most one associated section. Put each global with +  // MD_associated in a unique section. +  unsigned UniqueID = MCContext::GenericSectionID; +  const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); +  if (AssociatedSymbol) { +    UniqueID = NextUniqueID++; +    Flags |= ELF::SHF_LINK_ORDER; +  } + +  MCSectionELF *Section = getContext().getELFSection( +      SectionName, getELFSectionType(SectionName, Kind), Flags, +      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); +  // Make sure that we did not get some other section with incompatible sh_link. +  // This should not be possible due to UniqueID code above. 
+  assert(Section->getAssociatedSymbol() == AssociatedSymbol); +  return Section;  }  /// Return the section prefix name used by options FunctionsSections and @@ -248,11 +291,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {    return ".data.rel.ro";  } -static MCSectionELF * -selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, -                          SectionKind Kind, Mangler &Mang, -                          const TargetMachine &TM, bool EmitUniqueSection, -                          unsigned Flags, unsigned *NextUniqueID) { +static MCSectionELF *selectELFSectionForGlobal( +    MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, +    const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, +    unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) {    unsigned EntrySize = 0;    if (Kind.isMergeableCString()) {      if (Kind.isMergeable2ByteCString()) { @@ -319,7 +361,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,    if (Kind.isExecuteOnly())      UniqueID = 0;    return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags, -                           EntrySize, Group, UniqueID); +                           EntrySize, Group, UniqueID, AssociatedSymbol);  }  MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( @@ -337,8 +379,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(    }    EmitUniqueSection |= GO->hasComdat(); -  return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, -                                   EmitUniqueSection, Flags, &NextUniqueID); +  const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); +  if (AssociatedSymbol) { +    EmitUniqueSection = true; +    Flags |= ELF::SHF_LINK_ORDER; +  } + +  MCSectionELF *Section = selectELFSectionForGlobal( +      getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags, +      &NextUniqueID, AssociatedSymbol); +  assert(Section->getAssociatedSymbol() == AssociatedSymbol); +  return Section;  }  MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( @@ -351,8 +402,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(      return ReadOnlySection;    return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(), -                                   getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC, -                                   &NextUniqueID); +                                   getMangler(), TM, EmitUniqueSection, +                                   ELF::SHF_ALLOC, &NextUniqueID, +                                   /* AssociatedSymbol */ nullptr);  }  bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( @@ -723,7 +775,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(      return TargetLoweringObjectFile::        getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), -                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); +                        Encoding & ~DW_EH_PE_indirect, Streamer);    }    return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -1122,33 +1174,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(  void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(      raw_ostream &OS, const GlobalValue *GV) const { -  if (!GV->hasDLLExportStorageClass() || GV->isDeclaration()) -    return; +  emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler()); +} -  const Triple &TT = 
getTargetTriple(); +//===----------------------------------------------------------------------===// +//                                  Wasm +//===----------------------------------------------------------------------===// -  if (TT.isKnownWindowsMSVCEnvironment()) -    OS << " /EXPORT:"; -  else -    OS << " -export:"; - -  if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) { -    std::string Flag; -    raw_string_ostream FlagOS(Flag); -    getMangler().getNameWithPrefix(FlagOS, GV, false); -    FlagOS.flush(); -    if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix()) -      OS << Flag.substr(1); -    else -      OS << Flag; -  } else { -    getMangler().getNameWithPrefix(OS, GV, false); +static const Comdat *getWasmComdat(const GlobalValue *GV) { +  const Comdat *C = GV->getComdat(); +  if (!C) +    return nullptr; + +  if (C->getSelectionKind() != Comdat::Any) +    report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" + +                       C->getName() + "' cannot be lowered."); + +  return C; +} + +MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal( +    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { +  llvm_unreachable("getExplicitSectionGlobal not yet implemented"); +  return nullptr; +} + +static MCSectionWasm * +selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, +                           SectionKind Kind, Mangler &Mang, +                           const TargetMachine &TM, bool EmitUniqueSection, +                           unsigned Flags, unsigned *NextUniqueID) { +  StringRef Group = ""; +  if (getWasmComdat(GO)) +    llvm_unreachable("comdat not yet supported for wasm"); + +  bool UniqueSectionNames = TM.getUniqueSectionNames(); +  SmallString<128> Name = getSectionPrefixForGlobal(Kind); + +  if (const auto *F = dyn_cast<Function>(GO)) { +    const auto &OptionalPrefix = F->getSectionPrefix(); +    if (OptionalPrefix) +      Name += *OptionalPrefix;    } -  if (!GV->getValueType()->isFunctionTy()) { -    if (TT.isKnownWindowsMSVCEnvironment()) -      OS << ",DATA"; -    else -      OS << ",data"; +  if (EmitUniqueSection && UniqueSectionNames) { +    Name.push_back('.'); +    TM.getNameWithPrefix(Name, GO, Mang, true); +  } +  unsigned UniqueID = MCContext::GenericSectionID; +  if (EmitUniqueSection && !UniqueSectionNames) { +    UniqueID = *NextUniqueID; +    (*NextUniqueID)++;    } +  return Ctx.getWasmSection(Name, /*Type=*/0, Flags, +                            Group, UniqueID); +} + +MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal( +    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + +  if (Kind.isCommon()) +    report_fatal_error("mergable sections not supported yet on wasm"); + +  // If we have -ffunction-section or -fdata-section then we should emit the +  // global value to a uniqued section specifically for it. 
+  bool EmitUniqueSection = false; +  if (Kind.isText()) +    EmitUniqueSection = TM.getFunctionSections(); +  else +    EmitUniqueSection = TM.getDataSections(); +  EmitUniqueSection |= GO->hasComdat(); + +  return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, +                                    EmitUniqueSection, /*Flags=*/0, +                                    &NextUniqueID); +} + +bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection( +    bool UsesLabelDifference, const Function &F) const { +  // We can always create relative relocations, so use another section +  // that can be marked non-executable. +  return false; +} + +const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( +    const GlobalValue *LHS, const GlobalValue *RHS, +    const TargetMachine &TM) const { +  // We may only use a PLT-relative relocation to refer to unnamed_addr +  // functions. +  if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) +    return nullptr; + +  // Basic sanity checks. +  if (LHS->getType()->getPointerAddressSpace() != 0 || +      RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || +      RHS->isThreadLocal()) +    return nullptr; + +  return MCBinaryExpr::createSub( +      MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None, +                              getContext()), +      MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext()); +} + +void +TargetLoweringObjectFileWasm::InitializeWasm() { +  // TODO: Initialize StaticCtorSection and StaticDtorSection.  } diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp index b6da8e0aa60d..c20d5ab814f8 100644 --- a/lib/CodeGen/TargetOptionsImpl.cpp +++ b/lib/CodeGen/TargetOptionsImpl.cpp @@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {    return false;  } -/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option -/// is specified on the command line.  When this flag is off(default), the -/// code generator is not allowed to generate mad (multiply add) if the -/// result is "less precise" than doing those operations individually. -bool TargetOptions::LessPreciseFPMAD() const { -  return UnsafeFPMath || LessPreciseFPMADOption; -} -  /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume  /// that the rounding mode of the FPU can change from its default.  
bool TargetOptions::HonorSignDependentRoundingFPMath() const { diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index e7ea2b4563f9..150195f5f85b 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -92,6 +92,9 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,      cl::desc("Verify generated machine code"),      cl::init(false),      cl::ZeroOrMore); +static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner", +    cl::Hidden, +    cl::desc("Enable machine outliner"));  static cl::opt<std::string>  PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, @@ -261,7 +264,8 @@ TargetPassConfig::~TargetPassConfig() {  TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)      : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false),        AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), -      DisableVerify(false), EnableTailMerge(true) { +      DisableVerify(false), EnableTailMerge(true), +      RequireCodeGenSCCOrder(false) {    Impl = new PassConfigImpl(); @@ -279,6 +283,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)    if (StringRef(PrintMachineInstrs.getValue()).equals(""))      TM->Options.PrintMachineCode = true; + +  if (TM->Options.EnableIPRA) +    setRequiresCodeGenSCCOrder();  }  CodeGenOpt::Level TargetPassConfig::getOptLevel() const { @@ -531,7 +538,7 @@ void TargetPassConfig::addISelPrepare() {    addPreISel();    // Force codegen to run according to the callgraph. -  if (TM->Options.EnableIPRA) +  if (requiresCodeGenSCCOrder())      addPass(new DummyCGSCCPass);    // Add both the safe stack and the stack protection passes: each of them will @@ -668,9 +675,15 @@ void TargetPassConfig::addMachinePasses() {    addPass(&StackMapLivenessID, false);    addPass(&LiveDebugValuesID, false); +  // Insert before XRay Instrumentation. +  addPass(&FEntryInserterID, false); +    addPass(&XRayInstrumentationID, false);    addPass(&PatchableFunctionID, false); +  if (EnableMachineOutliner) +    PM->add(createMachineOutlinerPass()); +    AddingMachinePasses = false;  } @@ -704,6 +717,10 @@ void TargetPassConfig::addMachineSSAOptimization() {    addPass(&MachineLICMID, false);    addPass(&MachineCSEID, false); + +  // Coalesce basic blocks with the same branch condition +  addPass(&BranchCoalescingID); +    addPass(&MachineSinkingID);    addPass(&PeepholeOptimizerID); @@ -730,7 +747,7 @@ MachinePassRegistry RegisterRegAlloc::Registry;  /// A dummy default pass factory indicates whether the register allocator is  /// overridden on the command line. 
-LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag); +static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;  static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }  static RegisterRegAlloc  defaultRegAlloc("default", @@ -903,6 +920,11 @@ void TargetPassConfig::addBlockPlacement() {  //===---------------------------------------------------------------------===//  /// GlobalISel Configuration  //===---------------------------------------------------------------------===// + +bool TargetPassConfig::isGlobalISelEnabled() const { +  return false; +} +  bool TargetPassConfig::isGlobalISelAbortEnabled() const {    return EnableGlobalISelAbort == 1;  } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index cd50c5b6571d..66cdad278e8d 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -155,8 +155,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {    // Pick the most sub register class of the right type that contains    // this physreg.    const TargetRegisterClass* BestRC = nullptr; -  for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){ -    const TargetRegisterClass* RC = *I; +  for (const TargetRegisterClass* RC : regclasses()) {      if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&          (!BestRC || BestRC->hasSubClass(RC)))        BestRC = RC; @@ -185,10 +184,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,      if (SubClass)        getAllocatableSetForRC(MF, SubClass, Allocatable);    } else { -    for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), -         E = regclass_end(); I != E; ++I) -      if ((*I)->isAllocatable()) -        getAllocatableSetForRC(MF, *I, Allocatable); +    for (const TargetRegisterClass *C : regclasses()) +      if (C->isAllocatable()) +        getAllocatableSetForRC(MF, C, Allocatable);    }    // Mask out the reserved registers @@ -415,9 +413,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,  }  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void -TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, -                            const TargetRegisterInfo *TRI) { +LLVM_DUMP_METHOD +void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, +                                 const TargetRegisterInfo *TRI) {    dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n";  }  #endif diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 83e52d335354..0df34ce43112 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -1,4 +1,4 @@ -//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===// +//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -12,12 +12,22 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetInstrInfo.h"  #include "llvm/Target/TargetRegisterInfo.h"  #include 
"llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint>  using namespace llvm; @@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const {  static unsigned gcd(unsigned Dividend, unsigned Divisor) {    // Dividend and Divisor will be naturally swapped as needed. -  while(Divisor) { +  while (Divisor) {      unsigned Rem = Dividend % Divisor;      Dividend = Divisor;      Divisor = Rem;    };    return Dividend;  } +  static unsigned lcm(unsigned A, unsigned B) {    unsigned LCM = (uint64_t(A) * B) / gcd(A, B);    assert((LCM >= A && LCM >= B) && "LCM overflow"); @@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm,    }  } +/// Returns true only if instruction is specified as single issue. +bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI, +                                     const MCSchedClassDesc *SC) const { +  if (hasInstrSchedModel()) { +    if (!SC) +      SC = resolveSchedClass(MI); +    if (SC->isValid()) +      return SC->BeginGroup; +  } +  return false; +} + +bool TargetSchedModel::mustEndGroup(const MachineInstr *MI, +                                     const MCSchedClassDesc *SC) const { +  if (hasInstrSchedModel()) { +    if (!SC) +      SC = resolveSchedClass(MI); +    if (SC->isValid()) +      return SC->EndGroup; +  } +  return false; +} +  unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,                                            const MCSchedClassDesc *SC) const {    if (hasInstrItineraries()) { @@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) {  /// evaluation of predicates that depend on instruction operands or flags.  const MCSchedClassDesc *TargetSchedModel::  resolveSchedClass(const MachineInstr *MI) const { -    // Get the definition's scheduling class descriptor from this machine model.    unsigned SchedClass = MI->getDesc().getSchedClass();    const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); @@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {    if (SCDesc->isValid() && !SCDesc->isVariant())      return computeInstrLatency(*SCDesc); -  llvm_unreachable("No MI sched latency"); +  if (SCDesc->isValid()) { +    assert (!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()"); +    return computeInstrLatency(*SCDesc); +  } +  return 0;  }  unsigned @@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,    }    return 0;  } + +static Optional<double> +getRTroughputFromItineraries(unsigned schedClass, +                             const InstrItineraryData *IID){ +  double Unknown = std::numeric_limits<double>::infinity(); +  double Throughput = Unknown; + +  for (const InstrStage *IS = IID->beginStage(schedClass), +                        *E = IID->endStage(schedClass); +       IS != E; ++IS) { +    unsigned Cycles = IS->getCycles(); +    if (!Cycles) +      continue; +    Throughput = +        std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles); +  } +  // We need reciprocal throughput that's why we return such value. 
+  return 1 / Throughput; +} + +static Optional<double> +getRTroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc, +                                 const TargetSubtargetInfo *STI, +                                 const MCSchedModel &SchedModel) { +  double Unknown = std::numeric_limits<double>::infinity(); +  double Throughput = Unknown; + +  for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc), +                                 *WEnd = STI->getWriteProcResEnd(SCDesc); +       WPR != WEnd; ++WPR) { +    unsigned Cycles = WPR->Cycles; +    if (!Cycles) +      return Optional<double>(); + +    unsigned NumUnits = +        SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits; +    Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles); +  } +  // We need reciprocal throughput that's why we return such value. +  return 1 / Throughput; +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const { +  if (hasInstrItineraries()) +    return getRTroughputFromItineraries(MI->getDesc().getSchedClass(), +                                        getInstrItineraries()); +  if (hasInstrSchedModel()) +    return getRTroughputFromInstrSchedModel(resolveSchedClass(MI), STI, +                                            SchedModel); +  return Optional<double>(); +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const { +  unsigned SchedClass = TII->get(Opcode).getSchedClass(); +  if (hasInstrItineraries()) +    return getRTroughputFromItineraries(SchedClass, getInstrItineraries()); +  if (hasInstrSchedModel()) { +    const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); +    if (SCDesc->isValid() && !SCDesc->isVariant()) +      return getRTroughputFromInstrSchedModel(SCDesc, STI, SchedModel); +  } +  return Optional<double>(); +} diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp index c74707d95b9e..0a444e0fff07 100644 --- a/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/lib/CodeGen/TargetSubtargetInfo.cpp @@ -11,6 +11,9 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Target/TargetSubtargetInfo.h"  using namespace llvm; @@ -52,3 +55,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const {  bool TargetSubtargetInfo::useAA() const {    return false;  } + +static std::string createSchedInfoStr(unsigned Latency, +                                     Optional<double> RThroughput) { +  static const char *SchedPrefix = " sched: ["; +  std::string Comment; +  raw_string_ostream CS(Comment); +  if (Latency > 0 && RThroughput.hasValue()) +    CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue()) +       << "]"; +  else if (Latency > 0) +    CS << SchedPrefix << Latency << ":?]"; +  else if (RThroughput.hasValue()) +    CS << SchedPrefix << "?:" << RThroughput.getValue() << "]"; +  CS.flush(); +  return Comment; +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { +  if (MI.isPseudo() || MI.isTerminator()) +    return std::string(); +  // We don't cache TSchedModel because it depends on TargetInstrInfo +  // that could be changed during the compilation +  TargetSchedModel TSchedModel; +  TSchedModel.init(getSchedModel(), this, getInstrInfo()); +  unsigned Latency = 
TSchedModel.computeInstrLatency(&MI); +  Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI); +  return createSchedInfoStr(Latency, RThroughput); +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { +  // We don't cache TSchedModel because it depends on TargetInstrInfo +  // that could be changed during the compilation +  TargetSchedModel TSchedModel; +  TSchedModel.init(getSchedModel(), this, getInstrInfo()); +  if (!TSchedModel.hasInstrSchedModel()) +    return std::string(); +  unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); +  Optional<double> RThroughput = +      TSchedModel.computeInstrRThroughput(MCI.getOpcode()); +  return createSchedInfoStr(Latency, RThroughput); +} diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 0f1b2ed994b7..75359fe3c0ea 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -905,7 +905,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,      ++End;    } -  // Check if the reschedule will not break depedencies. +  // Check if the reschedule will not break dependencies.    unsigned NumVisited = 0;    MachineBasicBlock::iterator KillPos = KillMI;    ++KillPos; @@ -1785,7 +1785,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {      MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),                                     TII->get(TargetOpcode::COPY))                                 .addReg(DstReg, RegState::Define, SubIdx) -                               .addOperand(UseMO); +                               .add(UseMO);      // The first def needs an <undef> flag because there is no live register      // before it. diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 0d506d646659..c8946010e9d1 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -167,6 +167,7 @@ class VirtRegRewriter : public MachineFunctionPass {    bool readsUndefSubreg(const MachineOperand &MO) const;    void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;    void handleIdentityCopy(MachineInstr &MI) const; +  void expandCopyBundle(MachineInstr &MI) const;  public:    static char ID; @@ -367,11 +368,41 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {    }    if (Indexes) -    Indexes->removeMachineInstrFromMaps(MI); -  MI.eraseFromParent(); +    Indexes->removeSingleMachineInstrFromMaps(MI); +  MI.eraseFromBundle();    DEBUG(dbgs() << "  deleted.\n");  } +/// The liverange splitting logic sometimes produces bundles of copies when +/// subregisters are involved. Expand these into a sequence of copy instructions +/// after processing the last in the bundle. Does not update LiveIntervals +/// which we shouldn't need for this instruction anymore. +void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const { +  if (!MI.isCopy()) +    return; + +  if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) { +    // Only do this when the complete bundle is made out of COPYs. 
+    MachineBasicBlock &MBB = *MI.getParent(); +    for (MachineBasicBlock::reverse_instr_iterator I = +         std::next(MI.getReverseIterator()), E = MBB.instr_rend(); +         I != E && I->isBundledWithSucc(); ++I) { +      if (!I->isCopy()) +        return; +    } + +    for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator(); +         I->isBundledWithPred(); ) { +      MachineInstr &MI = *I; +      ++I; + +      MI.unbundleFromPred(); +      if (Indexes) +        Indexes->insertMachineInstrInMaps(MI); +    } +  } +} +  void VirtRegRewriter::rewrite() {    bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();    SmallVector<unsigned, 8> SuperDeads; @@ -431,12 +462,14 @@ void VirtRegRewriter::rewrite() {              }            } -          // The <def,undef> flag only makes sense for sub-register defs, and -          // we are substituting a full physreg.  An <imp-use,kill> operand -          // from the SuperKills list will represent the partial read of the -          // super-register. -          if (MO.isDef()) +          // The <def,undef> and <def,internal> flags only make sense for +          // sub-register defs, and we are substituting a full physreg.  An +          // <imp-use,kill> operand from the SuperKills list will represent the +          // partial read of the super-register. +          if (MO.isDef()) {              MO.setIsUndef(false); +            MO.setIsInternalRead(false); +          }            // PhysReg operands cannot have subregister indexes.            PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -461,6 +494,8 @@ void VirtRegRewriter::rewrite() {        DEBUG(dbgs() << "> " << *MI); +      expandCopyBundle(*MI); +        // We can remove identity copies right now.        handleIdentityCopy(*MI);      } diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index 568720c66e55..ae07e8b2fa03 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -86,6 +86,7 @@ private:    // All fields are reset by runOnFunction.    EHPersonality Personality = EHPersonality::Unknown; +  const DataLayout *DL = nullptr;    DenseMap<BasicBlock *, ColorVector> BlockColors;    MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;  }; @@ -111,6 +112,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) {    if (!isFuncletEHPersonality(Personality))      return false; +  DL = &Fn.getParent()->getDataLayout();    return prepareExplicitEH(Fn);  } @@ -1070,7 +1072,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {    if (!isa<TerminatorInst>(EHPad)) {      // If the EHPad isn't a terminator, then we can insert a load in this block      // that will dominate all uses. -    SpillSlot = new AllocaInst(PN->getType(), nullptr, +    SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,                                 Twine(PN->getName(), ".wineh.spillslot"),                                 &F.getEntryBlock().front());      Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), @@ -1157,7 +1159,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,                                        Function &F) {    // Lazilly create the spill slot.    
if (!SpillSlot) -    SpillSlot = new AllocaInst(V->getType(), nullptr, +    SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,                                 Twine(V->getName(), ".wineh.spillslot"),                                 &F.getEntryBlock().front()); diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 63bd762eeb2b..7d2848bdc13b 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -81,7 +81,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF,          auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc))                         .addImm(T.getOpcode());          for (auto &MO : T.operands()) -          MIB.addOperand(MO); +          MIB.add(MO);          Terminators.push_back(&T);        }      } @@ -157,6 +157,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {    case Triple::ArchType::arm:    case Triple::ArchType::thumb:    case Triple::ArchType::aarch64: +  case Triple::ArchType::ppc64le: +  case Triple::ArchType::mips: +  case Triple::ArchType::mipsel: +  case Triple::ArchType::mips64: +  case Triple::ArchType::mips64el:      // For the architectures which don't have a single return instruction      prependRetWithPatchableExit(MF, TII);      break;
