Diffstat (limited to 'lib/CodeGen')
 154 files changed, 13182 insertions(+), 5237 deletions(-)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index bb908618b6794..955524c2a676e 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -163,9 +163,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo &MFI = MF.getFrameInfo(); BitVector Pristine = MFI.getPristineRegs(MF); - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { + for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; + ++I) { unsigned Reg = *I; - if (!IsReturnBlock && !Pristine.test(Reg)) continue; + if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; State->UnionGroups(AliasReg, 0); diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 79ecc4308fe7f..09a37a77e9fbc 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I, bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS; ADS = true; - AttrBuilder CallerAttrs(F->getAttributes(), - AttributeSet::ReturnIndex); + AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex); AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(), - AttributeSet::ReturnIndex); + AttributeList::ReturnIndex); // Noalias is completely benign as far as calling convention goes, it // shouldn't affect whether the call is a tail call. @@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, return true; } -bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) { - if (!GV->hasLinkOnceODRLinkage()) - return false; - - // We assume that anyone who sets global unnamed_addr on a non-constant knows - // what they're doing. - if (GV->hasGlobalUnnamedAddr()) - return true; - - // If it is a non constant variable, it needs to be uniqued across shared - // objects. 
- if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) { - if (!Var->isConstant()) - return false; - } - - return GV->hasAtLeastLocalUnnamedAddr(); -} - static void collectFuncletMembers( DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet, const MachineBasicBlock *MBB) { diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 24fdbfc901fdf..6c18d56b82723 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,48 +11,102 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/AsmPrinter.h" +#include "AsmPrinterHandler.h" #include "CodeViewDebug.h" #include "DwarfDebug.h" #include "DwarfException.h" #include "WinException.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ObjectUtils.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalIFunc.h" +#include "llvm/IR/GlobalIndirectSymbol.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Value.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" #include 
"llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cinttypes> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription = STATISTIC(EmittedInsts, "Number of machine instrs printed"); +static cl::opt<bool> + PrintSchedule("print-schedule", cl::Hidden, cl::init(false), + cl::desc("Print 'sched: [latency:throughput]' in .s output")); + char AsmPrinter::ID = 0; typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type; @@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) { return *(gcp_map_type*)P; } - /// getGVAlignmentLog2 - Return the alignment to use for the specified global /// value in log2 form. This rounds up to the preferred alignment if possible /// and legal. @@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer) : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), - OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), - isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { - DD = nullptr; - MMI = nullptr; - LI = nullptr; - MF = nullptr; - CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr; - CurrentFnBegin = nullptr; - CurrentFnEnd = nullptr; - GCMetadataPrinters = nullptr; + OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) { VerboseAsm = OutStreamer->isVerboseAsm(); } @@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<MachineModuleInfo>(); + AU.addRequired<MachineOptimizationRemarkEmitterPass>(); AU.addRequired<GCModuleInfo>(); if (isVerbose()) AU.addRequired<MachineLoopInfo>(); @@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) { // don't, this at least helps the user find where a global came from. if (MAI->hasSingleParameterDotFile()) { // .file "foo.c" - OutStreamer->EmitFileDirective(M.getModuleIdentifier()); + OutStreamer->EmitFileDirective(M.getSourceFileName()); } GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); @@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { /// /// \p Value - The value to emit. /// \p Size - The size of the integer (in bytes) to emit. -void AsmPrinter::EmitDebugValue(const MCExpr *Value, +void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const { OutStreamer->EmitValue(Value, Size); } @@ -602,8 +651,23 @@ void AsmPrinter::EmitFunctionHeader() { } // Emit the prefix data. - if (F->hasPrefixData()) - EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + if (F->hasPrefixData()) { + if (MAI->hasSubsectionsViaSymbols()) { + // Preserving prefix data on platforms which use subsections-via-symbols + // is a bit tricky. Here we introduce a symbol for the prefix data + // and use the .alt_entry attribute to mark the function's real entry point + // as an alternative entry point to the prefix-data symbol. 
+ MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol(); + OutStreamer->EmitLabel(PrefixSym); + + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + + // Emit an .alt_entry directive for the actual function symbol. + OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry); + } else { + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + } + } // Emit the CurrentFnSym. This is a virtual function to allow targets to // do their wild and crazy things as required. @@ -660,7 +724,8 @@ void AsmPrinter::EmitFunctionEntryLabel() { } /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS, + AsmPrinter *AP) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -668,6 +733,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { int FI; const MachineFrameInfo &MFI = MF->getFrameInfo(); + bool Commented = false; // We assume a single instruction only has a spill or reload, not // both. @@ -675,24 +741,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { if (TII->isLoadFromStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Reload\n"; + CommentOS << MMO->getSize() << "-byte Reload"; + Commented = true; } } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Reload\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Reload"; + Commented = true; + } } else if (TII->isStoreToStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Spill\n"; + CommentOS << MMO->getSize() << "-byte Spill"; + Commented = true; } } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Spill\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Spill"; + Commented = true; + } } // Check for spill-induced copies - if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) - CommentOS << " Reload Reuse\n"; + if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) { + Commented = true; + CommentOS << " Reload Reuse"; + } + + if (Commented && AP->EnablePrintSchedInfo) + // If any comment was added above and we need sched info comment then + // add this new comment just after the above comment w/o "\n" between them. + CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n"; + else if (Commented) + CommentOS << "\n"; } /// emitImplicitDef - This method emits the specified machine instruction @@ -883,6 +964,7 @@ void AsmPrinter::EmitFunctionBody() { // Print out code for the function. bool HasAnyRealCode = false; + int NumInstsInFunction = 0; for (auto &MBB : *MF) { // Print a label for the basic block. 
EmitBasicBlockStart(MBB); @@ -892,7 +974,7 @@ void AsmPrinter::EmitFunctionBody() { if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() && !MI.isDebugValue()) { HasAnyRealCode = true; - ++EmittedInsts; + ++NumInstsInFunction; } if (ShouldPrintDebugScopes) { @@ -905,7 +987,7 @@ void AsmPrinter::EmitFunctionBody() { } if (isVerbose()) - emitComments(MI, OutStreamer->GetCommentOS()); + emitComments(MI, OutStreamer->GetCommentOS(), this); switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: @@ -953,6 +1035,14 @@ void AsmPrinter::EmitFunctionBody() { EmitBasicBlockEnd(MBB); } + EmittedInsts += NumInstsInFunction; + MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount", + MF->getFunction()->getSubprogram(), + &MF->front()); + R << ore::NV("NumInstructions", NumInstsInFunction) + << " instructions in function"; + ORE->emit(R); + // If the function is empty and the object file uses .subsections_via_symbols, // then we need to emit *something* to the function body to prevent the // labels from collapsing together. Just emit a noop. @@ -1238,7 +1328,7 @@ bool AsmPrinter::doFinalization(Module &M) { break; AliasStack.push_back(Cur); } - for (const GlobalAlias *AncestorAlias : reverse(AliasStack)) + for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack)) emitGlobalIndirectSymbol(M, *AncestorAlias); AliasStack.clear(); } @@ -1311,19 +1401,28 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnSymForSize = CurrentFnBegin; } + ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); if (isVerbose()) LI = &getAnalysis<MachineLoopInfo>(); + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + EnablePrintSchedInfo = PrintSchedule.getNumOccurrences() + ? PrintSchedule + : STI.supportPrintSchedInfo(); } namespace { + // Keep track the alignment, constpool entries per Section. struct SectionCPs { MCSection *S; unsigned Alignment; SmallVector<unsigned, 4> CPEs; + SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {} }; -} + +} // end anonymous namespace /// EmitConstantPool - Print to the current output stream assembly /// representations of the constants in the constant pool MCP. This is @@ -1547,7 +1646,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, OutStreamer->EmitValue(Value, EntrySize); } - /// EmitSpecialLLVMGlobal - Check to see if the specified global is a /// special global used by LLVM. If so, emit it and return true, otherwise /// do nothing and return false. @@ -1598,13 +1696,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) { } namespace { + struct Structor { - Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {} - int Priority; - llvm::Constant *Func; - llvm::GlobalValue *ComdatKey; + int Priority = 0; + Constant *Func = nullptr; + GlobalValue *ComdatKey = nullptr; + + Structor() = default; }; -} // end namespace + +} // end anonymous namespace /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init /// priority. 
@@ -1653,8 +1754,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List, const TargetLoweringObjectFile &Obj = getObjFileLowering(); const MCSymbol *KeySym = nullptr; if (GlobalValue *GV = S.ComdatKey) { - if (GV->hasAvailableExternallyLinkage()) - // If the associated variable is available_externally, some other TU + if (GV->isDeclarationForLinker()) + // If the associated variable is not defined in this module + // (it might be available_externally, or have been an + // available_externally definition that was dropped by the + // EliminateAvailableExternally pass), some other TU // will provide its dynamic initializer. continue; @@ -1931,7 +2035,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) { return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1. } - /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. @@ -1972,7 +2075,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { static void emitGlobalConstantDataSequential(const DataLayout &DL, const ConstantDataSequential *CDS, AsmPrinter &AP) { - // See if we can aggregate this into a .fill, if so, emit it as such. int Value = isRepeatedByteSequence(CDS, DL); if (Value != -1) { @@ -2006,7 +2108,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, CDS->getNumElements(); if (unsigned Padding = Size - EmittedSize) AP.OutStreamer->EmitZeros(Padding); - } static void emitGlobalConstantArray(const DataLayout &DL, @@ -2420,8 +2521,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const { return OutContext.getOrCreateSymbol(NameStr); } - - /// PrintParentLoopComment - Print comments about parent loops of this one. static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop, unsigned FunctionNumber) { @@ -2486,7 +2585,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, PrintChildLoopComment(OS, Loop, AP.getFunctionNumber()); } - /// EmitBasicBlockStart - This method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. @@ -2607,8 +2705,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { return true; } - - GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { if (!S.usesMetadata()) return nullptr; @@ -2639,7 +2735,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { } /// Pin vtable to this file. 
-AsmPrinterHandler::~AsmPrinterHandler() {} +AsmPrinterHandler::~AsmPrinterHandler() = default; void AsmPrinterHandler::markFunctionEnd() {} @@ -2702,8 +2798,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind) { auto Fn = MI.getParent()->getParent()->getFunction(); auto Attr = Fn->getFnAttribute("function-instrument"); + bool LogArgs = Fn->hasFnAttribute("xray-log-args"); bool AlwaysInstrument = Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always"; + if (Kind == SledKind::FUNCTION_ENTER && LogArgs) + Kind = SledKind::LOG_ARGS_ENTER; Sleds.emplace_back( XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn }); } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 57864e4e4d4f2..683e622e3d537 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -40,25 +40,24 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" -namespace { - struct SrcMgrDiagInfo { - const MDNode *LocInfo; - LLVMContext::InlineAsmDiagHandlerTy DiagHandler; - void *DiagContext; - }; -} - /// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an /// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo /// struct above. static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) { - SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo); + AsmPrinter::SrcMgrDiagInfo *DiagInfo = + static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo); assert(DiagInfo && "Diagnostic context not passed down?"); + // Look up a LocInfo for the buffer this diagnostic is coming from. + unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc()); + const MDNode *LocInfo = nullptr; + if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size()) + LocInfo = DiagInfo->LocInfos[BufNum-1]; + // If the inline asm had metadata associated with it, pull out a location // cookie corresponding to which line the error occurred on. unsigned LocCookie = 0; - if (const MDNode *LocInfo = DiagInfo->LocInfo) { + if (LocInfo) { unsigned ErrorLine = Diag.getLineNo()-1; if (ErrorLine >= LocInfo->getNumOperands()) ErrorLine = 0; @@ -99,35 +98,39 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, return; } - SourceMgr SrcMgr; - SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + if (!DiagInfo) { + DiagInfo = make_unique<SrcMgrDiagInfo>(); - SrcMgrDiagInfo DiagInfo; - - // If the current LLVMContext has an inline asm handler, set it in SourceMgr. - LLVMContext &LLVMCtx = MMI->getModule()->getContext(); - bool HasDiagHandler = false; - if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) { - // If the source manager has an issue, we arrange for srcMgrDiagHandler - // to be invoked, getting DiagInfo passed into it. 
- DiagInfo.LocInfo = LocMDNode; - DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler(); - DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext(); - SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo); - HasDiagHandler = true; + MCContext &Context = MMI->getContext(); + Context.setInlineSourceManager(&DiagInfo->SrcMgr); + + LLVMContext &LLVMCtx = MMI->getModule()->getContext(); + if (LLVMCtx.getInlineAsmDiagnosticHandler()) { + DiagInfo->DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler(); + DiagInfo->DiagContext = LLVMCtx.getInlineAsmDiagnosticContext(); + DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get()); + } } + SourceMgr &SrcMgr = DiagInfo->SrcMgr; + SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + std::unique_ptr<MemoryBuffer> Buffer; - if (isNullTerminated) - Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>"); - else - Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>"); + // The inline asm source manager will outlive Str, so make a copy of the + // string for SourceMgr to own. + Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>"); // Tell SrcMgr about this buffer, it takes ownership of the buffer. - SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); + unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); + + // Store LocMDNode in DiagInfo, using BufNum as an identifier. + if (LocMDNode) { + DiagInfo->LocInfos.resize(BufNum); + DiagInfo->LocInfos[BufNum-1] = LocMDNode; + } std::unique_ptr<MCAsmParser> Parser( - createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI)); + createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum)); // We create a new MCInstrInfo here since we might be at the module level // and not have a MachineFunction to initialize the TargetInstrInfo from and @@ -151,7 +154,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, int Res = Parser->Run(/*NoInitialTextSection*/ true, /*NoFinalize*/ true); emitInlineAsmEnd(STI, &TAP->getSTI()); - if (Res && !HasDiagHandler) + + if (Res && !DiagInfo->DiagHandler) report_fatal_error("Error parsing inline asm\n"); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 83440513225c1..383b8cddb1a06 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -23,13 +23,13 @@ #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/MSF/ByteStream.h" -#include "llvm/DebugInfo/MSF/StreamReader.h" #include "llvm/IR/Constants.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/BinaryByteStream.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/COFF.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetFrameLowering.h" @@ -38,7 +38,6 @@ using namespace llvm; using namespace llvm::codeview; -using namespace llvm::msf; CodeViewDebug::CodeViewDebug(AsmPrinter *AP) : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(), @@ -495,9 +494,9 @@ void CodeViewDebug::emitTypeInformation() { // comments. The MSVC linker doesn't do much type record validation, // so the first link of an invalid type record can succeed while // subsequent links will fail with LNK1285. 
- ByteStream Stream(Record); + BinaryByteStream Stream(Record, llvm::support::little); CVTypeArray Types; - StreamReader Reader(Stream); + BinaryStreamReader Reader(Stream); Error E = Reader.readArray(Types, Reader.getLength()); if (!E) { TypeVisitorCallbacks C; @@ -948,10 +947,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { // Handle fragments. auto Fragment = DIExpr->getFragmentInfo(); - if (DIExpr && Fragment) { + if (Fragment) { IsSubfield = true; StructOffset = Fragment->OffsetInBits / 8; - } else if (DIExpr && DIExpr->getNumElements() > 0) { + } else if (DIExpr->getNumElements() > 0) { continue; // Ignore unrecognized exprs. } @@ -1014,14 +1013,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { } } -void CodeViewDebug::beginFunction(const MachineFunction *MF) { - assert(!CurFn && "Can't process two functions at once!"); - - if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) - return; - - DebugHandlerBase::beginFunction(MF); - +void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV) == false); CurFn = &FnDebugInfo[GV]; @@ -1150,27 +1142,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8; - - // We want to assert that the element type multiplied by the array lengths - // match the size of the overall array. However, if we don't have complete - // type information for the base type, we can't make this assertion. This - // happens if limited debug info is enabled in this case: - // struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); }; - // VTableOptzn array[3]; - // The DICompositeType of VTableOptzn will have size zero, and the array will - // have size 3 * sizeof(void*), and we should avoid asserting. - // - // There is a related bug in the front-end where an array of a structure, - // which was declared as incomplete structure first, ends up not getting a - // size assigned to it. (PR28303) - // Example: - // struct A(*p)[3]; - // struct A { int f; } a[3]; - bool PartiallyIncomplete = false; - if (Ty->getSizeInBits() == 0 || ElementSize == 0) { - PartiallyIncomplete = true; - } - // Add subranges to array type. DINodeArray Elements = Ty->getElements(); for (int i = Elements.size() - 1; i >= 0; --i) { @@ -1185,16 +1156,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { // Variable Length Array (VLA) has Count equal to '-1'. // Replace with Count '1', assume it is the minimum VLA length. // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU. - if (Count == -1) { + if (Count == -1) Count = 1; - PartiallyIncomplete = true; - } // Update the element size and element type index for subsequent subranges. ElementSize *= Count; // If this is the outermost array, use the size from the array. It will be - // more accurate if PartiallyIncomplete is true. + // more accurate if we had a VLA or an incomplete element type size. uint64_t ArraySize = (i == 0 && ElementSize == 0) ? 
Ty->getSizeInBits() / 8 : ElementSize; @@ -1203,9 +1172,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { ElementTypeIndex = TypeTable.writeKnownType(AR); } - (void)PartiallyIncomplete; - assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8)); - return ElementTypeIndex; } @@ -2115,18 +2081,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) { } } -void CodeViewDebug::endFunction(const MachineFunction *MF) { - if (!Asm || !CurFn) // We haven't created any debug info for this function. - return; - +void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV)); assert(CurFn == &FnDebugInfo[GV]); collectVariableInfo(GV->getSubprogram()); - DebugHandlerBase::endFunction(MF); - // Don't emit anything if we don't have any line tables. if (!CurFn->HaveLineInfo) { FnDebugInfo.erase(GV); diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 3dd4315e4c2f1..343384c517728 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -299,6 +299,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { unsigned getPointerSizeInBytes(); +protected: + /// \brief Gather pre-function debug information. + void beginFunctionImpl(const MachineFunction *MF) override; + + /// \brief Gather post-function debug information. + void endFunctionImpl(const MachineFunction *) override; + public: CodeViewDebug(AsmPrinter *Asm); @@ -307,12 +314,6 @@ public: /// \brief Emit the COFF section that holds the line table information. void endModule() override; - /// \brief Gather pre-function debug information. - void beginFunction(const MachineFunction *MF) override; - - /// \brief Gather post-function debug information. - void endFunction(const MachineFunction *) override; - /// \brief Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; }; diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 879918995472c..b510e0ef36ac6 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -42,6 +42,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const { // overloads. Otherwise MSVC 2010 thinks this call is ambiguous. 
ID.AddInteger(unsigned(Attribute)); ID.AddInteger(unsigned(Form)); + if (Form == dwarf::DW_FORM_implicit_const) + ID.AddInteger(Value); } //===----------------------------------------------------------------------===// @@ -107,13 +109,20 @@ void DIEAbbrev::print(raw_ostream &O) { O << " " << dwarf::AttributeString(Data[i].getAttribute()) << " " - << dwarf::FormEncodingString(Data[i].getForm()) - << '\n'; + << dwarf::FormEncodingString(Data[i].getForm()); + + if (Data[i].getForm() == dwarf::DW_FORM_implicit_const) + O << " " << Data[i].getValue(); + + O << '\n'; } } -LLVM_DUMP_METHOD -void DIEAbbrev::dump() { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEAbbrev::dump() { + print(dbgs()); +} +#endif //===----------------------------------------------------------------------===// // DIEAbbrevSet Implementation @@ -249,10 +258,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { O << "\n"; } -LLVM_DUMP_METHOD -void DIE::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIE::dump() { print(dbgs()); } +#endif unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, DIEAbbrevSet &AbbrevSet, @@ -340,10 +350,11 @@ void DIEValue::print(raw_ostream &O) const { } } -LLVM_DUMP_METHOD -void DIEValue::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEValue::dump() const { print(dbgs()); } +#endif //===----------------------------------------------------------------------===// // DIEInteger Implementation @@ -354,57 +365,42 @@ void DIEValue::dump() const { void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_implicit_const: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this? 
Asm->OutStreamer->AddBlankLine(); return; case dwarf::DW_FORM_flag: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref1: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data1: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: case dwarf::DW_FORM_ref2: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data2: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref4: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data4: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: case dwarf::DW_FORM_ref8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_sig8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data8: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup8: case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_addr: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_addr: Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form)); return; case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; @@ -419,35 +415,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { - case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_flag_present: return 0; - case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data1: return sizeof(int8_t); - case dwarf::DW_FORM_ref2: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data2: return sizeof(int16_t); - case dwarf::DW_FORM_ref4: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data4: return sizeof(int32_t); - case dwarf::DW_FORM_ref8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sig8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data8: return sizeof(int64_t); + case dwarf::DW_FORM_implicit_const: + case dwarf::DW_FORM_flag_present: + return 0; + case dwarf::DW_FORM_flag: + case dwarf::DW_FORM_ref1: + case dwarf::DW_FORM_data1: + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: + return sizeof(int8_t); + case dwarf::DW_FORM_ref2: + case dwarf::DW_FORM_data2: + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: + return sizeof(int16_t); + case dwarf::DW_FORM_ref4: + case dwarf::DW_FORM_data4: + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: + return sizeof(int32_t); + case dwarf::DW_FORM_ref8: + case dwarf::DW_FORM_ref_sig8: + case dwarf::DW_FORM_data8: + case dwarf::DW_FORM_ref_sup8: + return sizeof(int64_t); case dwarf::DW_FORM_ref_addr: if (AP->getDwarfVersion() == 2) return AP->getPointerSize(); LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: switch (AP->OutStreamer->getContext().getDwarfFormat()) { case dwarf::DWARF32: return 4; @@ -456,11 +458,8 @@ unsigned 
DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } llvm_unreachable("Invalid DWARF format"); case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: return getULEB128Size(Integer); case dwarf::DW_FORM_sdata: @@ -484,7 +483,7 @@ void DIEInteger::print(raw_ostream &O) const { /// EmitValue - Emit expression value. /// void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->EmitDebugValue(Expr, SizeOf(AP, Form)); + AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form)); } /// SizeOf - Determine size of expression value in bytes. diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index d8ecc7ccfb9bf..8e3b88d0af0e5 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -490,9 +490,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } /// This is based on the type signature computation given in section 7.27 of the @@ -514,7 +514,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 94190981e88ec..1d63e33a4d33a 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { return getBaseTypeSize(BaseType); } +bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) { + if (!MMI->hasDebugInfo()) + return false; + auto *SP = MF->getFunction()->getSubprogram(); + if (!SP) + return false; + assert(SP->getUnit()); + auto EK = SP->getUnit()->getEmissionKind(); + if (EK == DICompileUnit::NoDebug) + return false; + return true; +} + void DebugHandlerBase::beginFunction(const MachineFunction *MF) { + assert(Asm); + PrevInstBB = nullptr; + + if (!hasDebugInfo(MMI, MF)) { + skippedNonDebugFunction(); + return; + } + // Grab the lexical scopes for the function, if we don't have any of those // then we're not going to be able to do anything. LScopes.initialize(*MF); - if (LScopes.empty()) + if (LScopes.empty()) { + beginFunctionImpl(MF); return; + } // Make sure that each lexical scope will have a begin/end label. 
identifyScopeMarkers(); @@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) { PrevInstLoc = DebugLoc(); PrevLabel = Asm->getFunctionBegin(); + beginFunctionImpl(MF); } void DebugHandlerBase::beginInstruction(const MachineInstr *MI) { @@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() { } void DebugHandlerBase::endFunction(const MachineFunction *MF) { + if (hasDebugInfo(MMI, MF)) + endFunctionImpl(MF); DbgValues.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h index c00fa189d94af..659a921e1fc56 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h @@ -80,6 +80,10 @@ protected: LabelsAfterInsn.insert(std::make_pair(MI, nullptr)); } + virtual void beginFunctionImpl(const MachineFunction *MF) = 0; + virtual void endFunctionImpl(const MachineFunction *MF) = 0; + virtual void skippedNonDebugFunction() {} + // AsmPrinterHandler overrides. public: void beginInstruction(const MachineInstr *MI) override; diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h index 36fb1507ddc65..a68e8cc6b4b31 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -76,7 +76,8 @@ public: const DIExpression *getExpression() const { return Expression; } friend bool operator==(const Value &, const Value &); friend bool operator<(const Value &, const Value &); - void dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { if (isLocation()) { llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " "; if (Loc.isIndirect()) @@ -90,6 +91,7 @@ public: if (Expression) Expression->dump(); } +#endif }; private: diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d904372af589b..a550ff2fb90f3 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1,3 +1,16 @@ +//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for constructing a dwarf compile unit. +// +//===----------------------------------------------------------------------===// + #include "DwarfCompileUnit.h" #include "DwarfExpression.h" #include "llvm/CodeGen/MachineFunction.h" @@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( bool addToAccelTable = false; DIELoc *Loc = nullptr; std::unique_ptr<DIEDwarfExpression> DwarfExpr; - bool AllConstant = std::all_of( - GlobalExprs.begin(), GlobalExprs.end(), - [&](const GlobalExpr GE) { - return GE.Expr && GE.Expr->isConstant(); - }); - for (const auto &GE : GlobalExprs) { const GlobalVariable *Global = GE.Var; const DIExpression *Expr = GE.Expr; + // For compatibility with DWARF 3 and earlier, // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes // DW_AT_const_value(X). if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) { + addToAccelTable = true; addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1)); - // We cannot describe the location of dllimport'd variables: the - // computation of their address requires loads from the IAT. 
- } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) { - if (!Loc) { - Loc = new (DIEValueAllocator) DIELoc; - DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); - } + break; + } + + // We cannot describe the location of dllimport'd variables: the + // computation of their address requires loads from the IAT. + if (Global && Global->hasDLLImportStorageClass()) + continue; + + // Nothing to describe without address or constant. + if (!Global && (!Expr || !Expr->isConstant())) + continue; + + if (!Loc) { addToAccelTable = true; - if (Global) { - const MCSymbol *Sym = Asm->getSymbol(Global); - if (Global->isThreadLocal()) { - if (Asm->TM.Options.EmulatedTLS) { - // TODO: add debug info for emulated thread local mode. - } else { - // FIXME: Make this work with -gsplit-dwarf. - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u - : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); - } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); - } - // 3) followed by an OP to make the debugger do a TLS lookup. + Loc = new (DIEValueAllocator) DIELoc; + DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); + } + + if (Global) { + const MCSymbol *Sym = Asm->getSymbol(Global); + if (Global->isThreadLocal()) { + if (Asm->TM.Options.EmulatedTLS) { + // TODO: add debug info for emulated thread local mode. + } else { + // FIXME: Make this work with -gsplit-dwarf. + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size addUInt(*Loc, dwarf::DW_FORM_data1, - DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address - : dwarf::DW_OP_form_tls_address); + PointerSize == 4 ? dwarf::DW_OP_const4u + : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. + addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); } - } else { - DD->addArangeLabel(SymbolCU(this, Sym)); - addOpAddress(*Loc, Sym); + // 3) followed by an OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, + DD->useGNUTLSOpcode() ? 
dwarf::DW_OP_GNU_push_tls_address + : dwarf::DW_OP_form_tls_address); } + } else { + DD->addArangeLabel(SymbolCU(this, Sym)); + addOpAddress(*Loc, Sym); } - if (Expr) { - DwarfExpr->addFragmentOffset(Expr); - DwarfExpr->AddExpression(Expr); - } + } + if (Expr) { + DwarfExpr->addFragmentOffset(Expr); + DwarfExpr->addExpression(Expr); } } if (Loc) @@ -507,8 +525,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); // If there is an expression, emit raw unsigned bytes. DwarfExpr.addFragmentOffset(Expr); - DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm()); - DwarfExpr.AddExpression(Expr); + DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm()); + DwarfExpr.addExpression(Expr); addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); } else addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); @@ -532,9 +550,15 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg); DwarfExpr.addFragmentOffset(Fragment.Expr); - DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - FrameReg, Offset); - DwarfExpr.AddExpression(Fragment.Expr); + SmallVector<uint64_t, 8> Ops; + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Offset); + Ops.push_back(dwarf::DW_OP_deref); + Ops.append(Fragment.Expr->elements_begin(), Fragment.Expr->elements_end()); + DIExpressionCursor Expr(Ops); + DwarfExpr.addMachineRegExpression( + *Asm->MF->getSubtarget().getRegisterInfo(), Expr, FrameReg); + DwarfExpr.addExpression(std::move(Expr)); } addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); @@ -690,11 +714,14 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) { Asm->OutStreamer->EmitLabel(LabelBegin); } - DwarfUnit::emitHeader(UseOffsets); + dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile + : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton + : dwarf::DW_UT_compile; + DwarfUnit::emitCommonHeader(UseOffsets, UT); } /// addGlobalName - Add a new global name to the compile unit. -void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die, +void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) { if (includeMinimalInlineScopes()) return; @@ -702,6 +729,18 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die, GlobalNames[FullName] = &Die; } +void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, + const DIScope *Context) { + if (includeMinimalInlineScopes()) + return; + std::string FullName = getParentContextString(Context) + Name.str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// Add a new global type to the unit. 
void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { @@ -711,6 +750,18 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, GlobalTypes[FullName] = &Die; } +void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, + const DIScope *Context) { + if (includeMinimalInlineScopes()) + return; + std::string FullName = getParentContextString(Context) + Ty->getName().str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// addVariableAddress - Add DW_AT_location attribute for a /// DbgVariable based on provided MachineLocation. void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, @@ -727,22 +778,22 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); - if (!validReg) + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + Ops.push_back(dwarf::DW_OP_deref); + } + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Expr.finalize()); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Start with the address based on the location provided, and generate the @@ -754,23 +805,24 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); - const DIExpression *Expr = DV.getSingleExpression(); - DIExpressionCursor ExprCursor(Expr); + const DIExpression *DIExpr = DV.getSingleExpression(); + DwarfExpr.addFragmentOffset(DIExpr); + + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + Ops.push_back(dwarf::DW_OP_deref); + } + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpressionCursor Cursor(Ops); const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); - auto Reg = Location.getReg(); - DwarfExpr.addFragmentOffset(Expr); - bool ValidReg = - Location.getOffset() - ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset()) - : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg); - - if (!ValidReg) + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; - - DwarfExpr.AddExpression(std::move(ExprCursor)); + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. 
- addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Add a Dwarf loclistptr attribute data and value. diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index a8025f1d15219..9a64b4b76b06e 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -210,12 +210,19 @@ public: } /// Add a new global name to the compile unit. - void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override; + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + + /// Add a new global name present in a type unit to this compile unit. + void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context); /// Add a new global type to the compile unit. void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) override; + /// Add a new global type present in a type unit to this compile unit. + void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context); + const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; } const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 91a3d0989cc5e..5ce1113092088 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" @@ -127,17 +126,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission"; static const char *const DbgTimerName = "writer"; static const char *const DbgTimerDescription = "DWARF Debug Writer"; -void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) { +void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) { BS.EmitInt8( Op, Comment ? 
Twine(Comment) + " " + dwarf::OperationEncodingString(Op) : dwarf::OperationEncodingString(Op)); } -void DebugLocDwarfExpression::EmitSigned(int64_t Value) { +void DebugLocDwarfExpression::emitSigned(int64_t Value) { BS.EmitSLEB128(Value, Twine(Value)); } -void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) { +void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) { BS.EmitULEB128(Value, Twine(Value)); } @@ -200,6 +199,12 @@ const DIType *DbgVariable::getType() const { } ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const { + if (FrameIndexExprs.size() == 1) + return FrameIndexExprs; + + assert(all_of(FrameIndexExprs, + [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) && + "multiple FI expressions without DW_OP_LLVM_fragment"); std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(), [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool { return A.Expr->getFragmentInfo()->OffsetInBits < @@ -418,7 +423,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { Asm->OutStreamer->getContext().setMCLineTableCompilationDir( NewCU.getUniqueID(), CompilationDir); - NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer()); + StringRef Producer = DIUnit->getProducer(); + StringRef Flags = DIUnit->getFlags(); + if (!Flags.empty()) { + std::string ProducerWithFlags = Producer.str() + " " + Flags.str(); + NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags); + } else + NewCU.addString(Die, dwarf::DW_AT_producer, Producer); + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, DIUnit->getSourceLanguage()); NewCU.addString(Die, dwarf::DW_AT_name, FN); @@ -544,7 +556,6 @@ void DwarfDebug::beginModule() { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. if (DIType *RT = dyn_cast<DIType>(Ty)) - if (!RT->isExternalTypeRef()) // There is no point in force-emitting a forward declaration. CU.getOrCreateTypeDIE(RT); } @@ -740,6 +751,7 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) { void DwarfDebug::createAbstractVariable(const DILocalVariable *Var, LexicalScope *Scope) { + assert(Scope && Scope->isAbstractScope()); auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr); InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); AbstractVariables[Var] = std::move(AbsDbgVariable); @@ -1137,20 +1149,9 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // Gather pre-function debug information. Assumes being called immediately // after the function entry point has been emitted. -void DwarfDebug::beginFunction(const MachineFunction *MF) { +void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { CurFn = MF; - // If there's no debug info for the function we're not going to do anything. - if (!MMI->hasDebugInfo()) - return; - - auto DI = MF->getFunction()->getSubprogram(); - if (!DI) - return; - - // Grab the lexical scopes for the function, if we don't have any of those - // then we're not going to be able to do anything. - DebugHandlerBase::beginFunction(MF); if (LScopes.empty()) return; @@ -1189,23 +1190,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { } } +void DwarfDebug::skippedNonDebugFunction() { + // If we don't have a subprogram for this function then there will be a hole + // in the range information. Keep note of this by setting the previously used + // section to nullptr. 
+ PrevCU = nullptr; + CurFn = nullptr; +} + // Gather and emit post-function debug information. -void DwarfDebug::endFunction(const MachineFunction *MF) { +void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { + const DISubprogram *SP = MF->getFunction()->getSubprogram(); + assert(CurFn == MF && "endFunction should be called with the same function as beginFunction"); - const DISubprogram *SP = MF->getFunction()->getSubprogram(); - if (!MMI->hasDebugInfo() || !SP || - SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) { - // If we don't have a subprogram for this function then there will be a hole - // in the range information. Keep note of this by setting the previously - // used section to nullptr. - PrevCU = nullptr; - CurFn = nullptr; - DebugHandlerBase::endFunction(MF); - return; - } - // Set DwarfDwarfCompileUnitID in MCContext to default value. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); @@ -1220,17 +1219,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd())); // Under -gmlt, skip building the subprogram if there are no inlined - // subroutines inside it. - if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && + // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram + // is still needed as we need its source location. + if (!TheCU.getCUNode()->getDebugInfoForProfiling() && + TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && LScopes.getAbstractScopesList().empty() && !IsDarwin) { assert(InfoHolder.getScopeVariables().empty()); - assert(DbgValues.empty()); - // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed - // by a -gmlt CU. Add a test and remove this assertion. - assert(AbstractVariables.empty()); PrevLabel = nullptr; CurFn = nullptr; - DebugHandlerBase::endFunction(MF); return; } @@ -1266,7 +1262,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { InfoHolder.getScopeVariables().clear(); PrevLabel = nullptr; CurFn = nullptr; - DebugHandlerBase::endFunction(MF); } // Register a source line with debug info. Returns the unique label that was @@ -1361,6 +1356,18 @@ void DwarfDebug::emitAccelTypes() { /// computeIndexValue - Compute the gdb index value for the DIE and CU. static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU, const DIE *Die) { + // Entities that ended up only in a Type Unit reference the CU instead (since + // the pub entry has offsets within the CU there's no real offset that can be + // provided anyway). As it happens all such entities (namespaces and types, + // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out + // not to be true it would be necessary to persist this information from the + // point at which the entry is added to the index data structure - since by + // the time the index is built from that, the original type/namespace DIE in a + // type unit has already been destroyed so it can't be queried for properties + // like tag, etc. 
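+  // For example, a namespace whose only DIE lives in a type unit is added to
+  // the index against the CU DIE itself, so the check below classifies it as
+  // TYPE, EXTERNAL rather than by its own tag.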
+ if (Die->getTag() == dwarf::DW_TAG_compile_unit) + return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, + dwarf::GIEL_EXTERNAL); dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC; // We could have a specification DIE that has our most of our knowledge, @@ -1498,27 +1505,37 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, ByteStreamer &Streamer, const DebugLocEntry::Value &Value, DwarfExpression &DwarfExpr) { - DIExpressionCursor ExprCursor(Value.getExpression()); - DwarfExpr.addFragmentOffset(Value.getExpression()); + auto *DIExpr = Value.getExpression(); + DIExpressionCursor ExprCursor(DIExpr); + DwarfExpr.addFragmentOffset(DIExpr); // Regular entry. if (Value.isInt()) { if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed || BT->getEncoding() == dwarf::DW_ATE_signed_char)) - DwarfExpr.AddSignedConstant(Value.getInt()); + DwarfExpr.addSignedConstant(Value.getInt()); else - DwarfExpr.AddUnsignedConstant(Value.getInt()); + DwarfExpr.addUnsignedConstant(Value.getInt()); } else if (Value.isLocation()) { - MachineLocation Loc = Value.getLoc(); + MachineLocation Location = Value.getLoc(); + + SmallVector<uint64_t, 8> Ops; + // FIXME: Should this condition be Location.isIndirect() instead? + if (Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + Ops.push_back(dwarf::DW_OP_deref); + } + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpressionCursor Cursor(Ops); const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo(); - if (Loc.getOffset()) - DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset()); - else - DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg()); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + return; + return DwarfExpr.addExpression(std::move(Cursor)); } else if (Value.isConstantFP()) { APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt(); - DwarfExpr.AddUnsignedConstant(RawBytes); + DwarfExpr.addUnsignedConstant(RawBytes); } - DwarfExpr.AddExpression(std::move(ExprCursor)); + DwarfExpr.addExpression(std::move(ExprCursor)); } void DebugLocEntry::finalize(const AsmPrinter &AP, @@ -1940,11 +1957,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. + // implementation always returns its results in little endian, so we actually + // need the "high" word. MD5::MD5Result Result; Hash.final(Result); - return support::endian::read64le(Result + 8); + return Result.high(); } void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 253e3f06200ed..8a96e7867b6e3 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -89,7 +89,7 @@ public: assert(!MInsn && "Already initialized?"); assert((!E || E->isValid()) && "Expected valid expression"); - assert(~FI && "Expected valid index"); + assert(FI != INT_MAX && "Expected valid index"); FrameIndexExprs.push_back({FI, E}); } @@ -448,6 +448,15 @@ class DwarfDebug : public DebugHandlerBase { /// Collect variable information from the side table maintained by MF. void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P); +protected: + /// Gather pre-function debug information. 
+ void beginFunctionImpl(const MachineFunction *MF) override; + + /// Gather and emit post-function debug information. + void endFunctionImpl(const MachineFunction *MF) override; + + void skippedNonDebugFunction() override; + public: //===--------------------------------------------------------------------===// // Main entry points. @@ -463,12 +472,6 @@ public: /// Emit all Dwarf sections that should come after the content. void endModule() override; - /// Gather pre-function debug information. - void beginFunction(const MachineFunction *MF) override; - - /// Gather and emit post-function debug information. - void endFunction(const MachineFunction *MF) override; - /// Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 61b2c7e658424..debe88f3b1ee1 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -22,79 +22,76 @@ using namespace llvm; -void DwarfExpression::AddReg(int DwarfReg, const char *Comment) { +void DwarfExpression::addReg(int DwarfReg, const char *Comment) { assert(DwarfReg >= 0 && "invalid negative dwarf register number"); if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); + emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); } else { - EmitOp(dwarf::DW_OP_regx, Comment); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_regx, Comment); + emitUnsigned(DwarfReg); } } -void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) { +void DwarfExpression::addBReg(int DwarfReg, int Offset) { assert(DwarfReg >= 0 && "invalid negative dwarf register number"); if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_breg0 + DwarfReg); + emitOp(dwarf::DW_OP_breg0 + DwarfReg); } else { - EmitOp(dwarf::DW_OP_bregx); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_bregx); + emitUnsigned(DwarfReg); } - EmitSigned(Offset); - if (Deref) - EmitOp(dwarf::DW_OP_deref); + emitSigned(Offset); } -void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { +void DwarfExpression::addFBReg(int Offset) { + emitOp(dwarf::DW_OP_fbreg); + emitSigned(Offset); +} + +void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { if (!SizeInBits) return; const unsigned SizeOfByte = 8; if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { - EmitOp(dwarf::DW_OP_bit_piece); - EmitUnsigned(SizeInBits); - EmitUnsigned(OffsetInBits); + emitOp(dwarf::DW_OP_bit_piece); + emitUnsigned(SizeInBits); + emitUnsigned(OffsetInBits); } else { - EmitOp(dwarf::DW_OP_piece); + emitOp(dwarf::DW_OP_piece); unsigned ByteSize = SizeInBits / SizeOfByte; - EmitUnsigned(ByteSize); + emitUnsigned(ByteSize); } this->OffsetInBits += SizeInBits; } -void DwarfExpression::AddShr(unsigned ShiftBy) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(ShiftBy); - EmitOp(dwarf::DW_OP_shr); +void DwarfExpression::addShr(unsigned ShiftBy) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(ShiftBy); + emitOp(dwarf::DW_OP_shr); } -bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI, - unsigned MachineReg, int Offset) { - if (isFrameRegister(TRI, MachineReg)) { - // If variable offset is based in frame register then use fbreg. 
- EmitOp(dwarf::DW_OP_fbreg); - EmitSigned(Offset); - return true; - } - - int DwarfReg = TRI.getDwarfRegNum(MachineReg, false); - if (DwarfReg < 0) - return false; - - AddRegIndirect(DwarfReg, Offset); - return true; +void DwarfExpression::addAnd(unsigned Mask) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Mask); + emitOp(dwarf::DW_OP_and); } -bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, unsigned MaxSize) { - if (!TRI.isPhysicalRegister(MachineReg)) + if (!TRI.isPhysicalRegister(MachineReg)) { + if (isFrameRegister(TRI, MachineReg)) { + DwarfRegs.push_back({-1, 0, nullptr}); + return true; + } return false; + } int Reg = TRI.getDwarfRegNum(MachineReg, false); // If this is a valid register number, emit it. if (Reg >= 0) { - AddReg(Reg); + DwarfRegs.push_back({Reg, 0, nullptr}); return true; } @@ -106,7 +103,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg); unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned RegOffset = TRI.getSubRegIdxOffset(Idx); - AddReg(Reg, "super-register"); + DwarfRegs.push_back({Reg, 0, "super-register"}); // Use a DW_OP_bit_piece to describe the sub-register. setSubRegisterPiece(Size, RegOffset); return true; @@ -136,72 +133,101 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, // If this sub-register has a DWARF number and we haven't covered // its range, emit a DWARF piece for it. if (Reg >= 0 && Intersection.any()) { - AddReg(Reg, "sub-register"); + // Emit a piece for any gap in the coverage. + if (Offset > CurPos) + DwarfRegs.push_back({-1, Offset - CurPos, nullptr}); + DwarfRegs.push_back( + {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"}); if (Offset >= MaxSize) break; - // Emit a piece for the any gap in the coverage. - if (Offset > CurPos) - AddOpPiece(Offset - CurPos); - AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset)); - CurPos = Offset + Size; // Mark it as emitted. Coverage.set(Offset, Offset + Size); + CurPos = Offset + Size; } } return CurPos; } -void DwarfExpression::AddStackValue() { +void DwarfExpression::addStackValue() { if (DwarfVersion >= 4) - EmitOp(dwarf::DW_OP_stack_value); + emitOp(dwarf::DW_OP_stack_value); } -void DwarfExpression::AddSignedConstant(int64_t Value) { - EmitOp(dwarf::DW_OP_consts); - EmitSigned(Value); - AddStackValue(); +void DwarfExpression::addSignedConstant(int64_t Value) { + emitOp(dwarf::DW_OP_consts); + emitSigned(Value); + addStackValue(); } -void DwarfExpression::AddUnsignedConstant(uint64_t Value) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Value); - AddStackValue(); +void DwarfExpression::addUnsignedConstant(uint64_t Value) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Value); + addStackValue(); } -void DwarfExpression::AddUnsignedConstant(const APInt &Value) { +void DwarfExpression::addUnsignedConstant(const APInt &Value) { unsigned Size = Value.getBitWidth(); const uint64_t *Data = Value.getRawData(); // Chop it up into 64-bit pieces, because that's the maximum that - // AddUnsignedConstant takes. + // addUnsignedConstant takes. 
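+  // For example, a 128-bit value is emitted as two DW_OP_constu/
+  // DW_OP_stack_value pairs, each followed by a piece operation covering
+  // 64 bits of the constant.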
unsigned Offset = 0; while (Offset < Size) { - AddUnsignedConstant(*Data++); + addUnsignedConstant(*Data++); if (Offset == 0 && Size <= 64) break; - AddOpPiece(std::min(Size-Offset, 64u), Offset); + addOpPiece(std::min(Size-Offset, 64u), Offset); Offset += 64; } } -bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &ExprCursor, unsigned MachineReg, unsigned FragmentOffsetInBits) { - if (!ExprCursor) - return AddMachineReg(TRI, MachineReg); + auto Fragment = ExprCursor.getFragmentInfo(); + if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) + return false; - // Pattern-match combinations for which more efficient representations exist - // first. - bool ValidReg = false; + bool HasComplexExpression = false; auto Op = ExprCursor.peek(); + if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment) + HasComplexExpression = true; + + // If the register can only be described by a complex expression (i.e., + // multiple subregisters) it doesn't safely compose with another complex + // expression. For example, it is not possible to apply a DW_OP_deref + // operation to multiple DW_OP_pieces. + if (HasComplexExpression && DwarfRegs.size() > 1) { + DwarfRegs.clear(); + return false; + } + + // Handle simple register locations. + if (!HasComplexExpression) { + for (auto &Reg : DwarfRegs) { + if (Reg.DwarfRegNo >= 0) + addReg(Reg.DwarfRegNo, Reg.Comment); + addOpPiece(Reg.Size); + } + DwarfRegs.clear(); + return true; + } + + assert(DwarfRegs.size() == 1); + auto Reg = DwarfRegs[0]; + bool FBReg = isFrameRegister(TRI, MachineReg); + assert(Reg.Size == 0 && "subregister has same size as superregister"); + + // Pattern-match combinations for which more efficient representations exist. switch (Op->getOp()) { default: { - auto Fragment = ExprCursor.getFragmentInfo(); - ValidReg = AddMachineReg(TRI, MachineReg, - Fragment ? Fragment->SizeInBits : ~1U); + if (FBReg) + addFBReg(0); + else + addReg(Reg.DwarfRegNo, 0); break; } case dwarf::DW_OP_plus: @@ -210,28 +236,42 @@ bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI, // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset]. auto N = ExprCursor.peekNext(); if (N && N->getOp() == dwarf::DW_OP_deref) { - unsigned Offset = Op->getArg(0); - ValidReg = AddMachineRegIndirect( - TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); + int Offset = Op->getArg(0); + int SignedOffset = (Op->getOp() == dwarf::DW_OP_plus) ? Offset : -Offset; + if (FBReg) + addFBReg(SignedOffset); + else + addBReg(Reg.DwarfRegNo, SignedOffset); + ExprCursor.consume(2); - } else - ValidReg = AddMachineReg(TRI, MachineReg); + break; + } + addReg(Reg.DwarfRegNo, 0); break; } case dwarf::DW_OP_deref: // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg]. - ValidReg = AddMachineRegIndirect(TRI, MachineReg); + if (FBReg) + addFBReg(0); + else + addBReg(Reg.DwarfRegNo, 0); ExprCursor.take(); break; } - - return ValidReg; + DwarfRegs.clear(); + return true; } -void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, unsigned FragmentOffsetInBits) { while (ExprCursor) { auto Op = ExprCursor.take(); + + // If we need to mask out a subregister, do it now, unless the next + // operation would emit an OpPiece anyway. 
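+    // (maskSubRegister() shifts the value right by the subregister's offset
+    // and ANDs it with (1 << size) - 1, leaving just the subregister's bits
+    // on the DWARF expression stack.)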
+ if (SubRegisterSizeInBits && Op->getOp() != dwarf::DW_OP_LLVM_fragment) + maskSubRegister(); + switch (Op->getOp()) { case dwarf::DW_OP_LLVM_fragment: { unsigned SizeInBits = Op->getArg(1); @@ -241,39 +281,45 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, // location. assert(OffsetInBits >= FragmentOffset && "fragment offset not added?"); - // If \a AddMachineReg already emitted DW_OP_piece operations to represent + // If \a addMachineReg already emitted DW_OP_piece operations to represent // a super-register by splicing together sub-registers, subtract the size // of the pieces that was already emitted. SizeInBits -= OffsetInBits - FragmentOffset; - // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a + // If \a addMachineReg requested a DW_OP_bit_piece to stencil out a // sub-register that is smaller than the current fragment's size, use it. if (SubRegisterSizeInBits) SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits); - AddOpPiece(SizeInBits, SubRegisterOffsetInBits); + addOpPiece(SizeInBits, SubRegisterOffsetInBits); setSubRegisterPiece(0, 0); break; } case dwarf::DW_OP_plus: - EmitOp(dwarf::DW_OP_plus_uconst); - EmitUnsigned(Op->getArg(0)); + emitOp(dwarf::DW_OP_plus_uconst); + emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_minus: // There is no OP_minus_uconst. - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); - EmitOp(dwarf::DW_OP_minus); + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Op->getArg(0)); + emitOp(dwarf::DW_OP_minus); break; case dwarf::DW_OP_deref: - EmitOp(dwarf::DW_OP_deref); + emitOp(dwarf::DW_OP_deref); break; case dwarf::DW_OP_constu: - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_stack_value: - AddStackValue(); + addStackValue(); + break; + case dwarf::DW_OP_swap: + emitOp(dwarf::DW_OP_swap); + break; + case dwarf::DW_OP_xderef: + emitOp(dwarf::DW_OP_xderef); break; default: llvm_unreachable("unhandled opcode found in expression"); @@ -281,9 +327,25 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, } } +/// add masking operations to stencil out a subregister. +void DwarfExpression::maskSubRegister() { + assert(SubRegisterSizeInBits && "no subregister was registered"); + if (SubRegisterOffsetInBits > 0) + addShr(SubRegisterOffsetInBits); + uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL; + addAnd(Mask); +} + + void DwarfExpression::finalize() { - if (SubRegisterSizeInBits) - AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); + assert(DwarfRegs.size() == 0 && "dwarf registers not emitted"); + // Emit any outstanding DW_OP_piece operations to mask out subregisters. + if (SubRegisterSizeInBits == 0) + return; + // Don't emit a DW_OP_piece for a subregister at offset 0. 
+ if (SubRegisterOffsetInBits == 0) + return; + addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); } void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { @@ -294,6 +356,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { assert(FragmentOffset >= OffsetInBits && "overlapping or duplicate fragments"); if (FragmentOffset > OffsetInBits) - AddOpPiece(FragmentOffset - OffsetInBits); + addOpPiece(FragmentOffset - OffsetInBits); OffsetInBits = FragmentOffset; } diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h index fd90fa05bc32a..e8dc211eb3c22 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -84,9 +84,19 @@ public: /// entry. class DwarfExpression { protected: - unsigned DwarfVersion; + /// Holds information about all subregisters comprising a register location. + struct Register { + int DwarfRegNo; + unsigned Size; + const char *Comment; + }; + + /// The register location, if any. + SmallVector<Register, 2> DwarfRegs; + /// Current Fragment Offset in Bits. uint64_t OffsetInBits = 0; + unsigned DwarfVersion; /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister. unsigned SubRegisterSizeInBits = 0; @@ -99,35 +109,54 @@ protected: SubRegisterOffsetInBits = OffsetInBits; } -public: - DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - virtual ~DwarfExpression() {}; - - /// This needs to be called last to commit any pending changes. - void finalize(); + /// Add masking operations to stencil out a subregister. + void maskSubRegister(); /// Output a dwarf operand and an optional assembler comment. - virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0; + virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0; /// Emit a raw signed value. - virtual void EmitSigned(int64_t Value) = 0; + virtual void emitSigned(int64_t Value) = 0; /// Emit a raw unsigned value. - virtual void EmitUnsigned(uint64_t Value) = 0; + virtual void emitUnsigned(uint64_t Value) = 0; /// Return whether the given machine register is the frame register in the /// current function. virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0; - /// Emit a dwarf register operation. - void AddReg(int DwarfReg, const char *Comment = nullptr); - /// Emit an (double-)indirect dwarf register operation. - void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false); + /// Emit a DW_OP_reg operation. + void addReg(int DwarfReg, const char *Comment = nullptr); + /// Emit a DW_OP_breg operation. + void addBReg(int DwarfReg, int Offset); + /// Emit DW_OP_fbreg <Offset>. + void addFBReg(int Offset); + + /// Emit a partial DWARF register operation. + /// + /// \param MachineReg The register number. + /// \param MaxSize If the register must be composed from + /// sub-registers this is an upper bound + /// for how many bits the emitted DW_OP_piece + /// may cover. + /// + /// If size and offset is zero an operation for the entire register is + /// emitted: Some targets do not provide a DWARF register number for every + /// register. If this is the case, this function will attempt to emit a DWARF + /// register by emitting a fragment of a super-register or by piecing together + /// multiple subregisters that alias the register. + /// + /// \return false if no DWARF register exists for MachineReg. 
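+  /// For instance, a 16-bit subregister with no DWARF number of its own may
+  /// be described by recording its super-register here and later emitting a
+  /// DW_OP_bit_piece restricted to the 16 bits that the subregister occupies.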
+ bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, + unsigned MaxSize = ~1U); + /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment. /// \param OffsetInBits This is an optional offset into the location that /// is at the top of the DWARF stack. - void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); + void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); - /// Emit a shift-right dwarf expression. - void AddShr(unsigned ShiftBy); + /// Emit a shift-right dwarf operation. + void addShr(unsigned ShiftBy); + /// Emit a bitwise and dwarf operation. + void addAnd(unsigned Mask); /// Emit a DW_OP_stack_value, if supported. /// @@ -140,37 +169,21 @@ public: /// constant value, so the producers and consumers started to rely on /// heuristics to disambiguate the value vs. location status of the /// expression. See PR21176 for more details. - void AddStackValue(); + void addStackValue(); - /// Emit an indirect dwarf register operation for the given machine register. - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg, - int Offset = 0); + ~DwarfExpression() = default; +public: + DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - /// Emit a partial DWARF register operation. - /// - /// \param MachineReg The register number. - /// \param MaxSize If the register must be composed from - /// sub-registers this is an upper bound - /// for how many bits the emitted DW_OP_piece - /// may cover. - /// - /// If size and offset is zero an operation for the entire register is - /// emitted: Some targets do not provide a DWARF register number for every - /// register. If this is the case, this function will attempt to emit a DWARF - /// register by emitting a fragment of a super-register or by piecing together - /// multiple subregisters that alias the register. - /// - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, - unsigned MaxSize = ~1U); + /// This needs to be called last to commit any pending changes. + void finalize(); /// Emit a signed constant. - void AddSignedConstant(int64_t Value); + void addSignedConstant(int64_t Value); /// Emit an unsigned constant. - void AddUnsignedConstant(uint64_t Value); + void addUnsignedConstant(uint64_t Value); /// Emit an unsigned constant. - void AddUnsignedConstant(const APInt &Value); + void addUnsignedConstant(const APInt &Value); /// Emit a machine register location. As an optimization this may also consume /// the prefix of a DwarfExpression if a more efficient representation for @@ -181,7 +194,7 @@ public: /// fragment inside the entire variable. /// \return false if no DWARF register exists /// for MachineReg. - bool AddMachineRegExpression(const TargetRegisterInfo &TRI, + bool addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &Expr, unsigned MachineReg, unsigned FragmentOffsetInBits = 0); /// Emit all remaining operations in the DIExpressionCursor. @@ -189,7 +202,7 @@ public: /// \param FragmentOffsetInBits If this is one fragment out of multiple /// locations, this is the offset of the /// fragment inside the entire variable. 
- void AddExpression(DIExpressionCursor &&Expr, + void addExpression(DIExpressionCursor &&Expr, unsigned FragmentOffsetInBits = 0); /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to @@ -198,33 +211,32 @@ public: }; /// DwarfExpression implementation for .debug_loc entries. -class DebugLocDwarfExpression : public DwarfExpression { +class DebugLocDwarfExpression final : public DwarfExpression { ByteStreamer &BS; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; + bool isFrameRegister(const TargetRegisterInfo &TRI, + unsigned MachineReg) override; public: DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS) : DwarfExpression(DwarfVersion), BS(BS) {} - - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; - bool isFrameRegister(const TargetRegisterInfo &TRI, - unsigned MachineReg) override; }; /// DwarfExpression implementation for singular DW_AT_location. -class DIEDwarfExpression : public DwarfExpression { +class DIEDwarfExpression final : public DwarfExpression { const AsmPrinter &AP; DwarfUnit &DU; DIELoc &DIE; -public: - DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) override; +public: + DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); DIELoc *finalize() { DwarfExpression::finalize(); return &DIE; diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2a866c071f593..bad5b09553cdc 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -54,15 +54,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, : DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU), DIE(DIE) {} -void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) { +void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) { DU.addUInt(DIE, dwarf::DW_FORM_data1, Op); } -void DIEDwarfExpression::EmitSigned(int64_t Value) { +void DIEDwarfExpression::emitSigned(int64_t Value) { DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value); } -void DIEDwarfExpression::EmitUnsigned(uint64_t Value) { +void DIEDwarfExpression::emitUnsigned(uint64_t Value) { DU.addUInt(DIE, dwarf::DW_FORM_udata, Value); } @@ -98,25 +98,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const { default: break; - case dwarf::DW_LANG_C89: - case dwarf::DW_LANG_C99: + // The languages below have valid values in all DWARF versions. case dwarf::DW_LANG_C: + case dwarf::DW_LANG_C89: case dwarf::DW_LANG_C_plus_plus: - case dwarf::DW_LANG_ObjC: - case dwarf::DW_LANG_ObjC_plus_plus: return 0; case dwarf::DW_LANG_Fortran77: case dwarf::DW_LANG_Fortran90: - case dwarf::DW_LANG_Fortran95: return 1; - // The languages below have valid values only if the DWARF version >= 4. + // The languages below have valid values only if the DWARF version >= 3. 
+ case dwarf::DW_LANG_C99: + case dwarf::DW_LANG_ObjC: + case dwarf::DW_LANG_ObjC_plus_plus: + if (DD->getDwarfVersion() >= 3) + return 0; + break; + + case dwarf::DW_LANG_Fortran95: + if (DD->getDwarfVersion() >= 3) + return 1; + break; + + // Starting with DWARF v4, all defined languages have valid values. + case dwarf::DW_LANG_D: case dwarf::DW_LANG_Java: case dwarf::DW_LANG_Python: case dwarf::DW_LANG_UPC: - case dwarf::DW_LANG_D: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 0; break; @@ -127,31 +137,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const { case dwarf::DW_LANG_Modula2: case dwarf::DW_LANG_Pascal83: case dwarf::DW_LANG_PLI: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 1; break; - // The languages below have valid values only if the DWARF version >= 5. - case dwarf::DW_LANG_OpenCL: - case dwarf::DW_LANG_Go: - case dwarf::DW_LANG_Haskell: + // The languages below are new in DWARF v5. + case dwarf::DW_LANG_BLISS: + case dwarf::DW_LANG_C11: case dwarf::DW_LANG_C_plus_plus_03: case dwarf::DW_LANG_C_plus_plus_11: + case dwarf::DW_LANG_C_plus_plus_14: + case dwarf::DW_LANG_Dylan: + case dwarf::DW_LANG_Go: + case dwarf::DW_LANG_Haskell: case dwarf::DW_LANG_OCaml: + case dwarf::DW_LANG_OpenCL: + case dwarf::DW_LANG_RenderScript: case dwarf::DW_LANG_Rust: - case dwarf::DW_LANG_C11: case dwarf::DW_LANG_Swift: - case dwarf::DW_LANG_Dylan: - case dwarf::DW_LANG_C_plus_plus_14: - if (dwarf::DWARF_VERSION >= 5) + if (DD->getDwarfVersion() >= 5) return 0; break; - case dwarf::DW_LANG_Modula3: - case dwarf::DW_LANG_Julia: case dwarf::DW_LANG_Fortran03: case dwarf::DW_LANG_Fortran08: - if (dwarf::DWARF_VERSION >= 5) + case dwarf::DW_LANG_Julia: + case dwarf::DW_LANG_Modula3: + if (DD->getDwarfVersion() >= 5) return 1; break; } @@ -285,13 +297,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) { dwarf::DW_FORM_ref_sig8, DIEInteger(Signature)); } -void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier) { - uint64_t Signature = DD->makeTypeSignature(Identifier); - Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, - DIEInteger(Signature)); -} - void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry) { const DIEUnit *CU = Die.getUnit(); @@ -465,50 +470,47 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Decode the original location, and use that as the start of the byref // variable's location. DIELoc *Loc = new (DIEValueAllocator) DIELoc; - SmallVector<uint64_t, 6> DIExpr; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); - - if (!validReg) - return; + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + SmallVector<uint64_t, 9> Ops; + if (Location.isIndirect()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + Ops.push_back(dwarf::DW_OP_deref); + } // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). if (isPointer) - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Next add the offset for the '__forwarding' field: // DW_OP_plus_uconst ForwardingFieldOffset. 
Note there's no point in // adding the offset if it's 0. if (forwardingFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(forwardingFieldOffset); + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(forwardingFieldOffset); } // Now dereference the __forwarding field to get to the real __Block_byref // struct: DW_OP_deref. - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Now that we've got the real __Block_byref... struct, add the offset // for the variable's field to get to the location of the actual variable: // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0. if (varFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(varFieldOffset); + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(varFieldOffset); } - Expr.AddExpression(makeArrayRef(DIExpr)); - Expr.finalize(); + + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Return true if type encoding is unsigned. @@ -672,7 +674,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { return getDIE(Context); } -DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { +DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) { auto *Context = resolve(Ty->getScope()); DIE *ContextDIE = getOrCreateContextDIE(Context); @@ -684,8 +686,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); - if (!Ty->isExternalTypeRef()) - updateAcceleratorTables(Context, Ty, TyDIE); + updateAcceleratorTables(Context, Ty, TyDIE); return &TyDIE; } @@ -841,6 +842,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // Add source line info if available and TyDesc is not a forward declaration. if (!DTy->isForwardDecl()) addSourceLine(Buffer, DTy); + + // If DWARF address space value is other than None, add it for pointer and + // reference types as DW_AT_address_class. + if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type || + Tag == dwarf::DW_TAG_reference_type)) + addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4, + DTy->getDWARFAddressSpace().getValue()); } void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { @@ -892,13 +900,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { - if (CTy->isExternalTypeRef()) { - StringRef Identifier = CTy->getIdentifier(); - assert(!Identifier.empty() && "external type ref without identifier"); - addFlag(Buffer, dwarf::DW_AT_declaration); - return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); - } - // Add name if not anonymous or intermediate type. StringRef Name = CTy->getName(); @@ -1180,8 +1181,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, } void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal) { - if (!Minimal) + bool SkipSPAttributes) { + // If -fdebug-info-for-profiling is enabled, need to emit the subprogram + // and its source location. 
+ bool SkipSPSourceLocation = SkipSPAttributes && + !CUNode->getDebugInfoForProfiling(); + if (!SkipSPSourceLocation) if (applySubprogramDefinitionAttributes(SP, SPDie)) return; @@ -1189,12 +1194,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (!SP->getName().empty()) addString(SPDie, dwarf::DW_AT_name, SP->getName()); + if (!SkipSPSourceLocation) + addSourceLine(SPDie, SP); + // Skip the rest of the attributes under -gmlt to save space. - if (Minimal) + if (SkipSPAttributes) return; - addSourceLine(SPDie, SP); - // Add the prototype if we have a prototype and we have a C like // language. uint16_t Language = getLanguage(); @@ -1526,18 +1532,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { return &StaticMemberDIE; } -void DwarfUnit::emitHeader(bool UseOffsets) { +void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { // Emit size of content not including length itself Asm->OutStreamer->AddComment("Length of Unit"); Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize()); Asm->OutStreamer->AddComment("DWARF version number"); - Asm->EmitInt16(DD->getDwarfVersion()); - Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); + unsigned Version = DD->getDwarfVersion(); + Asm->EmitInt16(Version); + + // DWARF v5 reorders the address size and adds a unit type. + if (Version >= 5) { + Asm->OutStreamer->AddComment("DWARF Unit Type"); + Asm->EmitInt8(UT); + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); + } // We share one abbreviations table across all units so it's always at the // start of the section. Use a relocatable offset where needed to ensure // linking doesn't invalidate that offset. + Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); if (UseOffsets) Asm->EmitInt32(0); @@ -1545,12 +1560,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) { Asm->emitDwarfSymbolReference( TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false); - Asm->OutStreamer->AddComment("Address Size (in bytes)"); - Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); + if (Version <= 4) { + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); + } } void DwarfTypeUnit::emitHeader(bool UseOffsets) { - DwarfUnit::emitHeader(UseOffsets); + DwarfUnit::emitCommonHeader(UseOffsets, + DD->useSplitDwarf() ? dwarf::DW_UT_split_type + : dwarf::DW_UT_type); Asm->OutStreamer->AddComment("Type Signature"); Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature)); Asm->OutStreamer->AddComment("Type DIE Offset"); @@ -1564,3 +1583,13 @@ bool DwarfTypeUnit::isDwoUnit() const { // when split DWARF is being used. return DD->useSplitDwarf(); } + +void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalNameForTypeUnit(Name, Context); +} + +void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalTypeUnitType(Ty, Context); +} diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8654d6f0caf43..d626ef920f956 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -124,12 +124,12 @@ public: std::string getParentContextString(const DIScope *Context) const; /// Add a new global name to the compile unit. 
- virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) { - } + virtual void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) = 0; /// Add a new global type to the compile unit. virtual void addGlobalType(const DIType *Ty, const DIE &Die, - const DIScope *Context) {} + const DIScope *Context) = 0; /// Returns the DIE map slot for the specified debug variable. /// @@ -198,9 +198,6 @@ public: /// Add a type's DW_AT_signature and set the declaration flag. void addDIETypeSignature(DIE &Die, uint64_t Signature); - /// Add an attribute containing the type signature for a unique identifier. - void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier); /// Add block data. void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); @@ -256,15 +253,12 @@ public: DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false); void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal = false); + bool SkipSPAttributes = false); /// Find existing DIE or create new DIE for the given type. DIE *getOrCreateTypeDIE(const MDNode *N); /// Get context owner's DIE. - DIE *createTypeDIE(const DICompositeType *Ty); - - /// Get context owner's DIE. DIE *getOrCreateContextDIE(const DIScope *Context); /// Construct DIEs for types that contain vtables. @@ -282,11 +276,13 @@ public: virtual unsigned getHeaderSize() const { return sizeof(int16_t) + // DWARF version number sizeof(int32_t) + // Offset Into Abbrev. Section - sizeof(int8_t); // Pointer Size (in bytes) + sizeof(int8_t) + // Pointer Size (in bytes) + (DD->getDwarfVersion() >= 5 ? sizeof(int8_t) + : 0); // DWARF v5 unit type } /// Emit the header for this unit, not including the initial length field. - virtual void emitHeader(bool UseOffsets); + virtual void emitHeader(bool UseOffsets) = 0; virtual DwarfCompileUnit &getCU() = 0; @@ -306,6 +302,14 @@ protected: return Ref.resolve(); } + /// If this is a named finished type then include it in the list of types for + /// the accelerator tables. + void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, + const DIE &TyDIE); + + /// Emit the common part of the header for this unit. + void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT); + private: void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy); void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy); @@ -330,11 +334,6 @@ private: /// Set D as anonymous type for index which can be reused later. void setIndexTyDie(DIE *D) { IndexTyDie = D; } - /// If this is a named finished type then include it in the list of types for - /// the accelerator tables. - void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, - const DIE &TyDIE); - virtual bool isDwoUnit() const = 0; }; @@ -354,12 +353,19 @@ public: void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; } void setType(const DIE *Ty) { this->Ty = Ty; } + /// Get context owner's DIE. + DIE *createTypeDIE(const DICompositeType *Ty); + /// Emit the header for this unit, not including the initial length field. 
void emitHeader(bool UseOffsets) override; unsigned getHeaderSize() const override { return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature sizeof(uint32_t); // Type DIE Offset } + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + void addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) override; DwarfCompileUnit &getCU() override { return CU; } }; } // end llvm namespace diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 6a023b998b326..342efc3611c78 100644 --- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -1,4 +1,4 @@ -//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===// +//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===// // // The LLVM Compiler Infrastructure // @@ -14,21 +14,19 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/ELF.h" using namespace llvm; @@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter { public: void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override; }; -} + +} // end anonymous namespace static GCMetadataPrinterRegistry::Add<ErlangGCPrinter> X("erlang", "erlang-compatible garbage collector"); -void llvm::linkErlangGCPrinter() {} - void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { MCStreamer &OS = *AP.OutStreamer; @@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, } } } + +void llvm::linkErlangGCPrinter() {} diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 9d7c96a1b8efd..704f0ac2f1919 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) { const Function *F = MF->getFunction(); - shouldEmitMoves = Asm->needsSEHMoves(); + shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI(); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); unsigned PerEncoding = TLOF.getPersonalityEncoding(); @@ -94,7 +94,7 @@ void WinException::beginFunction(const MachineFunction *MF) { // If we're not using CFI, we don't want the CFI or the personality, but we // might want EH tables if we had EH pads. - if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) { + if (!Asm->MAI->usesWindowsCFI()) { if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) { // If this is 32-bit SEH and we don't have any funclets (really invokes), // make sure we emit the parent offset label. 
Some unreferenced filter diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index bf5cf105a8f86..9c19a4fd3c3e0 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -1532,7 +1532,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( Type *ResultTy; SmallVector<Value *, 6> Args; - AttributeSet Attr; + AttributeList Attr; // 'size' argument. if (!UseSizedLibcall) { @@ -1593,7 +1593,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( // Now, the return type. if (CASExpected) { ResultTy = Type::getInt1Ty(Ctx); - Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt); + Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt); } else if (HasResult && UseSizedLibcall) ResultTy = SizedIntTy; else diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp new file mode 100644 index 0000000000000..efdf300df8506 --- /dev/null +++ b/lib/CodeGen/BranchCoalescing.cpp @@ -0,0 +1,758 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "coal-branch" + +static cl::opt<cl::boolOrDefault> + EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, + cl::desc("enable coalescing of duplicate branches")); + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +//===----------------------------------------------------------------------===// +// BranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. 
+///
+/// For example, consider the following LLVM IR:
+///
+/// %test = icmp eq i32 %x, 0
+/// %tmp1 = select i1 %test, double %a, double 2.000000e-03
+/// %tmp2 = select i1 %test, double %b, double 5.000000e-03
+///
+/// This IR expands to the following machine code on PowerPC:
+///
+/// BB#0: derived from LLVM BB %entry
+///    Live Ins: %F1 %F3 %X6
+///        <SNIP1>
+///        %vreg0<def> = COPY %F1; F8RC:%vreg0
+///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+///                    mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+///        BCC 76, %vreg5, <BB#2>; CRRC:%vreg5
+///    Successors according to CFG: BB#1(?%) BB#2(?%)
+///
+/// BB#1: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0
+///    Successors according to CFG: BB#2(?%)
+///
+/// BB#2: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0 BB#1
+///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+///                    F8RC:%vreg9,%vreg8,%vreg0
+///        <SNIP2>
+///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+///    Successors according to CFG: BB#3(?%) BB#4(?%)
+///
+/// BB#3: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#2
+///    Successors according to CFG: BB#4(?%)
+///
+/// BB#4: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#2 BB#3
+///        %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>;
+///                    F8RC:%vreg13,%vreg12,%vreg2
+///        <SNIP3>
+///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// When this pattern is detected, branch coalescing will try to collapse
+/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3.
+///
+/// If all conditions are met, the IR should collapse to:
+///
+/// BB#0: derived from LLVM BB %entry
+///    Live Ins: %F1 %F3 %X6
+///        <SNIP1>
+///        %vreg0<def> = COPY %F1; F8RC:%vreg0
+///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+///                    mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+///        <SNIP2>
+///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+///    Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%)
+///      BB#4(0x55555554 / 0x80000000 = 66.67%)
+///
+/// BB#1: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0
+///    Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%)
+///
+/// BB#4: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0 BB#1
+///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+///                    F8RC:%vreg9,%vreg8,%vreg0
+///        %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>;
+///                    F8RC:%vreg13,%vreg12,%vreg2
+///        <SNIP3>
+///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// Branch Coalescing does not split blocks; it moves everything in the same
+/// direction, ensuring it does not break use/definition semantics.
+///
+/// PHI nodes and their corresponding use instructions are moved to the
+/// successor block if there are no uses of them within the successor block's
+/// PHI nodes. PHI node ordering cannot be assumed.
+///
+/// Non-PHI instructions can be moved up to the predecessor basic block or down
+/// to the successor basic block following any PHI instructions. Whether an
+/// instruction moves up or down depends on whether the register(s) it defines
+/// are used in the current block or in any PHI instructions at the beginning
+/// of the successor block.
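+///
+/// For example, an instruction in BB#2 whose result is used by a PHI node at
+/// the top of BB#4 has to move up into BB#0, whereas an instruction in BB#2
+/// that reads a value defined by one of BB#2's own PHI nodes has to move down
+/// into BB#4, after those PHI nodes.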
+ +namespace { + +class BranchCoalescing : public MachineFunctionPass { + struct CoalescingCandidateInfo { + MachineBasicBlock *BranchBlock; // Block containing the branch + MachineBasicBlock *BranchTargetBlock; // Block branched to + MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken + SmallVector<MachineOperand, 4> Cond; + bool MustMoveDown; + bool MustMoveUp; + + CoalescingCandidateInfo(); + void clear(); + }; + + MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + void initialize(MachineFunction &F); + bool canCoalesceBranch(CoalescingCandidateInfo &Cand); + bool identicalOperands(ArrayRef<MachineOperand> OperandList1, + ArrayRef<MachineOperand> OperandList2) const; + bool validateCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + + static bool isBranchCoalescingEnabled() { + return EnableBranchCoalescing == cl::BOU_TRUE; + } + +public: + static char ID; + + BranchCoalescing() : MachineFunctionPass(ID) { + initializeBranchCoalescingPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Branch Coalescing"; } + + bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion); + bool canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB, + MachineBasicBlock *TargetRegionMBB); + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char BranchCoalescing::ID = 0; +char &llvm::BranchCoalescingID = BranchCoalescing::ID; + +INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing", + "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing", + false, false) + +BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() + : BranchBlock(nullptr), BranchTargetBlock(nullptr), + FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void BranchCoalescing::CoalescingCandidateInfo::clear() { + BranchBlock = nullptr; + BranchTargetBlock = nullptr; + FallThroughBlock = nullptr; + Cond.clear(); + MustMoveDown = false; + MustMoveUp = false; +} + +void BranchCoalescing::initialize(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement to determine if it can be coalesced. This +/// method analyses the branch statement for the given candidate to determine +/// if it can be coalesced. If the branch can be coalesced, then the +/// BranchTargetBlock and the FallThroughBlock are recorded in the specified +/// Candidate. 
+/// +///\param[in,out] Cand The coalescing candidate to analyze +///\return true if and only if the branch can be coalesced, false otherwise +/// +bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { + DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() + << " can be coalesced:"); + MachineBasicBlock *FalseMBB = nullptr; + + if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, + Cand.Cond)) { + DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); + return false; + } + + for (auto &I : Cand.BranchBlock->terminators()) { + DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); + if (!I.isBranch()) + continue; + + if (I.getNumOperands() != I.getNumExplicitOperands()) { + DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I + << "\n"); + return false; + } + } + + if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { + DEBUG(dbgs() << "EH Pad - skip\n"); + return false; + } + + // For now only consider triangles (i.e, BranchTargetBlock is set, + // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) + if (!Cand.BranchTargetBlock || FalseMBB || + !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() << "Does not form a triangle - skip\n"); + return false; + } + + // Ensure there are only two successors + if (Cand.BranchBlock->succ_size() != 2) { + DEBUG(dbgs() << "Does not have 2 successors - skip\n"); + return false; + } + + // Sanity check - the block must be able to fall through + assert(Cand.BranchBlock->canFallThrough() && + "Expecting the block to fall through!"); + + // We have already ensured there are exactly two successors to + // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. + // Ensure the single fall though block is empty. + MachineBasicBlock *Succ = + (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) + ? *Cand.BranchBlock->succ_rbegin() + : *Cand.BranchBlock->succ_begin(); + + assert(Succ && "Expecting a valid fall-through block\n"); + + if (!Succ->empty()) { + DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); + return false; + } + + if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() + << "Successor of fall through block is not branch taken block\n"); + return false; + } + + Cand.FallThroughBlock = Succ; + DEBUG(dbgs() << "Valid Candidate\n"); + return true; +} + +/// +/// Determine if the two operand lists are identical +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operands lists are identical +/// +bool BranchCoalescing::identicalOperands( + ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + + if (OpList1.size() != OpList2.size()) { + DEBUG(dbgs() << "Operand list is different size\n"); + return false; + } + + for (unsigned i = 0; i < OpList1.size(); ++i) { + const MachineOperand &Op1 = OpList1[i]; + const MachineOperand &Op2 = OpList2[i]; + + DEBUG(dbgs() << "Op1: " << Op1 << "\n" + << "Op2: " << Op2 << "\n"); + + if (Op1.isIdenticalTo(Op2)) { + DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); + continue; + } + + // If the operands are not identical, but are registers, check to see if the + // definition of the register produces the same value. If they produce the + // same value, consider them to be identical. 
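+    // For example, two virtual registers defined by identical load-immediate
+    // instructions are considered identical operands here even though the
+    // register numbers differ.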
+    if (Op1.isReg() && Op2.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(Op1.getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(Op2.getReg())) {
+      MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
+      MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
+      if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
+        DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+                     << " produce the same value!\n");
+      } else {
+        DEBUG(dbgs() << "Operands produce different values\n");
+        return false;
+      }
+    } else {
+      DEBUG(dbgs() << "The operands are not provably identical.\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+///
+/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB
+/// and updates them to refer to the new block. PHI node ordering
+/// cannot be assumed so it does not matter where the PHI instructions
+/// are moved to in TargetMBB.
+///
+/// \param[in] SourceMBB block to move PHI instructions from
+/// \param[in] TargetMBB block to move PHI instructions to
+///
+void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
+                                         MachineBasicBlock *TargetMBB) {
+
+  MachineBasicBlock::iterator MI = SourceMBB->begin();
+  MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
+
+  if (MI == ME) {
+    DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+    return;
+  }
+
+  // Update all PHI instructions in SourceMBB and move to top of TargetMBB
+  for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) {
+    MachineInstr &PHIInst = *Iter;
+    for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) {
+      MachineOperand &MO = PHIInst.getOperand(i);
+      if (MO.getMBB() == SourceMBB)
+        MO.setMBB(TargetMBB);
+    }
+  }
+  TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME);
+}
+
+///
+/// This function checks if MI can be moved to the beginning of the TargetMBB
+/// following PHI instructions. An MI instruction can be moved to the beginning
+/// of the TargetMBB if there are no uses of it within the TargetMBB PHI nodes.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
+                                          const MachineBasicBlock &TargetMBB
+                                          ) const {
+
+  DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+               << TargetMBB.getNumber() << "\n");
+
+  for (auto &Def : MI.defs()) { // Looking at Def
+    for (auto &Use : MRI->use_instructions(Def.getReg())) {
+      if (Use.isPHI() && Use.getParent() == &TargetMBB) {
+        DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+        return false;
+      }
+    }
+  }
+
+  DEBUG(dbgs() << " Safe to move to the beginning.\n");
+  return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of the TargetMBB,
+/// immediately before the first terminator. An MI instruction can be moved
+/// to the end of the TargetMBB if no PHI node defines what MI uses within
+/// its own MBB.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI,
+                                    const MachineBasicBlock &TargetMBB
+                                    ) const {
+
+  DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+               << TargetMBB.getNumber() << "\n");
+
+  for (auto &Use : MI.uses()) {
+    if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+      MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+      if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+        DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+        return false;
+      } else {
+        DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+      }
+    }
+  }
+
+  DEBUG(dbgs() << " Safe to move to the end.\n");
+  return true;
+}
+
+///
+/// This method checks to ensure the two coalescing candidates follow the
+/// expected pattern required for coalescing.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion; false otherwise.
+///
+bool BranchCoalescing::validateCandidates(
+    CoalescingCandidateInfo &SourceRegion,
+    CoalescingCandidateInfo &TargetRegion) const {
+
+  if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock)
+    llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+  else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock))
+    llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+  else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock))
+    llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+  else if (!TargetRegion.FallThroughBlock->empty() ||
+           !SourceRegion.FallThroughBlock->empty())
+    llvm_unreachable("Expecting fall-through blocks to be empty");
+
+  return true;
+}
+
+///
+/// This method determines whether the two coalescing candidates can be merged.
+/// In order to be merged, all instructions must be able to
+///   1. Move to the beginning of the SourceRegion.BranchTargetBlock;
+///   2. Move to the end of the TargetRegion.BranchBlock.
+/// Merging involves moving the instructions in the
+/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock).
+///
+/// This function first tries to move instructions from the
+/// TargetRegion.BranchTargetBlock down, to the beginning of the
+/// SourceRegion.BranchTargetBlock. This is not possible if any register defined
+/// in TargetRegion.BranchTargetBlock is used in a PHI node in the
+/// SourceRegion.BranchTargetBlock. In this case, check whether the statement
+/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately
+/// before the branch statement). If it cannot move, then these blocks cannot
+/// be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. An assert is thrown if they
+/// are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion, false otherwise.
+/// +bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Walk through PHI nodes first and see if they force the merge into the + // SourceRegion.BranchTargetBlock. + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->instr_begin(), + E = SourceRegion.BranchBlock->getFirstNonPHI(); + I != E; ++I) { + for (auto &Def : I->defs()) + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) { + DEBUG(dbgs() << "PHI " << *I << " defines register used in another " + "PHI within branch target block -- can't merge\n"); + NumPHINotMoved++; + return false; + } + if (Use.getParent() == SourceRegion.BranchBlock) { + DEBUG(dbgs() << "PHI " << *I + << " defines register used in this " + "block -- all must move down\n"); + SourceRegion.MustMoveDown = true; + } + } + } + + // Walk through the MI to see if they should be merged into + // TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down) + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->getFirstNonPHI(), + E = SourceRegion.BranchBlock->end(); + I != E; ++I) { + if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move down - must move up!\n"); + SourceRegion.MustMoveUp = true; + } + if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move up - must move down!\n"); + SourceRegion.MustMoveDown = true; + } + } + + return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true; +} + +/// Merge the instructions from SourceRegion.BranchBlock, +/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into +/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and +/// TargetRegion.FallThroughBlock respectively. +/// +/// The successors for blocks in TargetRegion will be updated to use the +/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion +/// will be removed from the function. +/// +/// A region consists of a BranchBlock, a FallThroughBlock, and a +/// BranchTargetBlock. Branch coalesce works on patterns where the +/// TargetRegion's BranchTargetBlock must also be the SourceRegions's +/// BranchBlock. 
+/// +/// Before mergeCandidates: +/// +/// +---------------------------+ +/// | TargetRegion.BranchBlock | +/// +---------------------------+ +/// / | +/// / +--------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | TargetRegion.BranchTargetBlock | +/// | SourceRegion.BranchBlock | +/// +----------------------------------+ +/// / | +/// / +--------------------------------+ +/// | | SourceRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// After mergeCandidates: +/// +/// +-----------------------------+ +/// | TargetRegion.BranchBlock | +/// | SourceRegion.BranchBlock | +/// +-----------------------------+ +/// / | +/// / +---------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// | | SourceRegion.FallThroughBlock | +/// \ +---------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) { + + if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { + llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); + return false; + } + + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Start the merging process by first handling the BranchBlock. + // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block + moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + + // Move remaining instructions in SourceRegion.BranchBlock into + // TargetRegion.BranchBlock + MachineBasicBlock::iterator firstInstr = + SourceRegion.BranchBlock->getFirstNonPHI(); + MachineBasicBlock::iterator lastInstr = + SourceRegion.BranchBlock->getFirstTerminator(); + + MachineBasicBlock *Source = SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock + : TargetRegion.BranchBlock; + + MachineBasicBlock::iterator Target = + SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock->getFirstNonPHI() + : TargetRegion.BranchBlock->getFirstTerminator(); + + Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + + // Once PHI and instructions have been moved we need to clean up the + // control flow. + + // Remove SourceRegion.FallThroughBlock before transferring successors of + // SourceRegion.BranchBlock to TargetRegion.BranchBlock. + SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); + TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.BranchBlock); + // Update branch in TargetRegion.BranchBlock to jump to + // SourceRegion.BranchTargetBlock + // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
+  TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+      SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+  // Remove the branch statement(s) in SourceRegion.BranchBlock
+  MachineBasicBlock::iterator I =
+      SourceRegion.BranchBlock->terminators().begin();
+  while (I != SourceRegion.BranchBlock->terminators().end()) {
+    MachineInstr &CurrInst = *I;
+    ++I;
+    if (CurrInst.isBranch())
+      CurrInst.eraseFromParent();
+  }
+
+  // Fall-through block should be empty since this is part of the condition
+  // to coalesce the branches.
+  assert(TargetRegion.FallThroughBlock->empty() &&
+         "FallThroughBlocks should be empty!");
+
+  // Transfer successor information and move PHIs down to the
+  // branch-taken block.
+  TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+      SourceRegion.FallThroughBlock);
+  TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+  // Remove the blocks from the function.
+  assert(SourceRegion.BranchBlock->empty() &&
+         "Expecting branch block to be empty!");
+  SourceRegion.BranchBlock->eraseFromParent();
+
+  assert(SourceRegion.FallThroughBlock->empty() &&
+         "Expecting fall-through block to be empty!\n");
+  SourceRegion.FallThroughBlock->eraseFromParent();
+
+  NumBlocksCoalesced++;
+  return true;
+}
+
+bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+  if (skipFunction(*MF.getFunction()) || MF.empty() ||
+      !isBranchCoalescingEnabled())
+    return false;
+
+  bool didSomething = false;
+
+  DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+  initialize(MF);
+
+  DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+  CoalescingCandidateInfo Cand1, Cand2;
+  // Walk over blocks and find candidates to merge
+  // Continue trying to merge with the first candidate found, as long as merging
+  // is successful.
+ for (MachineBasicBlock &MBB : MF) { + bool MergedCandidates = false; + do { + MergedCandidates = false; + Cand1.clear(); + Cand2.clear(); + + Cand1.BranchBlock = &MBB; + + // If unable to coalesce the branch, then continue to next block + if (!canCoalesceBranch(Cand1)) + break; + + Cand2.BranchBlock = Cand1.BranchTargetBlock; + if (!canCoalesceBranch(Cand2)) + break; + + // Sanity check + // The branch-taken block of the second candidate should post-dominate the + // first candidate + assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && + "Branch-taken block should post-dominate first candidate"); + + if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { + DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " + << Cand2.BranchBlock->getNumber() + << " have different branches\n"); + break; + } + if (!canMerge(Cand2, Cand1)) { + DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand2.BranchBlock->getNumber() << "\n"); + NumBlocksNotCoalesced++; + continue; + } + DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); + MergedCandidates = mergeCandidates(Cand2, Cand1); + if (MergedCandidates) + didSomething = true; + + DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); + } while (MergedCandidates); + } + +#ifndef NDEBUG + // Verify MF is still valid after branch coalescing + if (didSomething) + MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + + DEBUG(dbgs() << "Finished Branch Coalescing\n"); + return didSomething; +} diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 6fba161033b02..2d01301402f04 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -49,6 +50,7 @@ STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); STATISTIC(NumTailMerge , "Number of block tails merged"); STATISTIC(NumHoist , "Number of times common instructions are hoisted"); +STATISTIC(NumTailCalls, "Number of tail calls optimized"); static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge", cl::init(cl::BOU_UNSET), cl::Hidden); @@ -123,8 +125,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, } } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG. void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); @@ -144,9 +144,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { MLI->removeBlock(MBB); } -/// OptimizeFunction - Perhaps branch folding, tail merging and other -/// CFG optimizations on the given function. Block placement changes the layout -/// and may create new tail merging opportunities. 
bool BranchFolder::OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, @@ -348,8 +345,6 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, return TailLen; } -/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest. void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest) { TII->ReplaceTailWithBranchTo(OldInst, NewDest); @@ -362,9 +357,6 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, ++NumTailMerge; } -/// SplitMBBAt - Given a machine basic block and an iterator into it, split the -/// MBB so that the part before the iterator falls into the part starting at the -/// iterator. This returns the new MBB. MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB) { @@ -388,7 +380,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end()); // NewMBB belongs to the same loop as CurMBB. - if (MLI) + if (MLI) if (MachineLoop *ML = MLI->getLoopFor(&CurMBB)) ML->addBasicBlockToLoop(NewMBB, MLI->getBase()); @@ -436,7 +428,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB)); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = CurMBB->findBranchDebugLoc(); if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { MachineBasicBlock *NextBB = &*I; if (TBB == NextBB && !Cond.empty() && !FBB) { @@ -497,6 +489,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS, return MBFI.printBlockFreq(OS, Freq); } +void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) { + MBFI.view(Name, isSimple); +} + +uint64_t +BranchFolder::MBFIWrapper::getEntryFreq() const { + return MBFI.getEntryFreq(); +} + /// CountTerminators - Count the number of terminators in the given /// block and set I to the position of the first non-terminator, if there /// is one, or MBB->end() otherwise. @@ -516,6 +517,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, return NumTerms; } +/// A no successor, non-return block probably ends in unreachable and is cold. +/// Also consider a block that ends in an indirect branch to be a return block, +/// since many targets use plain indirect branches to return. +static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) { + if (!MBB->succ_empty()) + return false; + if (MBB->empty()) + return true; + return !(MBB->back().isReturn() || MBB->back().isIndirectBranch()); +} + /// ProfitableToMerge - Check if two machine basic blocks have a common tail /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction @@ -570,6 +582,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, return true; } + // If these are identical non-return blocks with no successors, merge them. + // Such blocks are typically cold calls to noreturn functions like abort, and + // are unlikely to become a fallthrough target after machine block placement. + // Tail merging these blocks is unlikely to create additional unconditional + // branches, and will reduce the size of this cold code. 
+ if (I1 == MBB1->begin() && I2 == MBB2->begin() && + blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2)) + return true; + // If one of the blocks can be completely merged and happens to be in // a position where the other could fall through into it, merge any number // of instructions, because it can be done without a branch. @@ -579,6 +600,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin()) return true; + // If both blocks are identical and end in a branch, merge them unless they + // both have a fallthrough predecessor and successor. + // We can only do this after block placement because it depends on whether + // there are fallthroughs, and we don't know until after layout. + if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) { + auto BothFallThrough = [](MachineBasicBlock *MBB) { + if (MBB->succ_size() != 0 && !MBB->canFallThrough()) + return false; + MachineFunction::iterator I(MBB); + MachineFunction *MF = MBB->getParent(); + return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough(); + }; + if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2)) + return true; + } + // If both blocks have an unconditional branch temporarily stripped out, // count that as an additional common instruction for the following // heuristics. This heuristic is only accurate for single-succ blocks, so to @@ -604,16 +641,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, (I1 == MBB1->begin() || I2 == MBB2->begin()); } -/// ComputeSameTails - Look through all the blocks in MergePotentials that have -/// hash CurHash (guaranteed to match the last element). Build the vector -/// SameTails of all those that have the (same) largest number of instructions -/// in common of any pair of these blocks. SameTails entries contain an -/// iterator into MergePotentials (from which the MachineBasicBlock can be -/// found) and a MachineBasicBlock::iterator into that MBB indicating the -/// instruction where the matching code sequence begins. -/// Order of elements in SameTails is the reverse of the order in which -/// those blocks appear in MergePotentials (where they are not necessarily -/// consecutive). unsigned BranchFolder::ComputeSameTails(unsigned CurHash, unsigned MinCommonTailLength, MachineBasicBlock *SuccBB, @@ -650,8 +677,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash, return maxCommonTailLength; } -/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from -/// MergePotentials, restoring branches at ends of blocks as appropriate. void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB) { @@ -671,8 +696,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MergePotentials.erase(CurMPIter, MergePotentials.end()); } -/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist -/// only of the common tail. Create a block that does by splitting one. 
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, @@ -723,6 +746,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } +void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) { + MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); + + std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size()); + for (unsigned int i = 0 ; i != SameTails.size() ; ++i) { + if (i != commonTailIndex) + NextCommonInsts[i] = SameTails[i].getTailStartPos(); + else { + assert(SameTails[i].getTailStartPos() == MBB->begin() && + "MBB is not a common tail only block"); + } + } + + for (auto &MI : *MBB) { + if (MI.isDebugValue()) + continue; + DebugLoc DL = MI.getDebugLoc(); + for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) { + if (i == commonTailIndex) + continue; + + auto &Pos = NextCommonInsts[i]; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + while (Pos->isDebugValue()) { + ++Pos; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + } + assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!"); + DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc()); + NextCommonInsts[i] = ++Pos; + } + MI.setDebugLoc(DL); + } +} + static void mergeOperations(MachineBasicBlock::iterator MBBIStartPos, MachineBasicBlock &MBBCommon) { @@ -875,10 +935,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, // Recompute common tail MBB's edge weights and block frequency. setCommonTailEdgeWeights(*MBB); - // Remove the original debug location from the common tail. - for (auto &MI : *MBB) - if (!MI.isDebugValue()) - MI.setDebugLoc(DebugLoc()); + // Merge debug locations across identical instructions for common tail. + MergeCommonTailDebugLocs(commonTailIndex); // MBB is common tail. Adjust all other BB's to jump to this one. // Traversal must be forwards so erases work. @@ -1043,7 +1101,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // Remove the unconditional branch at the end, if any. if (TBB && (Cond.empty() || FBB)) { - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = PBB->findBranchDebugLoc(); TII->removeBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now @@ -1193,8 +1251,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) { return DebugLoc(); } -/// OptimizeBlock - Analyze and optimize control flow related to the specified -/// block. This is never called on the entry block. bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); @@ -1386,6 +1442,42 @@ ReoptimizeBlock: } } + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && + MF.getFunction()->optForSize()) { + // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch + // direction, thereby defeating careful block placement and regressing + // performance. Therefore, only consider this for optsize functions. 
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); + if (TII->isUnconditionalTailCall(TailCall)) { + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector<MachineOperand, 4> PredCond; + bool PredAnalyzable = + !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + + if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) { + // The predecessor has a conditional branch to this block which consists + // of only a tail call. Try to fold the tail call into the conditional + // branch. + if (TII->canMakeTailCallConditional(PredCond, TailCall)) { + // TODO: It would be nice if analyzeBranch() could provide a pointer + // to the branch insturction so replaceBranchWithTailCall() doesn't + // have to search for it. + TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); + ++NumTailCalls; + Pred->removeSuccessor(MBB); + MadeChange = true; + return MadeChange; + } + } + // If the predecessor is falling through to this block, we could reverse + // the branch condition and fold the tail call into that. However, after + // that we might have to re-arrange the CFG to fall through to the other + // block and there is a high risk of regressing code size rather than + // improving it. + } + } + // Analyze the branch in the current block. MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr; SmallVector<MachineOperand, 4> CurCond; @@ -1599,8 +1691,6 @@ ReoptimizeBlock: // Hoist Common Code //===----------------------------------------------------------------------===// -/// HoistCommonCode - Hoist common instruction sequences at the start of basic -/// blocks to their common predecessor. bool BranchFolder::HoistCommonCode(MachineFunction &MF) { bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { @@ -1734,9 +1824,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, return PI; } -/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction -/// sequence at the start of the function, move the instructions before MBB -/// terminator if it's legal. bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index fc48e484292d6..4852721eea102 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -37,6 +37,9 @@ namespace llvm { // flag. Ignored for optsize. unsigned MinCommonTailLength = 0); + /// Perhaps branch folding, tail merging and other CFG optimizations on the + /// given function. Block placement changes the layout and may create new + /// tail merging opportunities. bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli = nullptr, @@ -122,6 +125,8 @@ namespace llvm { const MachineBasicBlock *MBB) const; raw_ostream &printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const; + void view(const Twine &Name, bool isSimple = true); + uint64_t getEntryFreq() const; private: const MachineBlockFrequencyInfo &MBFI; @@ -137,26 +142,64 @@ namespace llvm { MachineBasicBlock* PredBB, unsigned MinCommonTailLength); void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB); + + /// Delete the instruction OldInst and everything after it, replacing it + /// with an unconditional branch to NewDest. 
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest); + + /// Given a machine basic block and an iterator into it, split the MBB so + /// that the part before the iterator falls into the part starting at the + /// iterator. This returns the new MBB. MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB); + + /// Look through all the blocks in MergePotentials that have hash CurHash + /// (guaranteed to match the last element). Build the vector SameTails of + /// all those that have the (same) largest number of instructions in common + /// of any pair of these blocks. SameTails entries contain an iterator into + /// MergePotentials (from which the MachineBasicBlock can be found) and a + /// MachineBasicBlock::iterator into that MBB indicating the instruction + /// where the matching code sequence begins. Order of elements in SameTails + /// is the reverse of the order in which those blocks appear in + /// MergePotentials (where they are not necessarily consecutive). unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB); + + /// Remove all blocks with hash CurHash from MergePotentials, restoring + /// branches at ends of blocks as appropriate. void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); + + /// None of the blocks to be tail-merged consist only of the common tail. + /// Create a block that does by splitting one. bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, unsigned &commonTailIndex); + /// Create merged DebugLocs of identical instructions across SameTails and + /// assign it to the instruction in common tail. + void MergeCommonTailDebugLocs(unsigned commonTailIndex); + bool OptimizeBranches(MachineFunction &MF); + + /// Analyze and optimize control flow related to the specified block. This + /// is never called on the entry block. bool OptimizeBlock(MachineBasicBlock *MBB); + + /// Remove the specified dead machine basic block from the function, + /// updating the CFG. void RemoveDeadBlock(MachineBasicBlock *MBB); + /// Hoist common instruction sequences at the start of basic blocks to their + /// common predecessor. bool HoistCommonCode(MachineFunction &MF); + + /// If the successors of MBB has common instruction sequence at the start of + /// the function, move the instructions before MBB terminator if it's legal. bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB); }; } diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp index 8b27570a17f47..7af1369416615 100644 --- a/lib/CodeGen/BranchRelaxation.cpp +++ b/lib/CodeGen/BranchRelaxation.cpp @@ -126,14 +126,16 @@ void BranchRelaxation::verify() { #endif } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// print block size and offset information - debugging -void BranchRelaxation::dumpBBs() { +LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() { for (auto &MBB : *MF) { const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) << format("size=%#x\n", BBI.Size); } } +#endif /// scanFunction - Do the initial scan of the function, building up /// information about each block. 
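The BranchRelaxation hunk above wraps its dump helper in the usual LLVM guard so the printer only exists in asserts builds or when LLVM_ENABLE_DUMP is defined. As a rough illustration of that idiom, here is a minimal sketch; the pass name MyPass and the members MF and BlockSizes are hypothetical and not part of this patch.

// Needs llvm/Support/Compiler.h, llvm/Support/Debug.h and llvm/Support/Format.h
// for LLVM_DUMP_METHOD, dbgs() and format().
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Debug-only printer; compiled out of release builds without LLVM_ENABLE_DUMP.
LLVM_DUMP_METHOD void MyPass::dumpBlockSizes() const {
  for (const MachineBasicBlock &MBB : *MF)
    dbgs() << format("BB#%u\tsize=%#x\n", MBB.getNumber(),
                     BlockSizes[MBB.getNumber()]);
}
#endif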
diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp index ff7c99de0420e..e4eab8c513d99 100644 --- a/lib/CodeGen/BuiltinGCs.cpp +++ b/lib/CodeGen/BuiltinGCs.cpp @@ -1,4 +1,4 @@ -//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===// +//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===// // // The LLVM Compiler Infrastructure // @@ -14,6 +14,8 @@ #include "llvm/CodeGen/GCs.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -77,6 +79,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. const PointerType *PT = cast<PointerType>(Ty); @@ -110,6 +113,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. const PointerType *PT = cast<PointerType>(Ty); @@ -117,7 +121,8 @@ public: return (1 == PT->getAddressSpace()); } }; -} + +} // end anonymous namespace // Register all the above so that they can be found at runtime. Note that // these static initializers are important since the registration list is diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 398ea88363b64..0912d9f68aff2 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(LLVMCodeGen Analysis.cpp AtomicExpandPass.cpp BasicTargetTransformInfo.cpp + BranchCoalescing.cpp BranchFolding.cpp BranchRelaxation.cpp BuiltinGCs.cpp @@ -23,6 +24,7 @@ add_llvm_library(LLVMCodeGen ExpandISelPseudos.cpp ExpandPostRAPseudos.cpp FaultMaps.cpp + FEntryInserter.cpp FuncletLayout.cpp GCMetadata.cpp GCMetadataPrinter.cpp @@ -36,6 +38,7 @@ add_llvm_library(LLVMCodeGen InterleavedAccessPass.cpp IntrinsicLowering.cpp LatencyPriorityQueue.cpp + LazyMachineBlockFrequencyInfo.cpp LexicalScopes.cpp LiveDebugValues.cpp LiveDebugVariables.cpp @@ -46,6 +49,7 @@ add_llvm_library(LLVMCodeGen LiveRangeCalc.cpp LiveRangeEdit.cpp LiveRegMatrix.cpp + LiveRegUnits.cpp LiveStackAnalysis.cpp LiveVariables.cpp LLVMTargetMachine.cpp @@ -70,6 +74,8 @@ add_llvm_library(LLVMCodeGen MachineLoopInfo.cpp MachineModuleInfo.cpp MachineModuleInfoImpls.cpp + MachineOptimizationRemarkEmitter.cpp + MachineOutliner.cpp MachinePassRegistry.cpp MachinePipeliner.cpp MachinePostDominators.cpp @@ -147,7 +153,7 @@ add_llvm_library(LLVMCodeGen ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP - LINK_LIBS ${PTHREAD_LIB} + LINK_LIBS ${LLVM_PTHREAD_LIB} DEPENDS intrinsics_gen diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 2e33f14c7ee3e..7cad4d0311694 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -30,8 +30,7 @@ using namespace llvm; CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) : CallingConv(CC), IsVarArg(isVarArg), MF(mf), - TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C), - CallOrPrologue(Unknown) { + TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) { // No stack is used. 
StackOffset = 0; MaxStackArgAlign = 1; diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 4cf9b138f10d3..3fc12ccc3b60c 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -21,6 +21,7 @@ using namespace llvm; /// initializeCodeGen - Initialize all passes linked into the CodeGen library. void llvm::initializeCodeGen(PassRegistry &Registry) { initializeAtomicExpandPass(Registry); + initializeBranchCoalescingPass(Registry); initializeBranchFolderPassPass(Registry); initializeBranchRelaxationPass(Registry); initializeCodeGenPreparePass(Registry); @@ -31,12 +32,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeEarlyIfConverterPass(Registry); initializeExpandISelPseudosPass(Registry); initializeExpandPostRAPass(Registry); + initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); + initializeImplicitNullChecksPass(Registry); initializeInterleavedAccessPass(Registry); + initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); initializeLiveStacksPass(Registry); @@ -47,7 +51,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCSEPass(Registry); - initializeImplicitNullChecksPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); initializeMachineDominatorTreePass(Registry); @@ -55,16 +58,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoPass(Registry); + initializeMachineOptimizationRemarkEmitterPassPass(Registry); + initializeMachineOutlinerPass(Registry); initializeMachinePipelinerPass(Registry); initializeMachinePostDominatorTreePass(Registry); + initializeMachineRegionInfoPassPass(Registry); initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineVerifierPassPass(Registry); - initializeXRayInstrumentationPass(Registry); - initializePatchableFunctionPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); initializePHIEliminationPass(Registry); + initializePatchableFunctionPass(Registry); initializePeepholeOptimizerPass(Registry); initializePostMachineSchedulerPass(Registry); initializePostRAHazardRecognizerPass(Registry); @@ -74,12 +79,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); + initializeSafeStackPass(Registry); initializeShrinkWrapPass(Registry); initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); initializeStackMapLivenessPass(Registry); - initializeLiveDebugValuesPass(Registry); - initializeSafeStackPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); initializeTailDuplicatePassPass(Registry); @@ -91,6 +95,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeVirtRegMapPass(Registry); initializeVirtRegRewriterPass(Registry); initializeWinEHPreparePass(Registry); + initializeXRayInstrumentationPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git 
a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 934b470f13b5f..2bdd189557b40 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -15,10 +15,12 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -53,8 +55,10 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -77,7 +81,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); -STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); static cl::opt<bool> DisableBranchOpts( @@ -93,7 +96,7 @@ static cl::opt<bool> DisableSelectToBranch( cl::desc("Disable select to branch conversion.")); static cl::opt<bool> AddrSinkUsingGEPs( - "addr-sink-using-gep", cl::Hidden, cl::init(false), + "addr-sink-using-gep", cl::Hidden, cl::init(true), cl::desc("Address sinking in CGP using GEPs.")); static cl::opt<bool> EnableAndCmpSinking( @@ -135,15 +138,24 @@ static cl::opt<bool> ForceSplitStore( "force-split-store", cl::Hidden, cl::init(false), cl::desc("Force store splitting no matter what the target query says.")); +static cl::opt<bool> +EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), cl::init(true)); + namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +typedef SmallVector<Instruction *, 16> SExts; +typedef DenseMap<Value *, SExts> ValueToSExts; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { const TargetMachine *TM; + const TargetSubtargetInfo *SubtargetInfo; const TargetLowering *TLI; + const TargetRegisterInfo *TRI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; const LoopInfo *LI; @@ -165,6 +177,15 @@ class TypePromotionTransaction; /// promotion for the current function. InstrToOrigTy PromotedInsts; + /// Keep track of instructions removed during promotion. + SetOfInstrs RemovedInsts; + + /// Keep track of sext chains based on their initial value. + DenseMap<Value *, Instruction *> SeenChainsForSExt; + + /// Keep track of SExt promoted. + ValueToSExts ValToSExtendedUses; + /// True if CFG is modified in any way. 
bool ModifiedDT; @@ -206,7 +227,7 @@ class TypePromotionTransaction; Type *AccessTy, unsigned AS); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); - bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *I); bool optimizeSelectInst(SelectInst *SI); @@ -215,13 +236,21 @@ class TypePromotionTransaction; bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB); bool placeDbgValues(Function &F); - bool sinkAndCmp(Function &F); - bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, - Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstCost); + bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, + LoadInst *&LI, Instruction *&Inst, bool HasPromoted); + bool tryToPromoteExts(TypePromotionTransaction &TPT, + const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, + unsigned CreatedInstsCost = 0); + bool mergeSExts(Function &F); + bool performAddressTypePromotion( + Instruction *&Inst, + bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); + bool splitIndirectCriticalEdges(Function &F); }; } @@ -250,8 +279,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { BPI.reset(); ModifiedDT = false; - if (TM) - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + if (TM) { + SubtargetInfo = TM->getSubtargetImpl(F); + TLI = SubtargetInfo->getTargetLowering(); + TRI = SubtargetInfo->getRegisterInfo(); + } TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -260,9 +292,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ProfileGuidedSectionPrefix) { ProfileSummaryInfo *PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - if (PSI->isFunctionEntryHot(&F)) + if (PSI->isFunctionHotInCallGraph(&F)) F.setSectionPrefix(".hot"); - else if (PSI->isFunctionEntryCold(&F)) + else if (PSI->isFunctionColdInCallGraph(&F)) F.setSectionPrefix(".cold"); } @@ -290,18 +322,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // find a node corresponding to the value. EverMadeChange |= placeDbgValues(F); - // If there is a mask, compare against zero, and branch that can be combined - // into a single target instruction, push the mask and compare into branch - // users. Do this before OptimizeBlock -> OptimizeInst -> - // OptimizeCmpExpression, which perturbs the pattern being searched for. - if (!DisableBranchOpts) { - EverMadeChange |= sinkAndCmp(F); + if (!DisableBranchOpts) EverMadeChange |= splitBranchCondition(F); - } + + // Split some critical edges where one of the sources is an indirect branch, + // to help generate sane code for PHIs involving such edges. 
+ EverMadeChange |= splitIndirectCriticalEdges(F); bool MadeChange = true; while (MadeChange) { MadeChange = false; + SeenChainsForSExt.clear(); + ValToSExtendedUses.clear(); + RemovedInsts.clear(); for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; @@ -311,6 +344,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ModifiedDTOnIteration) break; } + if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) + MadeChange |= mergeSExts(F); + + // Really free removed instructions during promotion. + for (Instruction *I : RemovedInsts) + delete I; + EverMadeChange |= MadeChange; } @@ -432,6 +472,154 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { return DestBB; } +// Return the unique indirectbr predecessor of a block. This may return null +// even if such a predecessor exists, if it's not useful for splitting. +// If a predecessor is found, OtherPreds will contain all other (non-indirectbr) +// predecessors of BB. +static BasicBlock * +findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { + // If the block doesn't have any PHIs, we don't care about it, since there's + // no point in splitting it. + PHINode *PN = dyn_cast<PHINode>(BB->begin()); + if (!PN) + return nullptr; + + // Verify we have exactly one IBR predecessor. + // Conservatively bail out if one of the other predecessors is not a "regular" + // terminator (that is, not a switch or a br). + BasicBlock *IBB = nullptr; + for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { + BasicBlock *PredBB = PN->getIncomingBlock(Pred); + TerminatorInst *PredTerm = PredBB->getTerminator(); + switch (PredTerm->getOpcode()) { + case Instruction::IndirectBr: + if (IBB) + return nullptr; + IBB = PredBB; + break; + case Instruction::Br: + case Instruction::Switch: + OtherPreds.push_back(PredBB); + continue; + default: + return nullptr; + } + } + + return IBB; +} + +// Split critical edges where the source of the edge is an indirectbr +// instruction. This isn't always possible, but we can handle some easy cases. +// This is useful because MI is unable to split such critical edges, +// which means it will not be able to sink instructions along those edges. +// This is especially painful for indirect branches with many successors, where +// we end up having to prepare all outgoing values in the origin block. +// +// Our normal algorithm for splitting critical edges requires us to update +// the outgoing edges of the edge origin block, but for an indirectbr this +// is hard, since it would require finding and updating the block addresses +// the indirect branch uses. But if a block only has a single indirectbr +// predecessor, with the others being regular branches, we can do it in a +// different way. +// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr. +// We can split D into D0 and D1, where D0 contains only the PHIs from D, +// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and +// create the following structure: +// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1 +bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) { + // Check whether the function has any indirectbrs, and collect which blocks + // they may jump to. Since most functions don't have indirect branches, + // this lowers the common case's overhead to O(Blocks) instead of O(Edges). 
+ SmallSetVector<BasicBlock *, 16> Targets; + for (auto &BB : F) { + auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator()); + if (!IBI) + continue; + + for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ) + Targets.insert(IBI->getSuccessor(Succ)); + } + + if (Targets.empty()) + return false; + + bool Changed = false; + for (BasicBlock *Target : Targets) { + SmallVector<BasicBlock *, 16> OtherPreds; + BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); + // If we did not found an indirectbr, or the indirectbr is the only + // incoming edge, this isn't the kind of edge we're looking for. + if (!IBRPred || OtherPreds.empty()) + continue; + + // Don't even think about ehpads/landingpads. + Instruction *FirstNonPHI = Target->getFirstNonPHI(); + if (FirstNonPHI->isEHPad() || Target->isLandingPad()) + continue; + + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); + // It's possible Target was its own successor through an indirectbr. + // In this case, the indirectbr now comes from BodyBlock. + if (IBRPred == Target) + IBRPred = BodyBlock; + + // At this point Target only has PHIs, and BodyBlock has the rest of the + // block's body. Create a copy of Target that will be used by the "direct" + // preds. + ValueToValueMapTy VMap; + BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F); + + for (BasicBlock *Pred : OtherPreds) + Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + + // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that + // they are clones, so the number of PHIs are the same. + // (a) Remove the edge coming from IBRPred from the "Direct" PHI + // (b) Leave that as the only edge in the "Indirect" PHI. + // (c) Merge the two in the body block. + BasicBlock::iterator Indirect = Target->begin(), + End = Target->getFirstNonPHI()->getIterator(); + BasicBlock::iterator Direct = DirectSucc->begin(); + BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt(); + + assert(&*End == Target->getTerminator() && + "Block was expected to only contain PHIs"); + + while (Indirect != End) { + PHINode *DirPHI = cast<PHINode>(Direct); + PHINode *IndPHI = cast<PHINode>(Indirect); + + // Now, clean up - the direct block shouldn't get the indirect value, + // and vice versa. + DirPHI->removeIncomingValue(IBRPred); + Direct++; + + // Advance the pointer here, to avoid invalidation issues when the old + // PHI is erased. + Indirect++; + + PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI); + NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred), + IBRPred); + + // Create a PHI in the body block, to merge the direct and indirect + // predecessors. + PHINode *MergePHI = + PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert); + MergePHI->addIncoming(NewIndPHI, Target); + MergePHI->addIncoming(DirPHI, DirectSucc); + + IndPHI->replaceAllUsesWith(MergePHI); + IndPHI->eraseFromParent(); + } + + Changed = true; + } + + return Changed; +} + /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. 
Start by eliminating these @@ -1090,6 +1278,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) { return false; } +/// Duplicate and sink the given 'and' instruction into user blocks where it is +/// used in a compare to allow isel to generate better code for targets where +/// this operation can be combined. +/// +/// Return true if any changes are made. +static bool sinkAndCmp0Expression(Instruction *AndI, + const TargetLowering &TLI, + SetOfInstrs &InsertedInsts) { + // Double-check that we're not trying to optimize an instruction that was + // already optimized by some other part of this pass. + assert(!InsertedInsts.count(AndI) && + "Attempting to optimize already optimized and instruction"); + (void) InsertedInsts; + + // Nothing to do for single use in same basic block. + if (AndI->hasOneUse() && + AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) + return false; + + // Try to avoid cases where sinking/duplicating is likely to increase register + // pressure. + if (!isa<ConstantInt>(AndI->getOperand(0)) && + !isa<ConstantInt>(AndI->getOperand(1)) && + AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) + return false; + + for (auto *U : AndI->users()) { + Instruction *User = cast<Instruction>(U); + + // Only sink for and mask feeding icmp with 0. + if (!isa<ICmpInst>(User)) + return false; + + auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); + if (!CmpC || !CmpC->isZero()) + return false; + } + + if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) + return false; + + DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); + DEBUG(AndI->getParent()->dump()); + + // Push the 'and' into the same block as the icmp 0. There should only be + // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any + // others, so we don't need to keep track of which BBs we insert into. + for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); + UI != E; ) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Preincrement use iterator so we don't invalidate it. + ++UI; + + DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); + + // Keep the 'and' in the same place if the use is already in the same block. + Instruction *InsertPt = + User->getParent() == AndI->getParent() ? AndI : User; + Instruction *InsertedAnd = + BinaryOperator::Create(Instruction::And, AndI->getOperand(0), + AndI->getOperand(1), "", InsertPt); + // Propagate the debug info. + InsertedAnd->setDebugLoc(AndI->getDebugLoc()); + + // Replace a use of the 'and' with a use of the new 'and'. + TheUse = InsertedAnd; + ++NumAndUses; + DEBUG(User->getParent()->dump()); + } + + // We removed all uses, nuke the and. + AndI->eraseFromParent(); + return true; +} + /// Check if the candidates could be combined with a shift instruction, which /// includes: /// 1. Truncate instruction @@ -2028,16 +2293,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } if (TLI) { - // Unknown address space. - // TODO: Target hook to pick which address space the intrinsic cares - // about? 
- unsigned AddrSpace = ~0u; SmallVector<Value*, 2> PtrOps; Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) - while (!PtrOps.empty()) - if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) { + Value *PtrVal = PtrOps.pop_back_val(); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) return true; + } } } @@ -2168,11 +2432,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CalleeAttrs = CS.getAttributes(); - if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias) != - AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias)) + AttributeList CalleeAttrs = CS.getAttributes(); + if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) != + AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -2561,25 +2825,30 @@ class TypePromotionTransaction { OperandsHider Hider; /// Keep track of the uses replaced, if any. UsesReplacer *Replacer; + /// Keep track of instructions removed. + SetOfInstrs &RemovedInsts; public: /// \brief Remove all reference of \p Inst and optinally replace all its /// uses with New. + /// \p RemovedInsts Keep track of the instructions removed by this Action. /// \pre If !Inst->use_empty(), then New != nullptr - InstructionRemover(Instruction *Inst, Value *New = nullptr) + InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, + Value *New = nullptr) : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), - Replacer(nullptr) { + Replacer(nullptr), RemovedInsts(RemovedInsts) { if (New) Replacer = new UsesReplacer(Inst, New); DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); + RemovedInsts.insert(Inst); + /// The instructions removed here will be freed after completing + /// optimizeBlock() for all blocks as we need to keep track of the + /// removed instructions during promotion. Inst->removeFromParent(); } ~InstructionRemover() override { delete Replacer; } - /// \brief Really remove the instruction. - void commit() override { delete Inst; } - /// \brief Resurrect the instruction and reassign it to the proper uses if /// new value was provided when build this action. void undo() override { @@ -2588,6 +2857,7 @@ class TypePromotionTransaction { if (Replacer) Replacer->undo(); Hider.undo(); + RemovedInsts.erase(Inst); } }; @@ -2596,6 +2866,10 @@ public: /// The restoration point is a pointer to an action instead of an iterator /// because the iterator may be invalidated but not the pointer. typedef const TypePromotionAction *ConstRestorationPt; + + TypePromotionTransaction(SetOfInstrs &RemovedInsts) + : RemovedInsts(RemovedInsts) {} + /// Advocate every changes made in that transaction. void commit(); /// Undo all the changes made after the given point. @@ -2627,6 +2901,7 @@ private: /// The ordered list of actions made so far. 
SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt; + SetOfInstrs &RemovedInsts; }; void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, @@ -2638,7 +2913,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( - make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal)); + make_unique<TypePromotionTransaction::InstructionRemover>(Inst, + RemovedInsts, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, @@ -2705,8 +2981,8 @@ void TypePromotionTransaction::rollback( /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl<Instruction*> &AddrModeInsts; - const TargetMachine &TM; const TargetLowering &TLI; + const TargetRegisterInfo &TRI; const DataLayout &DL; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and @@ -2731,14 +3007,14 @@ class AddressingModeMatcher { bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, - const TargetMachine &TM, Type *AT, unsigned AS, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, + Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) - : AddrModeInsts(AMI), TM(TM), - TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) - ->getTargetLowering()), + : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT) { @@ -2756,13 +3032,15 @@ public: static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetMachine &TM, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, + AccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -3583,18 +3861,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { /// Check to see if all uses of OpVal by the specified inline asm call are due /// to memory operands. If so, return true, otherwise return false. static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetMachine &TM) { + const TargetLowering &TLI, + const TargetRegisterInfo &TRI) { const Function *F = CI->getParent()->getParent(); - const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, + TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, ImmutableCallSite(CI)); + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. 
- TLI->ComputeConstraintToUse(OpInfo, SDValue()); + TLI.ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! @@ -3613,7 +3891,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, static bool FindAllMemoryUses( Instruction *I, SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, - SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { + SmallPtrSetImpl<Instruction *> &ConsideredInsts, + const TargetLowering &TLI, const TargetRegisterInfo &TRI) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -3635,11 +3914,28 @@ static bool FindAllMemoryUses( if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { unsigned opNo = U.getOperandNo(); - if (opNo == 0) return true; // Storing addr, not into addr. + if (opNo != StoreInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicRMWInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(RMW, opNo)); + continue; + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(CmpX, opNo)); + continue; + } + if (CallInst *CI = dyn_cast<CallInst>(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. See optimizeCallInst @@ -3650,12 +3946,12 @@ static bool FindAllMemoryUses( if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TM)) + if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) return true; continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI)) return true; } @@ -3743,7 +4039,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // the use is just a particularly nice way of sinking it. SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) return false; // Has a non-memory, non-foldable use! 
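With the pointer operands of atomicrmw and cmpxchg now treated like a store's pointer operand in FindAllMemoryUses, an address computation such as the GEP below becomes a candidate for sinking into the atomic's addressing mode (illustrative, hand-written IR; whether the fold actually happens still depends on the target's addressing-mode legality checks):

    define i32 @g(i32* %base, i64 %i) {
      %p = getelementptr inbounds i32, i32* %base, i64 %i
      %old = atomicrmw add i32* %p, i32 1 seq_cst
      ret i32 %old
    }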
// Now that we know that all uses of this instruction are part of a chain of @@ -3775,7 +4071,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, + AddressAccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; @@ -3844,7 +4141,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool IsNumUsesConsensusValid = false; SmallVector<Instruction*, 16> AddrModeInsts; ExtAddrMode AddrMode; - TypePromotionTransaction TPT; + TypePromotionTransaction TPT(RemovedInsts); TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); while (!worklist.empty()) { @@ -3869,7 +4166,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // addressing instructions might have. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, + V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI, InsertedInsts, PromotedInsts, TPT); // This check is broken into two cases with very similar code to avoid using @@ -3935,11 +4232,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && TM && - TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) - ->useAA())) { + SubtargetInfo->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4042,7 +4338,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } @@ -4053,12 +4349,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = ResultPtr; } else { if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4185,14 +4481,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { return MadeChange; } -/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// \brief Check if all the uses of \p Val are equivalent (or free) zero or /// sign extensions. 
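One likely reason for switching the casts above from CreateBitCast to CreatePointerCast: a sunk address can end up in a different pointer address space than the one the memory instruction expects, and a plain bitcast between pointers in different address spaces is not valid IR. CreatePointerCast picks the appropriate cast instead, for example

    %cast = addrspacecast i8 addrspace(1)* %sunkaddr to i8*

while still producing an ordinary bitcast when the address spaces match. This reading is inferred from the change rather than stated by it.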
-static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { - assert(!Inst->use_empty() && "Input must have at least one use"); - const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); +static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { + assert(!Val->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); bool IsSExt = isa<SExtInst>(FirstUser); Type *ExtTy = FirstUser->getType(); - for (const User *U : Inst->users()) { + for (const User *U : Val->users()) { const Instruction *UI = cast<Instruction>(U); if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) return false; @@ -4202,11 +4498,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { continue; // If IsSExt is true, we are in this situation: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty1 a to ty3 // Assuming ty2 is shorter than ty3, this could be turned into: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty2 b to ty3 // However, the last sext is not free. @@ -4233,51 +4529,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { return true; } -/// \brief Try to form ExtLd by promoting \p Exts until they reach a -/// load instruction. -/// If an ext(load) can be formed, it is returned via \p LI for the load -/// and \p Inst for the extension. -/// Otherwise LI == nullptr and Inst == nullptr. -/// When some promotion happened, \p TPT contains the proper state to -/// revert them. +/// \brief Try to speculatively promote extensions in \p Exts and continue +/// promoting through newly promoted operands recursively as far as doing so is +/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. +/// When some promotion happened, \p TPT contains the proper state to revert +/// them. /// -/// \return true when promoting was necessary to expose the ext(load) -/// opportunity, false otherwise. -/// -/// Example: -/// \code -/// %ld = load i32* %addr -/// %add = add nuw i32 %ld, 4 -/// %zext = zext i32 %add to i64 -/// \endcode -/// => -/// \code -/// %ld = load i32* %addr -/// %zext = zext i32 %ld to i64 -/// %add = add nuw i64 %zext, 4 -/// \encode -/// Thanks to the promotion, we can match zext(load i32*) to i64. -bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, - LoadInst *&LI, Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstsCost = 0) { - // Iterate over all the extensions to see if one form an ext(load). +/// \return true if some promotion happened, false otherwise. +bool CodeGenPrepare::tryToPromoteExts( + TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, + unsigned CreatedInstsCost) { + bool Promoted = false; + + // Iterate over all the extensions to try to promote them. for (auto I : Exts) { - // Check if we directly have ext(load). - if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { - Inst = I; - // No promotion happened here. - return false; + // Early check if we directly have ext(load). + if (isa<LoadInst>(I->getOperand(0))) { + ProfitablyMovedExts.push_back(I); + continue; } - // Check whether or not we want to do any promotion. + + // Check whether or not we want to do any promotion. 
The reason we have + // this check inside the for loop is to catch the case where an extension + // is directly fed by a load because in such case the extension can be moved + // up without any promotion on its operands. if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) - continue; + return false; + // Get the action to perform the promotion. - TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( - I, InsertedInsts, *TLI, PromotedInsts); + TypePromotionHelper::Action TPH = + TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); // Check if we can promote. - if (!TPH) + if (!TPH) { + // Save the current extension as we cannot move up through its operand. + ProfitablyMovedExts.push_back(I); continue; + } + // Save the current state. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); @@ -4297,110 +4586,293 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, // one extension but leave one. However, we optimistically keep going, // because the new extension may be removed too. long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; - TotalCreatedInstsCost -= ExtCost; + // FIXME: It would be possible to propagate a negative value instead of + // conservatively ceiling it to 0. + TotalCreatedInstsCost = + std::max((long long)0, (TotalCreatedInstsCost - ExtCost)); if (!StressExtLdPromotion && (TotalCreatedInstsCost > 1 || !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { - // The promotion is not profitable, rollback to the previous state. + // This promotion is not profitable, rollback to the previous state, and + // save the current extension in ProfitablyMovedExts as the latest + // speculative promotion turned out to be unprofitable. TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); + continue; + } + // Continue promoting NewExts as far as doing so is profitable. + SmallVector<Instruction *, 2> NewlyMovedExts; + (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); + bool NewPromoted = false; + for (auto ExtInst : NewlyMovedExts) { + Instruction *MovedExt = cast<Instruction>(ExtInst); + Value *ExtOperand = MovedExt->getOperand(0); + // If we have reached to a load, we need this extra profitability check + // as it could potentially be merged into an ext(load). + if (isa<LoadInst>(ExtOperand) && + !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || + (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) + continue; + + ProfitablyMovedExts.push_back(MovedExt); + NewPromoted = true; + } + + // If none of speculative promotions for NewExts is profitable, rollback + // and save the current extension (I) as the last profitable extension. + if (!NewPromoted) { + TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); continue; } // The promotion is profitable. - // Check if it exposes an ext(load). - (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); - if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || - // If we have created a new extension, i.e., now we have two - // extensions. We must make sure one of them is merged with - // the load, otherwise we may degrade the code quality. - (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) - // Promotion happened. - return true; - // If this does not help to expose an ext(load) then, rollback. - TPT.rollback(LastKnownGood); + Promoted = true; } - // None of the extension can form an ext(load). 
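A hand-written illustration of what the recursion in tryToPromoteExts aims for (whether the rewrite is kept depends on the cost checks above and on the target's ext(load) support):

    %ld = load i32, i32* %p
    %a  = add nuw i32 %ld, 1
    %b  = add nuw i32 %a, 2
    %z  = zext i32 %b to i64

can be speculatively rewritten, one operand at a time, into

    %ld = load i32, i32* %p
    %z  = zext i32 %ld to i64
    %a  = add nuw i64 %z, 1
    %b  = add nuw i64 %a, 2

so the zext lands directly on the load and can later be matched as a zext(load).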
- LI = nullptr; - Inst = nullptr; - return false; + return Promoted; } -/// Move a zext or sext fed by a load into the same basic block as the load, -/// unless conditions are unfavorable. This allows SelectionDAG to fold the -/// extend into the load. -/// \p I[in/out] the extension may be modified during the process if some -/// promotions apply. -/// -bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { - // ExtLoad formation infrastructure requires TLI to be effective. - if (!TLI) - return false; +/// Merging redundant sexts when one is dominating the other. +bool CodeGenPrepare::mergeSExts(Function &F) { + DominatorTree DT(F); + bool Changed = false; + for (auto &Entry : ValToSExtendedUses) { + SExts &Insts = Entry.second; + SExts CurPts; + for (Instruction *Inst : Insts) { + if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || + Inst->getOperand(0) != Entry.first) + continue; + bool inserted = false; + for (auto &Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + Pt->replaceAllUsesWith(Inst); + RemovedInsts.insert(Pt); + Pt->removeFromParent(); + Pt = Inst; + inserted = true; + Changed = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // expermients show it is not profitable. + continue; + Inst->replaceAllUsesWith(Pt); + RemovedInsts.insert(Inst); + Inst->removeFromParent(); + inserted = true; + Changed = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } + return Changed; +} - // Try to promote a chain of computation if it allows to form - // an extended load. - TypePromotionTransaction TPT; - TypePromotionTransaction::ConstRestorationPt LastKnownGood = - TPT.getRestorationPoint(); - SmallVector<Instruction *, 1> Exts; - Exts.push_back(I); - // Look for a load being extended. - LoadInst *LI = nullptr; - Instruction *OldExt = I; - bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); - if (!LI || !I) { - assert(!HasPromoted && !LI && "If we did not match any load instruction " - "the code must remain the same"); - I = OldExt; - return false; +/// Return true, if an ext(load) can be formed from an extension in +/// \p MovedExts. +bool CodeGenPrepare::canFormExtLd( + const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, + Instruction *&Inst, bool HasPromoted) { + for (auto *MovedExtInst : MovedExts) { + if (isa<LoadInst>(MovedExtInst->getOperand(0))) { + LI = cast<LoadInst>(MovedExtInst->getOperand(0)); + Inst = MovedExtInst; + break; + } } + if (!LI) + return false; // If they're already in the same block, there's nothing to do. // Make the cheap checks first if we did not promote. // If we promoted, we need to check if it is indeed profitable. - if (!HasPromoted && LI->getParent() == I->getParent()) + if (!HasPromoted && LI->getParent() == Inst->getParent()) return false; - EVT VT = TLI->getValueType(*DL, I->getType()); + EVT VT = TLI->getValueType(*DL, Inst->getType()); EVT LoadVT = TLI->getValueType(*DL, LI->getType()); // If the load has other users and the truncate is not free, this probably // isn't worthwhile. - if (!LI->hasOneUse() && - (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(I->getType(), LI->getType())) { - I = OldExt; - TPT.rollback(LastKnownGood); + if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && + !TLI->isTruncateFree(Inst->getType(), LI->getType())) return false; - } // Check whether the target supports casts folded into loads. 
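In IR terms, the dominance-based cleanup done by mergeSExts looks roughly like this (simplified; in the pass the candidates are the sext instructions recorded in ValToSExtendedUses during address type promotion, which typically come from promoting several chains that share the same header value):

    entry:
      %s1 = sext i32 %v to i64
      br i1 %c, label %then, label %exit
    then:
      %s2 = sext i32 %v to i64      ; same operand, dominated by %s1
      %a  = add i64 %s2, 1
      br label %exit

Since %s1 dominates %s2, every use of %s2 is replaced with %s1 and %s2 is removed; when neither sext dominates the other, the pair is left alone rather than being hoisted to a common dominator.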
unsigned LType; - if (isa<ZExtInst>(I)) + if (isa<ZExtInst>(Inst)) LType = ISD::ZEXTLOAD; else { - assert(isa<SExtInst>(I) && "Unexpected ext type!"); + assert(isa<SExtInst>(Inst) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } - if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) { - I = OldExt; - TPT.rollback(LastKnownGood); + + return TLI->isLoadExtLegal(LType, VT, LoadVT); +} + +/// Move a zext or sext fed by a load into the same basic block as the load, +/// unless conditions are unfavorable. This allows SelectionDAG to fold the +/// extend into the load. +/// +/// E.g., +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \encode +/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which +/// allow us to match zext(load i32*) to i64. +/// +/// Also, try to promote the computations used to obtain a sign extended +/// value used into memory accesses. +/// E.g., +/// \code +/// a = add nsw i32 b, 3 +/// d = sext i32 a to i64 +/// e = getelementptr ..., i64 d +/// \endcode +/// => +/// \code +/// f = sext i32 b to i64 +/// a = add nsw i64 f, 3 +/// e = getelementptr ..., i64 a +/// \endcode +/// +/// \p Inst[in/out] the extension may be modified during the process if some +/// promotions apply. +bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { + // ExtLoad formation and address type promotion infrastructure requires TLI to + // be effective. + if (!TLI) return false; + + bool AllowPromotionWithoutCommonHeader = false; + /// See if it is an interesting sext operations for the address type + /// promotion before trying to promote it, e.g., the ones with the right + /// type and used in memory accesses. + bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( + *Inst, AllowPromotionWithoutCommonHeader); + TypePromotionTransaction TPT(RemovedInsts); + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector<Instruction *, 1> Exts; + SmallVector<Instruction *, 2> SpeculativelyMovedExts; + Exts.push_back(Inst); + + bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); + + // Look for a load being extended. + LoadInst *LI = nullptr; + Instruction *ExtFedByLoad; + + // Try to promote a chain of computation if it allows to form an extended + // load. + if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { + assert(LI && ExtFedByLoad && "Expect a valid load and extension"); + TPT.commit(); + // Move the extend into the same block as the load + ExtFedByLoad->removeFromParent(); + ExtFedByLoad->insertAfter(LI); + // CGP does not check if the zext would be speculatively executed when moved + // to the same basic block as the load. Preserving its original location + // would pessimize the debugging experience, as well as negatively impact + // the quality of sample pgo. We don't want to use "line 0" as that has a + // size cost in the line-table section and logically the zext can be seen as + // part of the load. Therefore we conservatively reuse the same debug + // location for the load and the zext. + ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); + ++NumExtsMoved; + Inst = ExtFedByLoad; + return true; } - // Move the extend into the same block as the load, so that SelectionDAG - // can fold it. 
- TPT.commit(); - I->removeFromParent(); - I->insertAfter(LI); - // CGP does not check if the zext would be speculatively executed when moved - // to the same basic block as the load. Preserving its original location would - // pessimize the debugging experience, as well as negatively impact the - // quality of sample pgo. We don't want to use "line 0" as that has a - // size cost in the line-table section and logically the zext can be seen as - // part of the load. Therefore we conservatively reuse the same debug location - // for the load and the zext. - I->setDebugLoc(LI->getDebugLoc()); - ++NumExtsMoved; - return true; + // Continue promoting SExts if known as considerable depending on targets. + if (ATPConsiderable && + performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, + HasPromoted, TPT, SpeculativelyMovedExts)) + return true; + + TPT.rollback(LastKnownGood); + return false; +} + +// Perform address type promotion if doing so is profitable. +// If AllowPromotionWithoutCommonHeader == false, we should find other sext +// instructions that sign extended the same initial value. However, if +// AllowPromotionWithoutCommonHeader == true, we expect promoting the +// extension is just profitable. +bool CodeGenPrepare::performAddressTypePromotion( + Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { + bool Promoted = false; + SmallPtrSet<Instruction *, 1> UnhandledExts; + bool AllSeenFirst = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + DenseMap<Value *, Instruction *>::iterator AlreadySeen = + SeenChainsForSExt.find(HeadOfChain); + // If there is an unhandled SExt which has the same header, try to promote + // it as well. + if (AlreadySeen != SeenChainsForSExt.end()) { + if (AlreadySeen->second != nullptr) + UnhandledExts.insert(AlreadySeen->second); + AllSeenFirst = false; + } + } + + if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && + SpeculativelyMovedExts.size() == 1)) { + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + // Update Inst as promotion happen. + Inst = SpeculativelyMovedExts.pop_back_val(); + } else { + // This is the first chain visited from the header, keep the current chain + // as unhandled. Defer to promote this until we encounter another SExt + // chain derived from the same header. + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = Inst; + } + return false; + } + + if (!AllSeenFirst && !UnhandledExts.empty()) + for (auto VisitedSExt : UnhandledExts) { + if (RemovedInsts.count(VisitedSExt)) + continue; + TypePromotionTransaction TPT(RemovedInsts); + SmallVector<Instruction *, 1> Exts; + SmallVector<Instruction *, 2> Chains; + Exts.push_back(VisitedSExt); + bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : Chains) { + Value *HeadOfChain = I->getOperand(0); + // Mark this as handled. 
+ SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + } + return Promoted; } bool CodeGenPrepare::optimizeExtUses(Instruction *I) { @@ -4534,13 +5006,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) return false; - // Skip loads we've already transformed or have no reason to transform. - if (Load->hasOneUse()) { - User *LoadUser = *Load->user_begin(); - if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && - !dyn_cast<PHINode>(LoadUser)) - return false; - } + // Skip loads we've already transformed. + if (Load->hasOneUse() && + InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) + return false; // Look at all uses of Load, looking through phis, to determine how many bits // of the loaded value are needed. @@ -4620,7 +5089,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { // // Also avoid hoisting if we didn't see any ands with the exact DemandBits // mask, since these are the only ands that will be removed by isel. - if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) || WidestAndBits != DemandBits) return false; @@ -4636,6 +5105,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { IRBuilder<> Builder(Load->getNextNode()); auto *NewAnd = dyn_cast<Instruction>( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + // Mark this instruction as "inserted by CGP", so that other + // optimizations don't touch it. + InsertedInsts.insert(NewAnd); // Replace all uses of load with new and (except for the use of load in the // new and itself). @@ -4985,7 +5457,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); ExtInst->insertBefore(SI); SI->setCondition(ExtInst); - for (SwitchInst::CaseIt Case : SI->cases()) { + for (auto Case : SI->cases()) { APInt NarrowConst = Case.getCaseValue()->getValue(); APInt WideConst = (ExtType == Instruction::ZExt) ? 
NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); @@ -5514,7 +5986,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { - bool MadeChange = moveExtToFormExtLoad(I); + bool MadeChange = optimizeExt(I); return MadeChange | optimizeExtUses(I); } } @@ -5548,8 +6020,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { return false; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + unsigned AS = RMW->getPointerAddressSpace(); + return optimizeMemoryInst(I, RMW->getPointerOperand(), + RMW->getType(), AS); + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { + unsigned AS = CmpX->getPointerAddressSpace(); + return optimizeMemoryInst(I, CmpX->getPointerOperand(), + CmpX->getCompareOperand()->getType(), AS); + } + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); + if (BinOp && (BinOp->getOpcode() == Instruction::And) && + EnableAndCmpSinking && TLI) + return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); + if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); @@ -5679,68 +6167,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { return MadeChange; } -// If there is a sequence that branches based on comparing a single bit -// against zero that can be combined into a single instruction, and the -// target supports folding these into a single instruction, sink the -// mask and compare into the branch uses. Do this before OptimizeBlock -> -// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being -// searched for. -bool CodeGenPrepare::sinkAndCmp(Function &F) { - if (!EnableAndCmpSinking) - return false; - if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) - return false; - bool MadeChange = false; - for (BasicBlock &BB : F) { - // Does this BB end with the following? - // %andVal = and %val, #single-bit-set - // %icmpVal = icmp %andResult, 0 - // br i1 %cmpVal label %dest1, label %dest2" - BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator()); - if (!Brcc || !Brcc->isConditional()) - continue; - ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0)); - if (!Cmp || Cmp->getParent() != &BB) - continue; - ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1)); - if (!Zero || !Zero->isZero()) - continue; - Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0)); - if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB) - continue; - ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1)); - if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) - continue; - DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump()); - - // Push the "and; icmp" for any users that are conditional branches. - // Since there can only be one branch use per BB, we don't need to keep - // track of which BBs we insert into. - for (Use &TheUse : Cmp->uses()) { - // Find brcc use. - BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse); - if (!BrccUser || !BrccUser->isConditional()) - continue; - BasicBlock *UserBB = BrccUser->getParent(); - if (UserBB == &BB) continue; - DEBUG(dbgs() << "found Brcc use\n"); - - // Sink the "and; icmp" to use. 
- MadeChange = true; - BinaryOperator *NewAnd = - BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", - BrccUser); - CmpInst *NewCmp = - CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, - "", BrccUser); - TheUse = NewCmp; - ++NumAndCmpsMoved; - DEBUG(BrccUser->getParent()->dump()); - } - } - return MadeChange; -} - /// \brief Scale down both weights to fit into uint32_t. static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; diff --git a/lib/CodeGen/CountingFunctionInserter.cpp b/lib/CodeGen/CountingFunctionInserter.cpp index 1e46a7a99e7e3..7f7350f5fb5cd 100644 --- a/lib/CodeGen/CountingFunctionInserter.cpp +++ b/lib/CodeGen/CountingFunctionInserter.cpp @@ -41,7 +41,7 @@ namespace { Type *VoidTy = Type::getVoidTy(F.getContext()); Constant *CountingFn = F.getParent()->getOrInsertFunction(CountingFunctionName, - VoidTy, nullptr); + VoidTy); CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt()); return true; } diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index 5d60c3055456d..e1eeddf0816c1 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo &MFI = MF.getFrameInfo(); BitVector Pristine = MFI.getPristineRegs(MF); - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { - if (!IsReturnBlock && !Pristine.test(*I)) continue; + for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; + ++I) { + unsigned Reg = *I; + if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 17c229a216ae1..7ac2e5445435d 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Start out assuming that reserved registers are live out of this block. LivePhysRegs = MRI->getReservedRegs(); - // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not + // Add live-ins from successors to LivePhysRegs. Normally, physregs are not // live across blocks, but some targets (x86) can have flags live out of a // block. for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp index a7ba694c144d1..6f4ea1912cf4e 100644 --- a/lib/CodeGen/DetectDeadLanes.cpp +++ b/lib/CodeGen/DetectDeadLanes.cpp @@ -441,7 +441,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) { const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg); CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO); if (CrossCopy) - DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI); + DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI); } if (!CrossCopy) diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index 32c57e3e37051..e272d25047e63 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -6,21 +6,9 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -// This file contains the execution dependency fix pass. -// -// Some X86 SSE instructions like mov, and, or, xor are available in different -// variants for different operand types. These variant instructions are -// equivalent, but on Nehalem and newer cpus there is extra latency -// transferring data between integer and floating point domains. ARM cores -// have similar issues when they are configured with both VFP and NEON -// pipelines. -// -// This pass changes the variant instructions to minimize domain crossings. -// -//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ExecutionDepsFix.h" + #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,193 +23,18 @@ using namespace llvm; -#define DEBUG_TYPE "execution-fix" - -/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track -/// of execution domains. -/// -/// An open DomainValue represents a set of instructions that can still switch -/// execution domain. Multiple registers may refer to the same open -/// DomainValue - they will eventually be collapsed to the same execution -/// domain. -/// -/// A collapsed DomainValue represents a single register that has been forced -/// into one of more execution domains. There is a separate collapsed -/// DomainValue for each register, but it may contain multiple execution -/// domains. A register value is initially created in a single execution -/// domain, but if we were forced to pay the penalty of a domain crossing, we -/// keep track of the fact that the register is now available in multiple -/// domains. -namespace { -struct DomainValue { - // Basic reference counting. - unsigned Refs; - - // Bitmask of available domains. For an open DomainValue, it is the still - // possible domains for collapsing. For a collapsed DomainValue it is the - // domains where the register is available for free. - unsigned AvailableDomains; - - // Pointer to the next DomainValue in a chain. When two DomainValues are - // merged, Victim.Next is set to point to Victor, so old DomainValue - // references can be updated by following the chain. - DomainValue *Next; - - // Twiddleable instructions using or defining these registers. - SmallVector<MachineInstr*, 8> Instrs; - - // A collapsed DomainValue has no instructions to twiddle - it simply keeps - // track of the domains where the registers are already available. - bool isCollapsed() const { return Instrs.empty(); } - - // Is domain available? - bool hasDomain(unsigned domain) const { - assert(domain < - static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && - "undefined behavior"); - return AvailableDomains & (1u << domain); - } - - // Mark domain as available. - void addDomain(unsigned domain) { - AvailableDomains |= 1u << domain; - } - - // Restrict to a single domain available. - void setSingleDomain(unsigned domain) { - AvailableDomains = 1u << domain; - } - - // Return bitmask of domains that are available and in mask. - unsigned getCommonDomains(unsigned mask) const { - return AvailableDomains & mask; - } - - // First domain available. - unsigned getFirstDomain() const { - return countTrailingZeros(AvailableDomains); - } - - DomainValue() : Refs(0) { clear(); } - - // Clear this DomainValue and point to next which has all its data. 
- void clear() { - AvailableDomains = 0; - Next = nullptr; - Instrs.clear(); - } -}; -} - -namespace { -/// Information about a live register. -struct LiveReg { - /// Value currently in this register, or NULL when no value is being tracked. - /// This counts as a DomainValue reference. - DomainValue *Value; - - /// Instruction that defined this register, relative to the beginning of the - /// current basic block. When a LiveReg is used to represent a live-out - /// register, this value is relative to the end of the basic block, so it - /// will be a negative number. - int Def; -}; -} // anonymous namespace - -namespace { -class ExeDepsFix : public MachineFunctionPass { - static char ID; - SpecificBumpPtrAllocator<DomainValue> Allocator; - SmallVector<DomainValue*,16> Avail; - - const TargetRegisterClass *const RC; - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - std::vector<SmallVector<int, 1>> AliasMap; - const unsigned NumRegs; - LiveReg *LiveRegs; - typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap; - LiveOutMap LiveOuts; - - /// List of undefined register reads in this block in forward order. - std::vector<std::pair<MachineInstr*, unsigned> > UndefReads; - - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; - - /// True when the current block has a predecessor that hasn't been visited - /// yet. - bool SeenUnknownBackEdge; - -public: - ExeDepsFix(const TargetRegisterClass *rc) - : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { return "Execution dependency fix"; } - -private: - iterator_range<SmallVectorImpl<int>::const_iterator> - regIndices(unsigned Reg) const; - - // DomainValue allocation. - DomainValue *alloc(int domain = -1); - DomainValue *retain(DomainValue *DV) { - if (DV) ++DV->Refs; - return DV; - } - void release(DomainValue*); - DomainValue *resolve(DomainValue*&); - - // LiveRegs manipulations. - void setLiveReg(int rx, DomainValue *DV); - void kill(int rx); - void force(int rx, unsigned domain); - void collapse(DomainValue *dv, unsigned domain); - bool merge(DomainValue *A, DomainValue *B); - - void enterBasicBlock(MachineBasicBlock*); - void leaveBasicBlock(MachineBasicBlock*); - void visitInstr(MachineInstr*); - void processDefs(MachineInstr*, bool Kill); - void visitSoftInstr(MachineInstr*, unsigned mask); - void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); - bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); - void processUndefReads(MachineBasicBlock*); -}; -} - -char ExeDepsFix::ID = 0; +#define DEBUG_TYPE "execution-deps-fix" /// Translate TRI register number to a list of indices into our smaller tables /// of interesting registers. 
iterator_range<SmallVectorImpl<int>::const_iterator> -ExeDepsFix::regIndices(unsigned Reg) const { +ExecutionDepsFix::regIndices(unsigned Reg) const { assert(Reg < AliasMap.size() && "Invalid register"); const auto &Entry = AliasMap[Reg]; return make_range(Entry.begin(), Entry.end()); } -DomainValue *ExeDepsFix::alloc(int domain) { +DomainValue *ExecutionDepsFix::alloc(int domain) { DomainValue *dv = Avail.empty() ? new(Allocator.Allocate()) DomainValue : Avail.pop_back_val(); @@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) { /// Release a reference to DV. When the last reference is released, /// collapse if needed. -void ExeDepsFix::release(DomainValue *DV) { +void ExecutionDepsFix::release(DomainValue *DV) { while (DV) { assert(DV->Refs && "Bad DomainValue"); if (--DV->Refs) @@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) { /// Follow the chain of dead DomainValues until a live DomainValue is reached. /// Update the referenced pointer when necessary. -DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { +DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { DomainValue *DV = DVRef; if (!DV || !DV->Next) return DV; @@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { } /// Set LiveRegs[rx] = dv, updating reference counts. -void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { +void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); @@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { } // Kill register rx, recycle or collapse any DomainValue. -void ExeDepsFix::kill(int rx) { +void ExecutionDepsFix::kill(int rx) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (!LiveRegs[rx].Value) @@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) { } /// Force register rx into domain. -void ExeDepsFix::force(int rx, unsigned domain) { +void ExecutionDepsFix::force(int rx, unsigned domain) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (DomainValue *dv = LiveRegs[rx].Value) { @@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) { /// Collapse open DomainValue into given domain. If there are multiple /// registers using dv, they each get a unique collapsed DomainValue. -void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { +void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { assert(dv->hasDomain(domain) && "Cannot collapse"); // Collapse all the instructions. @@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { } /// All instructions and registers in B are moved to A, and B is released. -bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { +bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { assert(!A->isCollapsed() && "Cannot merge into collapsed"); assert(!B->isCollapsed() && "Cannot merge from collapsed"); if (A == B) @@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { } /// Set up LiveRegs by merging predecessor live-out values. -void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { - // Detect back-edges from predecessors we haven't processed yet. - SeenUnknownBackEdge = false; - +void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Reset instruction counter in each basic block. 
CurInstr = 0; @@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Try to coalesce live-out registers from predecessors. for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), pe = MBB->pred_end(); pi != pe; ++pi) { - LiveOutMap::const_iterator fi = LiveOuts.find(*pi); - if (fi == LiveOuts.end()) { - SeenUnknownBackEdge = true; + auto fi = MBBInfos.find(*pi); + assert(fi != MBBInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second.OutRegs; + // Incoming is null if this is a backedge from a BB + // we haven't processed yet + if (Incoming == nullptr) { continue; } - assert(fi->second && "Can't have NULL entries"); for (unsigned rx = 0; rx != NumRegs; ++rx) { // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); + LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - DomainValue *pdv = resolve(fi->second[rx].Value); + DomainValue *pdv = resolve(Incoming[rx].Value); if (!pdv) continue; if (!LiveRegs[rx].Value) { @@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { force(rx, pdv->getFirstDomain()); } } - DEBUG(dbgs() << "BB#" << MBB->getNumber() - << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } -void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { +void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { assert(LiveRegs && "Must enter basic block first."); - // Save live registers at end of MBB - used by enterBasicBlock(). - // Also use LiveOuts as a visited set to detect back-edges. - bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; - - if (First) { - // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to - // the end of this block instead of the beginning. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - } else { - // Insertion failed, this must be the second pass. + LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBInfos[MBB].OutRegs = LiveRegs; + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def -= CurInstr; + if (OldOutRegs) { + // This must be the second pass. // Release all the DomainValues instead of keeping them. for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(LiveRegs[i].Value); - delete[] LiveRegs; + release(OldOutRegs[i].Value); + delete[] OldOutRegs; } LiveRegs = nullptr; } -void ExeDepsFix::visitInstr(MachineInstr *MI) { - if (MI->isDebugValue()) - return; - +bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { // Update instructions with explicit execution domains. std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); if (DomP.first) { @@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { visitHardInstr(MI, DomP.first); } - // Process defs to track register ages, and kill values clobbered by generic - // instructions. 
- processDefs(MI, !DomP.first); + return !DomP.first; } /// \brief Helps avoid false dependencies on undef registers by updating the /// machine instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. -void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, + unsigned OpIdx, unsigned Pref) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update only undef operands that are mapped to one register. if (AliasMap[OriginalReg].size() != 1) - return; + return false; // Get the undef operand's register class const TargetRegisterClass *OpRC = @@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); - return; + return true; } // Go over all registers in the register class and find the register with @@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update the operand if we found a register with better clearance. if (MaxClearanceReg != OriginalReg) MO.setReg(MaxClearanceReg); + + return false; } /// \brief Return true to if it makes sense to break dependence on a partial def /// or undef use. -bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { unsigned reg = MI->getOperand(OpIdx).getReg(); for (int rx : regIndices(reg)) { unsigned Clearance = CurInstr - LiveRegs[rx].Def; @@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, DEBUG(dbgs() << ": Break dependency.\n"); continue; } - // The current clearance seems OK, but we may be ignoring a def from a - // back-edge. - if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { - DEBUG(dbgs() << ": OK .\n"); - return false; - } - // A def from an unprocessed back-edge may make us break this dependency. - DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); + DEBUG(dbgs() << ": OK .\n"); return false; } return true; @@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, // If Kill is set, also kill off DomainValues clobbered by the defs. // // Also break dependencies on partial defs and undef uses. -void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { +void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, + bool Kill) { assert(!MI->isDebugValue() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. 
unsigned OpNum; - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); + if (breakDependency) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); + // We don't need to bother trying to break a dependency if this + // instruction has a true dependency on that register through another + // operand - we'll have to wait for it to be available regardless. + if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } } const MCInstrDesc &MCID = MI->getDesc(); for (unsigned i = 0, @@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr << '\t' << *MI); - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); + if (breakDependency) { + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } // How many instructions since rx was last written? LiveRegs[rx].Def = CurInstr; @@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { /// only do it on demand. Note that the occurrence of undefined register reads /// that should be broken is very rare, but when they occur we may have many in /// a single block. -void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { +void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { if (UndefReads.empty()) return; @@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { // A hard instruction only works in one domain. All input registers will be // forced into that domain. -void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { +void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { // Collapse all uses. for (unsigned i = mi->getDesc().getNumDefs(), e = mi->getDesc().getNumOperands(); i != e; ++i) { @@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { } // A soft instruction can be changed to work in other domains given by mask. -void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { +void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { // Bitmask of available domains for this instruction after taking collapsed // operands into account. unsigned available = mask; @@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { } } -bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { +void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, + bool PrimaryPass) { + enterBasicBlock(MBB); + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. 
+ bool breakDependency = isBlockDone(MBB); + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { + bool Kill = false; + if (PrimaryPass) + Kill = visitInstr(&MI); + processDefs(&MI, breakDependency, Kill); + } + } + if (breakDependency) + processUndefReads(MBB); + leaveBasicBlock(MBB); +} + +bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { + return MBBInfos[MBB].PrimaryCompleted && + MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && + MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) { if (skipFunction(*mf.getFunction())) return false; MF = &mf; @@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { AliasMap[*AI].push_back(i); } + // Initialize the MMBInfos + for (auto &MBB : mf) { + MBBInfo InitialInfo; + MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); + } + + /* + * We want to visit every instruction in every basic block in order to update + * it's execution domain or break any false dependencies. However, for the + * dependency breaking, we need to know clearances from all predecessors + * (including any backedges). One way to do so would be to do two complete + * passes over all basic blocks/instructions, the first for recording + * clearances, the second to break the dependencies. However, for functions + * without backedges, or functions with a lot of straight-line code, and + * a small loop, that would be a lot of unnecessary work (since only the + * BBs that are part of the loop require two passes). As an example, + * consider the following loop. + * + * + * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT + * ^ | + * +----------------------------------+ + * + * The iteration order is as follows: + * Naive: PH A B C D A' B' C' D' + * Optimized: PH A B C A' B' C' D + * + * Note that we avoid processing D twice, because we can entirely process + * the predecessors before getting to D. We call a block that is ready + * for its second round of processing `done` (isBlockDone). Once we finish + * processing some block, we update the counters in MBBInfos and re-process + * any successors that are now done. + */ + MachineBasicBlock *Entry = &*MF->begin(); ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); - SmallVector<MachineBasicBlock*, 16> Loops; + SmallVector<MachineBasicBlock *, 4> Workqueue; for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { MachineBasicBlock *MBB = *MBBI; - enterBasicBlock(MBB); - if (SeenUnknownBackEdge) - Loops.push_back(MBB); - for (MachineInstr &MI : *MBB) - visitInstr(&MI); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // N.B: IncomingProcessed and IncomingCompleted were already updated while + // processing this block's predecessors. 
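Reading the counters for block A in the loop example above (assuming the backedge in the diagram runs from C back to A): after PH's primary pass, A has IncomingProcessed = IncomingCompleted = 1; A's own primary pass then records PrimaryIncoming = 1, but A is not yet done because only one of its two predecessors (PH and C) has been processed. When C's primary pass finishes, A's IncomingProcessed reaches 2, all three conditions in isBlockDone hold, and A is pushed back onto the work queue for its second, dependency-breaking pass; B and C then become done in turn, and D ends up needing only a single pass.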
+ MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + bool Primary = true; + Workqueue.push_back(MBB); + while (!Workqueue.empty()) { + MachineBasicBlock *ActiveMBB = &*Workqueue.back(); + Workqueue.pop_back(); + processBasicBlock(ActiveMBB, Primary); + bool Done = isBlockDone(ActiveMBB); + for (auto *Succ : ActiveMBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + Workqueue.push_back(Succ); + } + } + } + Primary = false; + } } - // Visit all the loop blocks again in order to merge DomainValues from - // back-edges. - for (MachineBasicBlock *MBB : Loops) { - enterBasicBlock(MBB); - for (MachineInstr &MI : *MBB) - if (!MI.isDebugValue()) - processDefs(&MI, false); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + processBasicBlock(MBB, false); + // Don't update successors here. We'll get to them anyway through this + // loop. + } } // Clear the LiveOuts vectors and collapse any remaining DomainValues. for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); - if (FI == LiveOuts.end() || !FI->second) + auto FI = MBBInfos.find(*MBBI); + if (FI == MBBInfos.end() || !FI->second.OutRegs) continue; for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second[i].Value) - release(FI->second[i].Value); - delete[] FI->second; + if (FI->second.OutRegs[i].Value) + release(FI->second.OutRegs[i].Value); + delete[] FI->second.OutRegs; } - LiveOuts.clear(); + MBBInfos.clear(); UndefReads.clear(); Avail.clear(); Allocator.DestroyAll(); return false; } - -FunctionPass * -llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) { - return new ExeDepsFix(RC); -} diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp new file mode 100644 index 0000000000000..0759bf6713e07 --- /dev/null +++ b/lib/CodeGen/FEntryInserter.cpp @@ -0,0 +1,55 @@ +//===-- FEntryInsertion.cpp - Patchable prologues for LLVM -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file edits function bodies to insert fentry calls. 
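// Editor's note (illustrative sketch, not part of this patch): the pass below
// keys entirely off the "fentry-call" string attribute on the IR function and
// is a no-op otherwise. A producer would tag a function roughly like this
// (Clang normally does so when -mfentry is given):

#include "llvm/IR/Function.h"

static void markForFEntry(llvm::Function &F) {
  // FEntryInserter reads this back via getFnAttribute("fentry-call").
  F.addFnAttr("fentry-call", "true");
}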
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +namespace { +struct FEntryInserter : public MachineFunctionPass { + static char ID; // Pass identification, replacement for typeid + FEntryInserter() : MachineFunctionPass(ID) { + initializeFEntryInserterPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) { + const std::string FEntryName = + MF.getFunction()->getFnAttribute("fentry-call").getValueAsString(); + if (FEntryName != "true") + return false; + + auto &FirstMBB = *MF.begin(); + auto &FirstMI = *FirstMBB.begin(); + + auto *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), + TII->get(TargetOpcode::FENTRY_CALL)); + return true; +} + +char FEntryInserter::ID = 0; +char &llvm::FEntryInserterID = FEntryInserter::ID; +INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false, + false) diff --git a/lib/CodeGen/FaultMaps.cpp b/lib/CodeGen/FaultMaps.cpp index 2acafafdb9fcc..43f3641289787 100644 --- a/lib/CodeGen/FaultMaps.cpp +++ b/lib/CodeGen/FaultMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- FaultMaps.cpp ---------------------------===// +//===- FaultMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/FaultMaps.h" - +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel, } } - const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) { switch (FT) { default: llvm_unreachable("unhandled fault type!"); - case FaultMaps::FaultingLoad: return "FaultingLoad"; + case FaultMaps::FaultingLoadStore: + return "FaultingLoadStore"; + case FaultMaps::FaultingStore: + return "FaultingStore"; } } diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 31ab86fdf2761..6be4c16c6301e 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -1,4 +1,4 @@ -//===-- GCStrategy.cpp - Garbage Collector Description --------------------===// +//===- GCStrategy.cpp - Garbage Collector Description ---------------------===// // // The LLVM Compiler Infrastructure // @@ -18,7 +18,4 @@ using namespace llvm; LLVM_INSTANTIATE_REGISTRY(GCRegistry) -GCStrategy::GCStrategy() - : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false), - CustomWriteBarriers(false), CustomRoots(false), InitRoots(true), - UsesMetadata(false) {} +GCStrategy::GCStrategy() = default; diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt 
index 76ab5d36047e5..03a8c4f5f909a 100644 --- a/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -22,7 +22,6 @@ else() set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) endif() - # In LLVMBuild.txt files, it is not possible to mark a dependency to a # library as optional. So instead, generate an empty library if we did # not ask for it. diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp index 13212212fa016..035a2ac78ed99 100644 --- a/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -24,40 +24,42 @@ using namespace llvm; bool CallLowering::lowerCall( - MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg, + MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg, ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const { - auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout(); + auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout(); // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that // we'll pass to the assigner function. SmallVector<ArgInfo, 8> OrigArgs; unsigned i = 0; - for (auto &Arg : CI.arg_operands()) { - ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}}; - setArgFlags(OrigArg, i + 1, DL, CI); + unsigned NumFixedArgs = CS.getFunctionType()->getNumParams(); + for (auto &Arg : CS.args()) { + ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, + i < NumFixedArgs}; + setArgFlags(OrigArg, i + 1, DL, CS); OrigArgs.push_back(OrigArg); ++i; } MachineOperand Callee = MachineOperand::CreateImm(0); - if (Function *F = CI.getCalledFunction()) + if (const Function *F = CS.getCalledFunction()) Callee = MachineOperand::CreateGA(F, 0); else Callee = MachineOperand::CreateReg(GetCalleeReg(), false); - ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}}; + ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}}; if (!OrigRet.Ty->isVoidTy()) - setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI); + setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS); - return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs); + return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs); } template <typename FuncInfoTy> void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const { - const AttributeSet &Attrs = FuncInfo.getAttributes(); + const AttributeList &Attrs = FuncInfo.getAttributes(); if (Attrs.hasAttribute(OpIdx, Attribute::ZExt)) Arg.Flags.setZExt(); if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) @@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx, const CallInst &FuncInfo) const; bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, - CCAssignFn *AssignFn, ArrayRef<ArgInfo> Args, ValueHandler &Handler) const { MachineFunction &MF = MIRBuilder.getMF(); @@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, unsigned NumArgs = Args.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT CurVT = MVT::getVT(Args[i].Ty); - if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo)) + if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) return false; } - for (unsigned i = 0, e = Args.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (unsigned i = 0, e = 
Args.size(), j = 0; i != e; ++i, ++j) { + assert(j < ArgLocs.size() && "Skipped too many arg locs"); + + CCValAssign &VA = ArgLocs[j]; + assert(VA.getValNo() == i && "Location doesn't correspond to current arg"); + + if (VA.needsCustom()) { + j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j)); + continue; + } if (VA.isRegLoc()) Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA); diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 89a042ffc477e..7661873784469 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -12,7 +12,10 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" @@ -21,11 +24,13 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" @@ -40,11 +45,21 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) -static void reportTranslationError(const Value &V, const Twine &Message) { - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ": " << V << '\n'; - report_fatal_error(Err.str()); +static void reportTranslationError(MachineFunction &MF, + const TargetPassConfig &TPC, + OptimizationRemarkEmitter &ORE, + OptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + ORE.emit(R); } IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { @@ -59,28 +74,31 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { unsigned IRTranslator::getOrCreateVReg(const Value &Val) { unsigned &ValReg = ValToVReg[&Val]; - // Check if this is the first time we see Val. - if (!ValReg) { - // Fill ValRegsSequence with the sequence of registers - // we need to concat together to produce the value. - assert(Val.getType()->isSized() && - "Don't know how to create an empty vreg"); - unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL}); - ValReg = VReg; - - if (auto CV = dyn_cast<Constant>(&Val)) { - bool Success = translate(*CV, VReg); - if (!Success) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return VReg; - } - reportTranslationError(Val, "unable to translate constant"); - } + + if (ValReg) + return ValReg; + + // Fill ValRegsSequence with the sequence of registers + // we need to concat together to produce the value. 
+ assert(Val.getType()->isSized() && + "Don't know how to create an empty vreg"); + unsigned VReg = + MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL)); + ValReg = VReg; + + if (auto CV = dyn_cast<Constant>(&Val)) { + bool Success = translate(*CV, VReg); + if (!Success) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + MF->getFunction()->getSubprogram(), + &MF->getFunction()->getEntryBlock()); + R << "unable to translate constant: " << ore::NV("Type", Val.getType()); + reportTranslationError(*MF, *TPC, *ORE, R); + return VReg; } } - return ValReg; + + return VReg; } int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { @@ -112,28 +130,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) { Alignment = LI->getAlignment(); ValTy = LI->getType(); - } else if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + } else { + OptimizationRemarkMissed R("gisel-irtranslator", "", &I); + R << "unable to translate memop: " << ore::NV("Opcode", &I); + reportTranslationError(*MF, *TPC, *ORE, R); return 1; - } else - llvm_unreachable("unhandled memory instruction"); + } return Alignment ? Alignment : DL->getABITypeAlignment(ValTy); } -MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { +MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; - if (!MBB) { - MBB = MF->CreateMachineBasicBlock(&BB); - MF->push_back(MBB); - - if (BB.hasAddressTaken()) - MBB->setHasAddressTaken(); - } + assert(MBB && "BasicBlock was not encountered before"); return *MBB; } +void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { + assert(NewPred && "new predecessor must be a real MachineBasicBlock"); + MachinePreds[Edge].push_back(NewPred); +} + bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: handle signed/unsigned wrapping flags. @@ -149,6 +166,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, return true; } +bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { + // -0.0 - X --> G_FNEG + if (isa<Constant>(U.getOperand(0)) && + U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) { + MIRBuilder.buildInstr(TargetOpcode::G_FNEG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(1))); + return true; + } + return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); +} + bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { const CmpInst *CI = dyn_cast<CmpInst>(&U); @@ -158,9 +187,14 @@ bool IRTranslator::translateCompare(const User &U, CmpInst::Predicate Pred = CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>( cast<ConstantExpr>(U).getPredicate()); - if (CmpInst::isIntPredicate(Pred)) MIRBuilder.buildICmp(Pred, Res, Op0, Op1); + else if (Pred == CmpInst::FCMP_FALSE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); + else if (Pred == CmpInst::FCMP_TRUE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); else MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); @@ -183,18 +217,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { // We want a G_BRCOND to the true BB followed by an unconditional branch. 
unsigned Tst = getOrCreateVReg(*BrInst.getCondition()); const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++)); - MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt); + MachineBasicBlock &TrueBB = getMBB(TrueTgt); MIRBuilder.buildBrCond(Tst, TrueBB); } const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ)); - MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt); - MIRBuilder.buildBr(TgtBB); + MachineBasicBlock &TgtBB = getMBB(BrTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the unconditional target is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); // Link successors. - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); for (const BasicBlock *Succ : BrInst.successors()) - CurBB.addSuccessor(&getOrCreateBB(*Succ)); + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -209,30 +246,52 @@ bool IRTranslator::translateSwitch(const User &U, const SwitchInst &SwInst = cast<SwitchInst>(U); const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + const BasicBlock *OrigBB = SwInst.getParent(); - LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL); for (auto &CaseIt : SwInst.cases()) { const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); - MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + const BasicBlock *TrueBB = CaseIt.getCaseSuccessor(); + MachineBasicBlock &TrueMBB = getMBB(*TrueBB); - MIRBuilder.buildBrCond(Tst, TrueBB); - CurBB.addSuccessor(&TrueBB); + MIRBuilder.buildBrCond(Tst, TrueMBB); + CurMBB.addSuccessor(&TrueMBB); + addMachineCFGPred({OrigBB, TrueBB}, &CurMBB); - MachineBasicBlock *FalseBB = + MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(SwInst.getParent()); - MF->push_back(FalseBB); - MIRBuilder.buildBr(*FalseBB); - CurBB.addSuccessor(FalseBB); + // Insert the comparison blocks one after the other. + MF->insert(std::next(CurMBB.getIterator()), FalseMBB); + MIRBuilder.buildBr(*FalseMBB); + CurMBB.addSuccessor(FalseMBB); - MIRBuilder.setMBB(*FalseBB); + MIRBuilder.setMBB(*FalseMBB); } // handle default case - MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); - MIRBuilder.buildBr(DefaultBB); - MIRBuilder.getMBB().addSuccessor(&DefaultBB); + const BasicBlock *DefaultBB = SwInst.getDefaultDest(); + MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB); + MIRBuilder.buildBr(DefaultMBB); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + CurMBB.addSuccessor(&DefaultMBB); + addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB); + + return true; +} + +bool IRTranslator::translateIndirectBr(const User &U, + MachineIRBuilder &MIRBuilder) { + const IndirectBrInst &BrInst = cast<IndirectBrInst>(U); + + const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress()); + MIRBuilder.buildBrIndirect(Tgt); + + // Link successors. 
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + for (const BasicBlock *Succ : BrInst.successors()) + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -240,47 +299,38 @@ bool IRTranslator::translateSwitch(const User &U, bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast<LoadInst>(U); - if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic()) - return false; - - assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment"); auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOLoad; unsigned Res = getOrCreateVReg(LI); unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); - LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL}; + MIRBuilder.buildLoad( Res, Addr, *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), Flags, DL->getTypeStoreSize(LI.getType()), - getMemOpAlignment(LI))); + getMemOpAlignment(LI), AAMDNodes(), nullptr, + LI.getSynchScope(), LI.getOrdering())); return true; } bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { const StoreInst &SI = cast<StoreInst>(U); - - if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic()) - return false; - - assert(!SI.isAtomic() && "only non-atomic stores supported at the moment"); auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOStore; unsigned Val = getOrCreateVReg(*SI.getValueOperand()); unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); - LLT VTy{*SI.getValueOperand()->getType(), *DL}, - PTy{*SI.getPointerOperand()->getType(), *DL}; MIRBuilder.buildStore( Val, Addr, *MF->getMachineMemOperand( MachinePointerInfo(SI.getPointerOperand()), Flags, DL->getTypeStoreSize(SI.getValueOperand()->getType()), - getMemOpAlignment(SI))); + getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSynchScope(), + SI.getOrdering())); return true; } @@ -305,7 +355,7 @@ bool IRTranslator::translateExtractValue(const User &U, uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); - MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src)); + MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset); return true; } @@ -348,12 +398,18 @@ bool IRTranslator::translateSelect(const User &U, bool IRTranslator::translateBitCast(const User &U, MachineIRBuilder &MIRBuilder) { - if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) { + // If we're bitcasting to the source type, we can reuse the source vreg. + if (getLLTForType(*U.getOperand(0)->getType(), *DL) == + getLLTForType(*U.getType(), *DL)) { + // Get the source vreg now, to avoid invalidating ValToVReg. + unsigned SrcReg = getOrCreateVReg(*U.getOperand(0)); unsigned &Reg = ValToVReg[&U]; + // If we already assigned a vreg for this bitcast, we can't change that. + // Emit a copy to satisfy the users we already emitted. 
if (Reg) - MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0))); + MIRBuilder.buildCopy(Reg, SrcReg); else - Reg = getOrCreateVReg(*U.getOperand(0)); + Reg = SrcReg; return true; } return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); @@ -375,9 +431,10 @@ bool IRTranslator::translateGetElementPtr(const User &U, Value &Op0 = *U.getOperand(0); unsigned BaseReg = getOrCreateVReg(Op0); - LLT PtrTy{*Op0.getType(), *DL}; - unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace()); - LLT OffsetTy = LLT::scalar(PtrSize); + Type *PtrIRTy = Op0.getType(); + LLT PtrTy = getLLTForType(*PtrIRTy, *DL); + Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); + LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); int64_t Offset = 0; for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U); @@ -399,8 +456,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, if (Offset != 0) { unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg); BaseReg = NewBaseReg; @@ -408,8 +465,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, } // N = N + Idx * ElementSize; - unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(ElementSizeReg, ElementSize); + unsigned ElementSizeReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize)); unsigned IdxReg = getOrCreateVReg(*Idx); if (MRI->getType(IdxReg) != OffsetTy) { @@ -428,8 +485,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, } if (Offset != 0) { - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg); return true; } @@ -438,13 +494,12 @@ bool IRTranslator::translateGetElementPtr(const User &U, return true; } -bool IRTranslator::translateMemcpy(const CallInst &CI, - MachineIRBuilder &MIRBuilder) { - LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL}; - if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() != - 0 || - cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() != - 0 || +bool IRTranslator::translateMemfunc(const CallInst &CI, + MachineIRBuilder &MIRBuilder, + unsigned ID) { + LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); + Type *DstTy = CI.getArgOperand(0)->getType(); + if (cast<PointerType>(DstTy)->getAddressSpace() != 0 || SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0)) return false; @@ -454,14 +509,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI, Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); } - MachineOperand Callee = MachineOperand::CreateES("memcpy"); + const char *Callee; + switch (ID) { + case Intrinsic::memmove: + case Intrinsic::memcpy: { + Type *SrcTy = CI.getArgOperand(1)->getType(); + if(cast<PointerType>(SrcTy)->getAddressSpace() != 0) + return false; + Callee = ID == Intrinsic::memcpy ? 
"memcpy" : "memmove"; + break; + } + case Intrinsic::memset: + Callee = "memset"; + break; + default: + return false; + } - return CLI->lowerCall(MIRBuilder, Callee, + return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), + MachineOperand::CreateES(Callee), CallLowering::ArgInfo(0, CI.getType()), Args); } void IRTranslator::getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF)); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD); MIB.addDef(DstReg); @@ -482,7 +555,7 @@ void IRTranslator::getStackGuard(unsigned DstReg, bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder) { - LLT Ty{*CI.getOperand(0)->getType(), *DL}; + LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL); LLT s1 = LLT::scalar(1); unsigned Width = Ty.getSizeInBits(); unsigned Res = MRI->createGenericVirtualRegister(Ty); @@ -494,8 +567,8 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, .addUse(getOrCreateVReg(*CI.getOperand(1))); if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) { - unsigned Zero = MRI->createGenericVirtualRegister(s1); - EntryBuilder.buildConstant(Zero, 0); + unsigned Zero = getOrCreateVReg( + *Constant::getNullValue(Type::getInt1Ty(CI.getContext()))); MIB.addUse(Zero); } @@ -508,12 +581,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, switch (ID) { default: break; - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - // FIXME: these obviously need to be supported properly. - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // Stack coloring is not enabled in O0 (which we care about now) so we can + // drop these. Make sure someone notices when we start compiling at higher + // opts though. + if (MF->getTarget().getOptLevel() != CodeGenOpt::None) + return false; + return true; + case Intrinsic::dbg_declare: { + const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI); + assert(DI.getVariable() && "Missing variable"); + + const Value *Address = DI.getAddress(); + if (!Address || isa<UndefValue>(Address)) { + DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); + return true; + } + + assert(DI.getVariable()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + auto AI = dyn_cast<AllocaInst>(Address); + if (AI && AI->isStaticAlloca()) { + // Static allocas are tracked at the MF level, no need for DBG_VALUE + // instructions (in fact, they get ignored if they *do* exist). + MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), + getOrCreateFrameIndex(*AI), DI.getDebugLoc()); + } else + MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address), + DI.getVariable(), DI.getExpression()); + return true; + } + case Intrinsic::vaend: + // No target I know of cares about va_end. Certainly no in-tree target + // does. Simplest intrinsic ever! 
return true; + case Intrinsic::vastart: { + auto &TLI = *MF->getSubtarget().getTargetLowering(); + Value *Ptr = CI.getArgOperand(0); + unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; + + MIRBuilder.buildInstr(TargetOpcode::G_VASTART) + .addUse(getOrCreateVReg(*Ptr)) + .addMemOperand(MF->getMachineMemOperand( + MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0)); + return true; + } + case Intrinsic::dbg_value: { + // This form of DBG_VALUE is target-independent. + const DbgValueInst &DI = cast<DbgValueInst>(CI); + const Value *V = DI.getValue(); + assert(DI.getVariable()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + if (!V) { + // Currently the optimizer can produce this; insert an undef to + // help debugging. Probably the optimizer should not do this. + MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + } else if (const auto *CI = dyn_cast<Constant>(V)) { + MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + } else { + unsigned Reg = getOrCreateVReg(*V); + // FIXME: This does not handle register-indirect values at offset 0. The + // direct/indirect thing shouldn't really be handled by something as + // implicit as reg+noreg vs reg+imm in the first palce, but it seems + // pretty baked in right now. + if (DI.getOffset() != 0) + MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(), + DI.getExpression()); + else + MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), + DI.getExpression()); + } + return true; + } case Intrinsic::uadd_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder); case Intrinsic::sadd_with_overflow: @@ -526,8 +670,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder); case Intrinsic::smul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); + case Intrinsic::pow: + MIRBuilder.buildInstr(TargetOpcode::G_FPOW) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))); + return true; case Intrinsic::memcpy: - return translateMemcpy(CI, MIRBuilder); + case Intrinsic::memmove: + case Intrinsic::memset: + return translateMemfunc(CI, MIRBuilder, ID); case Intrinsic::eh_typeid_for: { GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0)); unsigned Reg = getOrCreateVReg(CI); @@ -546,7 +698,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { - LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL}; + LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); @@ -564,18 +716,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return false; } +bool IRTranslator::translateInlineAsm(const CallInst &CI, + MachineIRBuilder &MIRBuilder) { + const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue()); + if (!IA.getConstraintString().empty()) + return false; + + unsigned ExtraInfo = 0; + if (IA.hasSideEffects()) + ExtraInfo |= InlineAsm::Extra_HasSideEffects; + if (IA.getDialect() == InlineAsm::AD_Intel) + ExtraInfo |= InlineAsm::Extra_AsmDialect; + + MIRBuilder.buildInstr(TargetOpcode::INLINEASM) + 
.addExternalSymbol(IA.getAsmString().c_str()) + .addImm(ExtraInfo); + + return true; +} + bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { const CallInst &CI = cast<CallInst>(U); auto TII = MF->getTarget().getIntrinsicInfo(); const Function *F = CI.getCalledFunction(); + if (CI.isInlineAsm()) + return translateInlineAsm(CI, MIRBuilder); + if (!F || !F->isIntrinsic()) { unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI); SmallVector<unsigned, 8> Args; for (auto &Arg: CI.arg_operands()) Args.push_back(getOrCreateVReg(*Arg)); - return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() { + MF->getFrameInfo().setHasCalls(true); + return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() { return getOrCreateVReg(*CI.getCalledValue()); }); } @@ -594,10 +769,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory()); for (auto &Arg : CI.arg_operands()) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) - MIB.addImm(CI->getSExtValue()); - else - MIB.addUse(getOrCreateVReg(*Arg)); + // Some intrinsics take metadata parameters. Reject them. + if (isa<MetadataAsValue>(Arg)) + return false; + MIB.addUse(getOrCreateVReg(*Arg)); } return true; } @@ -610,7 +785,7 @@ bool IRTranslator::translateInvoke(const User &U, const BasicBlock *ReturnBB = I.getSuccessor(0); const BasicBlock *EHPadBB = I.getSuccessor(1); - const Value *Callee(I.getCalledValue()); + const Value *Callee = I.getCalledValue(); const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) return false; @@ -634,23 +809,24 @@ bool IRTranslator::translateInvoke(const User &U, MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol); unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I); - SmallVector<CallLowering::ArgInfo, 8> Args; + SmallVector<unsigned, 8> Args; for (auto &Arg: I.arg_operands()) - Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); + Args.push_back(getOrCreateVReg(*Arg)); - if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0), - CallLowering::ArgInfo(Res, I.getType()), Args)) + if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, + [&]() { return getOrCreateVReg(*I.getCalledValue()); })) return false; MCSymbol *EndSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol); // FIXME: track probabilities. - MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB), - &ReturnMBB = getOrCreateBB(*ReturnBB); + MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB), + &ReturnMBB = getMBB(*ReturnBB); MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol); MIRBuilder.getMBB().addSuccessor(&ReturnMBB); MIRBuilder.getMBB().addSuccessor(&EHPadMBB); + MIRBuilder.buildBr(ReturnMBB); return true; } @@ -684,37 +860,158 @@ bool IRTranslator::translateLandingPad(const User &U, MIRBuilder.buildInstr(TargetOpcode::EH_LABEL) .addSym(MF->addLandingPad(&MBB)); + LLT Ty = getLLTForType(*LP.getType(), *DL); + unsigned Undef = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Undef); + + SmallVector<LLT, 2> Tys; + for (Type *Ty : cast<StructType>(LP.getType())->elements()) + Tys.push_back(getLLTForType(*Ty, *DL)); + assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); + // Mark exception register as live in. 
- SmallVector<unsigned, 2> Regs; - SmallVector<uint64_t, 2> Offsets; - LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits()); - if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) { - unsigned VReg = MRI->createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(VReg, Reg); - Regs.push_back(VReg); - Offsets.push_back(0); + unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); + if (!ExceptionReg) + return false; + + MBB.addLiveIn(ExceptionReg); + unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]), + Tmp = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildCopy(VReg, ExceptionReg); + MIRBuilder.buildInsert(Tmp, Undef, VReg, 0); + + unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); + if (!SelectorReg) + return false; + + MBB.addLiveIn(SelectorReg); + + // N.b. the exception selector register always has pointer type and may not + // match the actual IR-level type in the landingpad so an extra cast is + // needed. + unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]); + MIRBuilder.buildCopy(PtrVReg, SelectorReg); + + VReg = MRI->createGenericVirtualRegister(Tys[1]); + MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg); + MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg, + Tys[0].getSizeInBits()); + return true; +} + +bool IRTranslator::translateAlloca(const User &U, + MachineIRBuilder &MIRBuilder) { + auto &AI = cast<AllocaInst>(U); + + if (AI.isStaticAlloca()) { + unsigned Res = getOrCreateVReg(AI); + int FI = getOrCreateFrameIndex(AI); + MIRBuilder.buildFrameIndex(Res, FI); + return true; + } + + // Now we're in the harder dynamic case. + Type *Ty = AI.getAllocatedType(); + unsigned Align = + std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment()); + + unsigned NumElts = getOrCreateVReg(*AI.getArraySize()); + + Type *IntPtrIRTy = DL->getIntPtrType(AI.getType()); + LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL); + if (MRI->getType(NumElts) != IntPtrTy) { + unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy); + MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts); + NumElts = ExtElts; } - if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) { - unsigned VReg = MRI->createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(VReg, Reg); - Regs.push_back(VReg); - Offsets.push_back(p0.getSizeInBits()); + unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy); + unsigned TySize = + getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty))); + MIRBuilder.buildMul(AllocSize, NumElts, TySize); + + LLT PtrTy = getLLTForType(*AI.getType(), *DL); + auto &TLI = *MF->getSubtarget().getTargetLowering(); + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + + unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildCopy(SPTmp, SPReg); + + unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); + + // Handle alignment. We have to realign if the allocation granule was smaller + // than stack alignment, or the specific alloca requires more than stack + // alignment. + unsigned StackAlign = + MF->getSubtarget().getFrameLowering()->getStackAlignment(); + Align = std::max(Align, StackAlign); + if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) { + // Round the size of the allocation up to the stack alignment size + // by add SA-1 to the size. This doesn't overflow because we're computing + // an address inside an alloca. 
+ unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); + AllocTmp = AlignedAlloc; } - MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets); + MIRBuilder.buildCopy(SPReg, AllocTmp); + MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); + + MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); + assert(MF->getFrameInfo().hasVarSizedObjects()); return true; } -bool IRTranslator::translateStaticAlloca(const AllocaInst &AI, - MachineIRBuilder &MIRBuilder) { - if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca()) - return false; +bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) { + // FIXME: We may need more info about the type. Because of how LLT works, + // we're completely discarding the i64/double distinction here (amongst + // others). Fortunately the ABIs I know of where that matters don't use va_arg + // anyway but that's not guaranteed. + MIRBuilder.buildInstr(TargetOpcode::G_VAARG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addImm(DL->getABITypeAlignment(U.getType())); + return true; +} - assert(AI.isStaticAlloca() && "only handle static allocas now"); - unsigned Res = getOrCreateVReg(AI); - int FI = getOrCreateFrameIndex(AI); - MIRBuilder.buildFrameIndex(Res, FI); +bool IRTranslator::translateInsertElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(1)); + ValToVReg[&U] = Elt; + return true; + } + MIRBuilder.buildInsertVectorElement( + getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)), + getOrCreateVReg(*U.getOperand(1)), getOrCreateVReg(*U.getOperand(2))); + return true; +} + +bool IRTranslator::translateExtractElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getOperand(0)->getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(0)); + ValToVReg[&U] = Elt; + return true; + } + MIRBuilder.buildExtractVectorElement(getOrCreateVReg(U), + getOrCreateVReg(*U.getOperand(0)), + getOrCreateVReg(*U.getOperand(1))); + return true; +} + +bool IRTranslator::translateShuffleVector(const User &U, + MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addUse(getOrCreateVReg(*U.getOperand(1))) + .addUse(getOrCreateVReg(*U.getOperand(2))); return true; } @@ -736,11 +1033,21 @@ void IRTranslator::finishPendingPhis() { // won't create extra control flow here, otherwise we need to find the // dominating predecessor here (or perhaps force the weirder IRTranslators // to provide a simple boundary). 
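// Editor's note (not part of the patch): one IR edge can correspond to several
// machine edges at this point. For example, a switch in IR block "entry" whose
// cases all branch to "tgt" is lowered (translateSwitch above) into a chain of
// compare blocks, each ending in a conditional branch into tgt's MBB, so a PHI
// in "tgt" has a single IR predecessor "entry" but several machine
// predecessors. getMachinePredBBs() returns all of them, and the loop below
// adds one (value, MBB) operand pair per machine predecessor, while
// HandledPreds ensures each IR predecessor is expanded only once.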
+ SmallSet<const BasicBlock *, 4> HandledPreds; + for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { - assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) && - "I appear to have misunderstood Machine PHIs"); - MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i))); - MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]); + auto IRPred = PI->getIncomingBlock(i); + if (HandledPreds.count(IRPred)) + continue; + + HandledPreds.insert(IRPred); + unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i)); + for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) { + assert(Pred->isSuccessor(MIB->getParent()) && + "incorrect CFG at MachineBasicBlock level"); + MIB.addUse(ValReg); + MIB.addMBB(Pred); + } } } } @@ -752,9 +1059,7 @@ bool IRTranslator::translate(const Instruction &Inst) { case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; } } @@ -764,25 +1069,43 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) { else if (auto CF = dyn_cast<ConstantFP>(&C)) EntryBuilder.buildFConstant(Reg, *CF); else if (isa<UndefValue>(C)) - EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg); + EntryBuilder.buildUndef(Reg); else if (isa<ConstantPointerNull>(C)) EntryBuilder.buildConstant(Reg, 0); else if (auto GV = dyn_cast<GlobalValue>(&C)) EntryBuilder.buildGlobalValue(Reg, GV); - else if (auto CE = dyn_cast<ConstantExpr>(&C)) { + else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) { + if (!CAZ->getType()->isVectorTy()) + return false; + // Return the scalar if it is a <1 x Ty> vector. + if (CAZ->getNumElements() == 1) + return translate(*CAZ->getElementValue(0u), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { + Constant &Elt = *CAZ->getElementValue(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) { + // Return the scalar if it is a <1 x Ty> vector. 
+ if (CV->getNumElements() == 1) + return translate(*CV->getElementAsConstant(0), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CV->getNumElements(); ++i) { + Constant &Elt = *CV->getElementAsConstant(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CE = dyn_cast<ConstantExpr>(&C)) { switch(CE->getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; } - } else if (!TPC->isGlobalISelAbortEnabled()) + } else return false; - else - llvm_unreachable("unhandled constant kind"); return true; } @@ -793,7 +1116,7 @@ void IRTranslator::finalizeFunction() { PendingPHIs.clear(); ValToVReg.clear(); FrameIndices.clear(); - Constants.clear(); + MachinePreds.clear(); } bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { @@ -807,85 +1130,101 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); TPC = &getAnalysis<TargetPassConfig>(); + ORE = make_unique<OptimizationRemarkEmitter>(&F); assert(PendingPHIs.empty() && "stale PHIs"); - // Setup a separate basic-block for the arguments and constants, falling - // through to the IR-level Function's entry block. + // Release the per-function state when we return, whether we succeeded or not. + auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); }); + + // Setup a separate basic-block for the arguments and constants MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock(); MF->push_back(EntryBB); - EntryBB->addSuccessor(&getOrCreateBB(F.front())); EntryBuilder.setMBB(*EntryBB); + // Create all blocks, in IR order, to preserve the layout. + for (const BasicBlock &BB: F) { + auto *&MBB = BBToMBB[&BB]; + + MBB = MF->CreateMachineBasicBlock(&BB); + MF->push_back(MBB); + + if (BB.hasAddressTaken()) + MBB->setHasAddressTaken(); + } + + // Make our arguments/constants entry block fallthrough to the IR entry block. + EntryBB->addSuccessor(&getMBB(F.front())); + // Lower the actual args into this basic block. SmallVector<unsigned, 8> VRegArgs; for (const Argument &Arg: F.args()) VRegArgs.push_back(getOrCreateVReg(Arg)); - bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs); - if (!Succeeded) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - finalizeFunction(); - return false; - } - report_fatal_error("Unable to lower arguments"); + if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + MF->getFunction()->getSubprogram(), + &MF->getFunction()->getEntryBlock()); + R << "unable to lower arguments: " << ore::NV("Prototype", F.getType()); + reportTranslationError(*MF, *TPC, *ORE, R); + return false; } // And translate the function! for (const BasicBlock &BB: F) { - MachineBasicBlock &MBB = getOrCreateBB(BB); + MachineBasicBlock &MBB = getMBB(BB); // Set the insertion point of all the following translations to // the end of this basic block. 
CurBuilder.setMBB(MBB); for (const Instruction &Inst: BB) { - Succeeded &= translate(Inst); - if (!Succeeded) { - if (TPC->isGlobalISelAbortEnabled()) - reportTranslationError(Inst, "unable to translate instruction"); - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - break; - } - } - } - - if (Succeeded) { - finishPendingPhis(); - - // Now that the MachineFrameInfo has been configured, no further changes to - // the reserved registers are possible. - MRI->freezeReservedRegs(*MF); - - // Merge the argument lowering and constants block with its single - // successor, the LLVM-IR entry block. We want the basic block to - // be maximal. - assert(EntryBB->succ_size() == 1 && - "Custom BB used for lowering should have only one successor"); - // Get the successor of the current entry block. - MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); - assert(NewEntryBB.pred_size() == 1 && - "LLVM-IR entry block has a predecessor!?"); - // Move all the instruction from the current entry block to the - // new entry block. - NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), - EntryBB->end()); - - // Update the live-in information for the new entry block. - for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) - NewEntryBB.addLiveIn(LiveIn); - NewEntryBB.sortUniqueLiveIns(); + if (translate(Inst)) + continue; - // Get rid of the now empty basic block. - EntryBB->removeSuccessor(&NewEntryBB); - MF->remove(EntryBB); + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << Inst; - assert(&MF->front() == &NewEntryBB && - "New entry wasn't next in the list of basic block!"); + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + Inst.getDebugLoc(), &BB); + R << "unable to translate instruction: " << ore::NV("Opcode", &Inst) + << ": '" << InstStr.str() << "'"; + reportTranslationError(*MF, *TPC, *ORE, R); + return false; + } } - finalizeFunction(); + finishPendingPhis(); + + // Now that the MachineFrameInfo has been configured, no further changes to + // the reserved registers are possible. + MRI->freezeReservedRegs(*MF); + + // Merge the argument lowering and constants block with its single + // successor, the LLVM-IR entry block. We want the basic block to + // be maximal. + assert(EntryBB->succ_size() == 1 && + "Custom BB used for lowering should have only one successor"); + // Get the successor of the current entry block. + MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); + assert(NewEntryBB.pred_size() == 1 && + "LLVM-IR entry block has a predecessor!?"); + // Move all the instruction from the current entry block to the + // new entry block. + NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), + EntryBB->end()); + + // Update the live-in information for the new entry block. + for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) + NewEntryBB.addLiveIn(LiveIn); + NewEntryBB.sortUniqueLiveIns(); + + // Get rid of the now empty basic block. 
+ EntryBB->removeSuccessor(&NewEntryBB); + MF->remove(EntryBB); + MF->DeleteMachineBasicBlock(EntryBB); + + assert(&MF->front() == &NewEntryBB && + "New entry wasn't next in the list of basic block!"); return false; } diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 1d205cd6c9c8b..26454c1ef00f9 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -12,11 +12,15 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -44,17 +48,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -static void reportSelectionError(const MachineInstr *MI, const Twine &Message) { - const MachineFunction &MF = *MI->getParent()->getParent(); - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ":\nIn function: " << MF.getName() << '\n'; - if (MI) - Err << *MI << '\n'; - report_fatal_error(Err.str()); -} - bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // No matter what happens, whether we successfully select the function or not, + // nothing is going to use the vreg types after us. Make sure they disappear. + auto ClearVRegTypesOnReturn = + make_scope_exit([&]() { MRI.getVRegToType().clear(); }); + // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) @@ -66,11 +67,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector(); assert(ISel && "Cannot work without InstructionSelector"); + // An optimization remark emitter. Used to report failures. + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); + // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many // other MF/MFI fields we need to initialize. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - #ifndef NDEBUG // Check that our input is fully legal: we require the function to have the // Legalized property, so it should be. @@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // that it has the same layering problem, but we only use inline methods so // end up not needing to link against the GlobalISel library. 
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) - for (const MachineBasicBlock &MBB : MF) - for (const MachineInstr &MI : MBB) - if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) - reportSelectionError(&MI, "Instruction is not legal"); + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "instruction is not legal", MI); + return false; + } #endif // FIXME: We could introduce new blocks and will need to fix the outer loop. // Until then, keep track of the number of blocks to assert that we don't. const size_t NumBlocks = MF.size(); - bool Failed = false; for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) continue; @@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Selecting: \n " << MI); + // We could have folded this instruction away already, making it dead. + // If so, erase it. + if (isTriviallyDead(MI, MRI)) { + DEBUG(dbgs() << "Is dead; erasing.\n"); + MI.eraseFromParentAndMarkDBGValuesForRemoval(); + continue; + } + if (!ISel->select(MI)) { - if (TPC.isGlobalISelAbortEnabled()) - // FIXME: It would be nice to dump all inserted instructions. It's - // not - // obvious how, esp. considering select() can insert after MI. - reportSelectionError(&MI, "Cannot select"); - Failed = true; - break; + // FIXME: It would be nice to dump all inserted instructions. It's + // not obvious how, esp. considering select() can insert after MI. + reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI); + return false; } // Dump the range of instructions that MI expanded into. @@ -142,33 +151,36 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { for (auto &VRegToType : MRI.getVRegToType()) { unsigned VReg = VRegToType.first; auto *RC = MRI.getRegClassOrNull(VReg); - auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end() - ? nullptr - : &*MRI.def_instr_begin(VReg); - if (!RC) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError(MI, "VReg as no regclass after selection"); - Failed = true; - break; - } + MachineInstr *MI = nullptr; + if (!MRI.def_empty(VReg)) + MI = &*MRI.def_instr_begin(VReg); + else if (!MRI.use_empty(VReg)) + MI = &*MRI.use_instr_begin(VReg); + + if (MI && !RC) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has no regclass after selection", *MI); + return false; + } else if (!RC) + continue; if (VRegToType.second.isValid() && VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError( - MI, "VReg has explicit size different from class size"); - Failed = true; - break; + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has explicit size different from class size", + *MI); + return false; } } - MRI.getVRegToType().clear(); - - if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) { - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + if (MF.size() != NumBlocks) { + MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure", + MF.getFunction()->getSubprogram(), + /*MBB=*/nullptr); + R << "inserting blocks is not supported yet"; + reportGISelFailure(MF, TPC, MORE, R); return false; } - assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet"); // FIXME: Should we accurately track changes? 
return true; diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5c34da0dc5579..fb9d01ef8542a 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -14,6 +14,8 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -55,6 +57,45 @@ bool InstructionSelector::constrainSelectedInstRegOperands( // constrainOperandRegClass does that for us. MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(), Reg, OpI)); + + // Tie uses to defs as indicated in MCInstrDesc. + if (MO.isUse()) { + int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO); + if (DefIdx != -1) + I.tieOperands(DefIdx, OpI); + } } return true; } + +Optional<int64_t> +InstructionSelector::getConstantVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) const { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (MI->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + + if (MI->getOperand(1).isImm()) + return MI->getOperand(1).getImm(); + + if (MI->getOperand(1).isCImm() && + MI->getOperand(1).getCImm()->getBitWidth() <= 64) + return MI->getOperand(1).getCImm()->getSExtValue(); + + return None; +} + +bool InstructionSelector::isOperandImmEqual( + const MachineOperand &MO, int64_t Value, + const MachineRegisterInfo &MRI) const { + + if (MO.getReg()) + if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) + return *VRegVal == Value; + return false; +} + +bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const { + return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() && + MI.implicit_operands().begin() == MI.implicit_operands().end(); +} diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp index e86356880e99f..657ddb3079195 100644 --- a/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -16,6 +16,8 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" @@ -92,10 +94,7 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, "unexpected physical register in G_SEQUENCE"); // Finally we can replace the uses. - for (auto &Use : MRI.use_operands(ExtractReg)) { - Changed = true; - Use.setReg(OrigReg); - } + MRI.replaceRegWith(ExtractReg, OrigReg); } if (AllDefsReplaced) { @@ -114,6 +113,36 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, return Changed; } +bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, + const TargetInstrInfo &TII) { + if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) + return false; + + unsigned NumDefs = MI.getNumOperands() - 1; + unsigned SrcReg = MI.getOperand(NumDefs).getReg(); + MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg); + if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES) + return false; + + if (MergeI.getNumOperands() - 1 != NumDefs) + return false; + + // FIXME: is a COPY appropriate if the types mismatch? 
We know both registers + // are allocatable by now. + if (MRI.getType(MI.getOperand(0).getReg()) != + MRI.getType(MergeI.getOperand(1).getReg())) + return false; + + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) + MRI.replaceRegWith(MI.getOperand(Idx).getReg(), + MergeI.getOperand(Idx + 1).getReg()); + + MI.eraseFromParent(); + if (MRI.use_empty(MergeI.getOperand(0).getReg())) + MergeI.eraseFromParent(); + return true; +} + bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( @@ -122,7 +151,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n'); init(MF); const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); - const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo(); + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); LegalizerHelper Helper(MF); // FIXME: an instruction may need more than one pass before it is legal. For @@ -142,27 +171,33 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // and are assumed to be legal. if (!isPreISelGenericOpcode(MI->getOpcode())) continue; - - auto Res = Helper.legalizeInstr(*MI, LegalizerInfo); - - // Error out if we couldn't legalize this instruction. We may want to fall - // back to DAG ISel instead in the future. - if (Res == LegalizerHelper::UnableToLegalize) { - if (!TPC.isGlobalISelAbortEnabled()) { - MF.getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return false; + SmallVector<MachineInstr *, 4> WorkList; + Helper.MIRBuilder.recordInsertions( + [&](MachineInstr *MI) { WorkList.push_back(MI); }); + WorkList.push_back(&*MI); + + LegalizerHelper::LegalizeResult Res; + unsigned Idx = 0; + do { + Res = Helper.legalizeInstrStep(*WorkList[Idx]); + // Error out if we couldn't legalize this instruction. We may want to + // fall + // back to DAG ISel instead in the future. 
+ if (Res == LegalizerHelper::UnableToLegalize) { + Helper.MIRBuilder.stopRecordingInsertions(); + if (Res == LegalizerHelper::UnableToLegalize) { + reportGISelFailure(MF, TPC, MORE, "gisel-legalize", + "unable to legalize instruction", + *WorkList[Idx]); + return false; + } } - std::string Msg; - raw_string_ostream OS(Msg); - OS << "unable to legalize instruction: "; - MI->print(OS); - report_fatal_error(OS.str()); - } - - Changed |= Res == LegalizerHelper::Legalized; - } + Changed |= Res == LegalizerHelper::Legalized; + ++Idx; + } while (Idx < WorkList.size()); + Helper.MIRBuilder.stopRecordingInsertions(); + } MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -173,6 +208,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { NextMI = std::next(MI); Changed |= combineExtracts(*MI, MRI, TII); + Changed |= combineMerges(*MI, MRI, TII); } } diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index eb25b6ca268f7..20358f7ee6c2e 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -29,14 +29,13 @@ using namespace llvm; LegalizerHelper::LegalizerHelper(MachineFunction &MF) - : MRI(MF.getRegInfo()) { + : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) { MIRBuilder.setMF(MF); } LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstrStep(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - auto Action = LegalizerInfo.getAction(MI, MRI); +LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { + auto Action = LI.getAction(MI, MRI); switch (std::get<0>(Action)) { case LegalizerInfo::Legal: return AlreadyLegal; @@ -50,46 +49,32 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI, return lower(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::FewerElements: return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action)); + case LegalizerInfo::Custom: + return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized + : UnableToLegalize; default: return UnableToLegalize; } } -LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstr(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - SmallVector<MachineInstr *, 4> WorkList; - MIRBuilder.recordInsertions( - [&](MachineInstr *MI) { WorkList.push_back(MI); }); - WorkList.push_back(&MI); - - bool Changed = false; - LegalizeResult Res; - unsigned Idx = 0; - do { - Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo); - if (Res == UnableToLegalize) { - MIRBuilder.stopRecordingInsertions(); - return UnableToLegalize; - } - Changed |= Res == Legalized; - ++Idx; - } while (Idx < WorkList.size()); - - MIRBuilder.stopRecordingInsertions(); - - return Changed ? Legalized : AlreadyLegal; -} - void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, SmallVectorImpl<unsigned> &VRegs) { - unsigned Size = Ty.getSizeInBits(); - SmallVector<uint64_t, 4> Indexes; - for (int i = 0; i < NumParts; ++i) { + for (int i = 0; i < NumParts; ++i) VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); - Indexes.push_back(i * Size); + MIRBuilder.buildUnmerge(VRegs, Reg); +} + +static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { + switch (Opcode) { + case TargetOpcode::G_FADD: + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; + case TargetOpcode::G_FREM: + return Size == 64 ? 
RTLIB::REM_F64 : RTLIB::REM_F32; + case TargetOpcode::G_FPOW: + return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32; } - MIRBuilder.buildExtract(VRegs, Indexes, Reg); + llvm_unreachable("Unknown libcall function"); } LegalizerHelper::LegalizeResult @@ -101,17 +86,19 @@ LegalizerHelper::libcall(MachineInstr &MI) { switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_FADD: + case TargetOpcode::G_FPOW: case TargetOpcode::G_FREM: { auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - const char *Name = - TLI.getLibcallName(Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32); - + auto Libcall = getRTLibDesc(MI.getOpcode(), Size); + const char *Name = TLI.getLibcallName(Libcall); + MIRBuilder.getMF().getFrameInfo().setHasCalls(true); CLI.lowerCall( - MIRBuilder, MachineOperand::CreateES(Name), - {MI.getOperand(0).getReg(), Ty}, + MIRBuilder, TLI.getLibcallCallingConv(Libcall), + MachineOperand::CreateES(Name), {MI.getOperand(0).getReg(), Ty}, {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}}); MI.eraseFromParent(); return Legalized; @@ -125,19 +112,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, // FIXME: Don't know how to handle secondary types yet. if (TypeIdx != 0) return UnableToLegalize; + + MIRBuilder.setInstr(MI); + switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_ADD: { // Expand in terms of carry-setting/consuming G_ADDE instructions. - unsigned NarrowSize = NarrowTy.getSizeInBits(); int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowTy.getSizeInBits(); - MIRBuilder.setInstr(MI); - SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -152,11 +138,138 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Src2Regs[i], CarryIn); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); CarryIn = CarryOut; } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(2).getReg(); + int64_t OpStart = MI.getOperand(3).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned DstStart = i * NarrowSize; + + if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { + // No part of the insert affects this subregister, forward the original. + DstRegs.push_back(SrcRegs[i]); + continue; + } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is defined by this insert, forward the new + // value. 
+ DstRegs.push_back(OpReg); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. + int64_t ExtractOffset, InsertOffset, SegSize; + if (OpStart < DstStart) { + InsertOffset = 0; + ExtractOffset = DstStart - OpStart; + SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); + } else { + InsertOffset = OpStart - DstStart; + ExtractOffset = 0; + SegSize = + std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); + } + + unsigned SegReg = OpReg; + if (ExtractOffset != 0 || SegSize != OpSize) { + // A genuine extract is needed. + SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); + } + + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset); + DstRegs.push_back(DstReg); + } + + assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_LOAD: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT NarrowPtrTy = LLT::pointer( + MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + unsigned SrcReg = MRI.createGenericVirtualRegister(NarrowPtrTy); + unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); + + MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); + MIRBuilder.buildGEP(SrcReg, MI.getOperand(1).getReg(), Offset); + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. + MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin()); + + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_STORE: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT NarrowPtrTy = LLT::pointer( + MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + + SmallVector<unsigned, 2> SrcRegs; + extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs); + + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowPtrTy); + unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); + MIRBuilder.buildGEP(DstReg, MI.getOperand(1).getReg(), Offset); + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. 
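The narrowing cases above split wide values into NarrowTy pieces with G_UNMERGE_VALUES (via extractParts) and recombine the results with G_MERGE_VALUES, both built by the new MachineIRBuilder helpers that appear later in this diff. A minimal sketch of that builder pair, assuming MRI and MIRBuilder are already set up and Wide is an s64 vreg (all names illustrative):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

void splitAndRejoin(llvm::MachineRegisterInfo &MRI,
                    llvm::MachineIRBuilder &MIRBuilder, unsigned Wide) {
  const llvm::LLT S32 = llvm::LLT::scalar(32);
  unsigned Lo = MRI.createGenericVirtualRegister(S32);
  unsigned Hi = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildUnmerge({Lo, Hi}, Wide);   // s64 -> s32, s32
  unsigned Joined = MRI.createGenericVirtualRegister(llvm::LLT::scalar(64));
  MIRBuilder.buildMerge(Joined, {Lo, Hi});   // s32, s32 -> s64
}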
+ MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin()); + } + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_CONSTANT: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + const APInt &Cst = MI.getOperand(1).getCImm()->getValue(); + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + ConstantInt *CI = + ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize)); + MIRBuilder.buildConstant(DstReg, *CI); + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -175,7 +288,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_MUL: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: - case TargetOpcode::G_SUB: { + case TargetOpcode::G_SUB: + case TargetOpcode::G_SHL: { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. @@ -195,10 +309,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: { - unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV - ? TargetOpcode::G_SEXT - : TargetOpcode::G_ZEXT; + case TargetOpcode::G_UDIV: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_ASHR + ? TargetOpcode::G_SEXT + : TargetOpcode::G_ZEXT; unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse( @@ -218,6 +335,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SELECT: { + if (TypeIdx != 0) + return UnableToLegalize; + + // Perform operation at larger width (any extension is fine here, high bits + // don't affect the result) and then truncate the result back to the + // original type. 
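Each of these widenScalar cases follows the shape the comment above describes: extend the inputs to WideTy, perform the operation there, then truncate back. A minimal sketch for a hypothetical s8 addition widened to s32, using only the builders shown in this diff (function and register names are illustrative):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

void widenAddTo32(llvm::MachineRegisterInfo &MRI,
                  llvm::MachineIRBuilder &MIRBuilder,
                  unsigned Dst, unsigned LHS, unsigned RHS) { // all s8 vregs
  const llvm::LLT S32 = llvm::LLT::scalar(32);
  unsigned LHSExt = MRI.createGenericVirtualRegister(S32);
  unsigned RHSExt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildAnyExt(LHSExt, LHS); // high bits are irrelevant for the add
  MIRBuilder.buildAnyExt(RHSExt, RHS);
  unsigned DstExt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildAdd(DstExt, LHSExt, RHSExt);
  MIRBuilder.buildTrunc(Dst, DstExt);  // back to the original s8 width
}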
+ unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); + unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg()); + MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg()); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(TargetOpcode::G_SELECT) + .addDef(DstExt) + .addReg(MI.getOperand(1).getReg()) + .addUse(Src1Ext) + .addUse(Src2Ext); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(DstExt) + .addUse(MI.getOperand(1).getReg()); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: { + if (TypeIdx != 1) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + + if (MI.getOpcode() == TargetOpcode::G_SITOFP) { + MIRBuilder.buildSExt(SrcExt, Src); + } else { + assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op"); + MIRBuilder.buildZExt(SrcExt, Src); + } + + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(MI.getOperand(0).getReg()) + .addUse(SrcExt); + + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(SrcExt, Src); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(), + MI.getOperand(3).getImm()); + for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) { + MIB.addReg(MI.getOperand(OpNum).getReg()); + MIB.addImm(MI.getOperand(OpNum + 1).getImm()); + } + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_LOAD: { assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == WideTy.getSizeInBits() && @@ -231,12 +427,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_STORE: { - assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == - WideTy.getSizeInBits() && - "illegal to increase number of bytes modified by a store"); + if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) || + WideTy != LLT::scalar(8)) + return UnableToLegalize; + + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + auto Content = TLI.getBooleanContents(false, false); + + unsigned ExtOp = TargetOpcode::G_ANYEXT; + if (Content == TargetLoweringBase::ZeroOrOneBooleanContent) + ExtOp = TargetOpcode::G_ZEXT; + else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent) + ExtOp = TargetOpcode::G_SEXT; + else + ExtOp = TargetOpcode::G_ANYEXT; unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); - MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg()); + MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse( + MI.getOperand(0).getReg()); MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(), **MI.memoperands_begin()); MI.eraseFromParent(); @@ -315,6 +523,83 @@ 
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SMULO: + case TargetOpcode::G_UMULO: { + // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the + // result. + unsigned Res = MI.getOperand(0).getReg(); + unsigned Overflow = MI.getOperand(1).getReg(); + unsigned LHS = MI.getOperand(2).getReg(); + unsigned RHS = MI.getOperand(3).getReg(); + + MIRBuilder.buildMul(Res, LHS, RHS); + + unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO + ? TargetOpcode::G_SMULH + : TargetOpcode::G_UMULH; + + unsigned HiPart = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(Opcode) + .addDef(HiPart) + .addUse(LHS) + .addUse(RHS); + + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildConstant(Zero, 0); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FNEG: { + // TODO: Handle vector types once we are able to + // represent them. + if (Ty.isVector()) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + Type *ZeroTy; + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + switch (Ty.getSizeInBits()) { + case 16: + ZeroTy = Type::getHalfTy(Ctx); + break; + case 32: + ZeroTy = Type::getFloatTy(Ctx); + break; + case 64: + ZeroTy = Type::getDoubleTy(Ctx); + break; + default: + llvm_unreachable("unexpected floating-point type"); + } + ConstantFP &ZeroForNegation = + *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy)); + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildFConstant(Zero, ZeroForNegation); + MIRBuilder.buildInstr(TargetOpcode::G_FSUB) + .addDef(Res) + .addUse(Zero) + .addUse(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FSUB: { + // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). + // First, check if G_FNEG is marked as Lower. If so, we may + // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
+ if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + unsigned LHS = MI.getOperand(1).getReg(); + unsigned RHS = MI.getOperand(2).getReg(); + unsigned Neg = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS); + MIRBuilder.buildInstr(TargetOpcode::G_FADD) + .addDef(Res) + .addUse(LHS) + .addUse(Neg); + MI.eraseFromParent(); + return Legalized; + } } } @@ -335,7 +620,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, MIRBuilder.setInstr(MI); SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -343,10 +627,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); } - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index e49662075ed5e..eaf4056e47eaf 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -41,6 +41,8 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { DefaultActions[TargetOpcode::G_STORE] = NarrowScalar; DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; + DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; + DefaultActions[TargetOpcode::G_FNEG] = Lower; } void LegalizerInfo::computeTables() { @@ -71,28 +73,36 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // These *have* to be implemented for now, they're the fundamental basis of // how everything else is transformed. - // Nothing is going to go well with types that aren't a power of 2 yet, so - // don't even try because we might make things worse. - if (!isPowerOf2_64(Aspect.Type.getSizeInBits())) - return std::make_pair(Unsupported, LLT()); - // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). if (Aspect.Opcode == TargetOpcode::G_SEQUENCE || - Aspect.Opcode == TargetOpcode::G_EXTRACT) + Aspect.Opcode == TargetOpcode::G_EXTRACT || + Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || + Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); + LLT Ty = Aspect.Type; LegalizeAction Action = findInActions(Aspect); + // LegalizerHelper is not able to handle non-power-of-2 types right now, so do + // not try to legalize them unless they are marked as Legal or Custom. + // FIXME: This is a temporary hack until the general non-power-of-2 + // legalization works. 
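The Lower default for G_FNEG added above and the new Custom action handled below are driven from a target's LegalizerInfo subclass. A minimal sketch with a hypothetical MyTargetLegalizerInfo, assuming the setAction/computeTables interface used by the in-tree targets of this period and a virtual legalizeCustom hook matching the default definition added at the end of this file:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/Target/TargetOpcodes.h"

namespace {
struct MyTargetLegalizerInfo : llvm::LegalizerInfo {
  MyTargetLegalizerInfo() {
    using namespace llvm;
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    setAction({TargetOpcode::G_FREM, S32}, Libcall); // expanded via RT library
    setAction({TargetOpcode::G_FNEG, S64}, Lower);   // rewritten as 0.0 - x
    setAction({TargetOpcode::G_FADD, S64}, Custom);  // handled in legalizeCustom
    computeTables();
  }
  bool legalizeCustom(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
                      llvm::MachineIRBuilder &MIRBuilder) const override {
    return false; // a real target would rewrite MI here and return true
  }
};
} // end anonymous namespace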
+ if (!isPowerOf2_64(Ty.getSizeInBits()) && + !(Action == Legal || Action == Custom)) + return std::make_pair(Unsupported, LLT()); + if (Action != NotFound) return findLegalAction(Aspect, Action); unsigned Opcode = Aspect.Opcode; - LLT Ty = Aspect.Type; if (!Ty.isVector()) { auto DefaultAction = DefaultActions.find(Aspect.Opcode); if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal) return std::make_pair(Legal, Ty); + if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower) + return std::make_pair(Lower, Ty); + if (DefaultAction == DefaultActions.end() || DefaultAction->second != NarrowScalar) return std::make_pair(Unsupported, LLT()); @@ -160,6 +170,7 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, case Legal: case Lower: case Libcall: + case Custom: return Aspect.Type; case NarrowScalar: { return findLegalType(Aspect, @@ -180,3 +191,9 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, } } } + +bool LegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + return false; +} diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c04f6e4ae897a..8d1a263395a0e 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB, void MachineIRBuilder::recordInsertions( std::function<void(MachineInstr *)> Inserted) { - InsertedInstr = Inserted; + InsertedInstr = std::move(Inserted); } void MachineIRBuilder::stopRecordingInsertions() { @@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) { return MIB; } +MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue( + unsigned Reg, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addReg(0, RegState::Debug) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue( + unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addImm(Offset) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + 
.addFrameIndex(FI) + .addImm(0) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, + unsigned Offset, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + auto MIB = buildInstr(TargetOpcode::DBG_VALUE); + if (auto *CI = dyn_cast<ConstantInt>(&C)) { + if (CI->getBitWidth() > 64) + MIB.addCImm(CI); + else + MIB.addImm(CI->getZExtValue()); + } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) { + MIB.addFPImm(CFP); + } else { + // Insert %noreg if we didn't find a usable constant and had to drop it. + MIB.addReg(0U); + } + + return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr); +} + MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) { assert(MRI->getType(Res).isPointer() && "invalid operand type"); return buildInstr(TargetOpcode::G_FRAME_INDEX) @@ -126,6 +191,17 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0, .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0, + uint32_t NumBits) { + assert(MRI->getType(Res).isPointer() && + MRI->getType(Res) == MRI->getType(Op0) && "type mismatch"); + + return buildInstr(TargetOpcode::G_PTR_MASK) + .addDef(Res) + .addUse(Op0) + .addImm(NumBits); +} + MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0, unsigned Op1) { assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && @@ -152,10 +228,27 @@ MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0, .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0, + unsigned Op1) { + assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) && + "invalid operand type"); + assert(MRI->getType(Res) == MRI->getType(Op0) && + MRI->getType(Res) == MRI->getType(Op1) && "type mismatch"); + + return buildInstr(TargetOpcode::G_AND) + .addDef(Res) + .addUse(Op0) + .addUse(Op1); +} + MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { return buildInstr(TargetOpcode::G_BR).addMBB(&Dest); } +MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { + return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); +} + MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) { return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op); } @@ -262,34 +355,56 @@ MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res, return buildInstr(Opcode).addDef(Res).addUse(Op); } -MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results, - ArrayRef<uint64_t> Indices, - unsigned Src) { -#ifndef NDEBUG - assert(Results.size() == Indices.size() && "inconsistent number of regs"); - assert(!Results.empty() && "invalid trivial extract"); - assert(std::is_sorted(Indices.begin(), Indices.end()) && - "extract offsets must be in ascending order"); +MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res, + unsigned Op) { + unsigned Opcode = TargetOpcode::COPY; + if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) + Opcode = TargetOpcode::G_ZEXT; + else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) + Opcode = TargetOpcode::G_TRUNC; - assert(MRI->getType(Src).isValid() && "invalid 
operand type"); - for (auto Res : Results) - assert(MRI->getType(Res).isValid() && "invalid operand type"); -#endif + return buildInstr(Opcode).addDef(Res).addUse(Op); +} - auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT)); - for (auto Res : Results) - MIB.addDef(Res); - MIB.addUse(Src); +MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) { + LLT SrcTy = MRI->getType(Src); + LLT DstTy = MRI->getType(Dst); + if (SrcTy == DstTy) + return buildCopy(Dst, Src); + + unsigned Opcode; + if (SrcTy.isPointer() && DstTy.isScalar()) + Opcode = TargetOpcode::G_PTRTOINT; + else if (DstTy.isPointer() && SrcTy.isScalar()) + Opcode = TargetOpcode::G_INTTOPTR; + else { + assert(!SrcTy.isPointer() && !DstTy.isPointer() && "n G_ADDRCAST yet"); + Opcode = TargetOpcode::G_BITCAST; + } - for (auto Idx : Indices) - MIB.addImm(Idx); + return buildInstr(Opcode).addDef(Dst).addUse(Src); +} - getMBB().insert(getInsertPt(), MIB); - if (InsertedInstr) - InsertedInstr(MIB); +MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, + uint64_t Index) { +#ifndef NDEBUG + assert(MRI->getType(Src).isValid() && "invalid operand type"); + assert(MRI->getType(Res).isValid() && "invalid operand type"); + assert(Index + MRI->getType(Res).getSizeInBits() <= + MRI->getType(Src).getSizeInBits() && + "extracting off end of register"); +#endif - return MIB; + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) { + assert(Index == 0 && "insertion past the end of a register"); + return buildCast(Res, Src); + } + + return buildInstr(TargetOpcode::G_EXTRACT) + .addDef(Res) + .addUse(Src) + .addImm(Index); } MachineInstrBuilder @@ -316,6 +431,64 @@ MachineIRBuilder::buildSequence(unsigned Res, return MIB; } +MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { + return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res); +} + +MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, + ArrayRef<unsigned> Ops) { + +#ifndef NDEBUG + assert(!Ops.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Ops[0]); + for (auto Reg : Ops) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() == + MRI->getType(Res).getSizeInBits() && + "input operands do not cover output register"); +#endif + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); + MIB.addDef(Res); + for (unsigned i = 0; i < Ops.size(); ++i) + MIB.addUse(Ops[i]); + return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, + unsigned Op) { + +#ifndef NDEBUG + assert(!Res.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Res[0]); + for (auto Reg : Res) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() == + MRI->getType(Op).getSizeInBits() && + "input operands do not cover output register"); +#endif + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES); + for (unsigned i = 0; i < Res.size(); ++i) + MIB.addDef(Res[i]); + MIB.addUse(Op); + return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, + unsigned Op, unsigned Index) { + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) { + assert(Index == 0 && "insertion past the end of a register"); + return buildCast(Res, Op); + } + + return buildInstr(TargetOpcode::G_INSERT) + .addDef(Res) + .addUse(Src) + .addUse(Op) + .addImm(Index); 
+} + MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, unsigned Res, bool HasSideEffects) { @@ -395,9 +568,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, if (ResTy.isScalar() || ResTy.isPointer()) assert(MRI->getType(Tst).isScalar() && "type mismatch"); else - assert(MRI->getType(Tst).isVector() && - MRI->getType(Tst).getNumElements() == - MRI->getType(Op0).getNumElements() && + assert((MRI->getType(Tst).isScalar() || + (MRI->getType(Tst).isVector() && + MRI->getType(Tst).getNumElements() == + MRI->getType(Op0).getNumElements())) && "type mismatch"); #endif @@ -408,6 +582,46 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res, + unsigned Val, + unsigned Elt, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT EltTy = MRI->getType(Elt); + LLT IdxTy = MRI->getType(Idx); + assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type"); + assert(EltTy.isScalar() && IdxTy.isScalar() && "invalid operand type"); + assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch"); + assert(ResTy.getElementType() == EltTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Elt) + .addUse(Idx); +} + +MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res, + unsigned Val, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT IdxTy = MRI->getType(Idx); + assert(ValTy.isVector() && "invalid operand type"); + assert(ResTy.isScalar() && IdxTy.isScalar() && "invalid operand type"); + assert(ValTy.getElementType() == ResTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Idx); +} + void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend) { #ifndef NDEBUG diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index cc026ef27296a..f935390a8d1bd 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -71,6 +72,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); + MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -585,18 +587,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { // LegalizerInfo as it's currently in the separate GlobalISel library. 
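The legality check and the instruction-mapping loop below route all failures through the new reportGISelFailure helper defined in GlobalISel/Utils.cpp further down in this diff. A minimal sketch of the calling pattern from a hypothetical GlobalISel pass (handleFailure, BadMI and the "gisel-mypass" remark name are illustrative):

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/TargetPassConfig.h"

bool handleFailure(llvm::MachineFunction &MF,
                   const llvm::TargetPassConfig &TPC,
                   llvm::MachineInstr &BadMI) {
  // MBFI is optional; without it the remark just carries no hotness data.
  llvm::MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
  // Sets the FailedISel property, then either emits a missed-optimization
  // remark or calls report_fatal_error, depending on
  // TargetPassConfig::isGlobalISelAbortEnabled().
  llvm::reportGISelFailure(MF, TPC, MORE, "gisel-mypass",
                           "unable to handle instruction", BadMI);
  return false;
}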
const MachineRegisterInfo &MRI = MF.getRegInfo(); if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF.getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return false; - } - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << "Instruction is not legal: " << MI << '\n'; - report_fatal_error(Err.str()); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "instruction is not legal", MI); + return false; } } } @@ -622,9 +618,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { continue; if (!assignInstr(MI)) { - if (TPC->isGlobalISelAbortEnabled()) - report_fatal_error("Unable to map instruction"); - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "unable to map instruction", MI); return false; } } @@ -968,10 +963,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const { LocalFreq == Cost.LocalFreq; } -void RegBankSelect::MappingCost::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegBankSelect::MappingCost::print(raw_ostream &OS) const { if (*this == ImpossibleCost()) { diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 49d676f11da6a..940957d021524 100644 --- a/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,10 +19,11 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses) +RegisterBank::RegisterBank( + unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses, unsigned NumRegClasses) : ID(ID), Name(Name), Size(Size) { - ContainedRegClasses.resize(200); + ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } @@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const { return &OtherRB == this; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const { print(dbgs(), /* IsForDebug */ true, TRI); } +#endif void RegisterBank::print(raw_ostream &OS, bool IsForDebug, const TargetRegisterInfo *TRI) const { diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index da5ab0b9fb7b6..b2df2f1596769 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -63,13 +63,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, #endif // NDEBUG } -RegisterBankInfo::~RegisterBankInfo() { - for (auto It : MapOfPartialMappings) - delete It.second; - for (auto It : MapOfValueMappings) - delete It.second; -} - bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { #ifndef NDEBUG for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { @@ -133,15 +126,26 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( return &RC; } +/// Check whether or not \p MI should be treated like a copy +/// for the 
mappings. +/// Copy like instruction are special for mapping because +/// they don't have actual register constraints. Moreover, +/// they sometimes have register classes assigned and we can +/// just use that instead of failing to provide a generic mapping. +static bool isCopyLike(const MachineInstr &MI) { + return MI.isCopy() || MI.isPHI() || + MI.getOpcode() == TargetOpcode::REG_SEQUENCE; +} + RegisterBankInfo::InstructionMapping RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // For copies we want to walk over the operands and try to find one // that has a register bank since the instruction itself will not get // us any constraint. - bool isCopyLike = MI.isCopy() || MI.isPHI(); + bool IsCopyLike = isCopyLike(MI); // For copy like instruction, only the mapping of the definition // is important. The rest is not constrained. - unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands(); + unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands(); RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1, /*OperandsMapping*/ nullptr, @@ -175,7 +179,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // For copy-like instruction, we want to reuse the register bank // that is already set on Reg, if any, since those instructions do // not have any constraints. - const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr; + const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr; if (!CurRegBank) { // If this is a target specific instruction, we can deduce // the register bank from the encoding constraints. @@ -184,7 +188,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // All our attempts failed, give up. CompleteMapping = false; - if (!isCopyLike) + if (!IsCopyLike) // MI does not carry enough information to guess the mapping. return InstructionMapping(); continue; @@ -192,7 +196,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { } const ValueMapping *ValMapping = &getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank); - if (isCopyLike) { + if (IsCopyLike) { OperandsMapping[0] = ValMapping; CompleteMapping = true; break; @@ -200,7 +204,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { OperandsMapping[OpIdx] = ValMapping; } - if (isCopyLike && !CompleteMapping) + if (IsCopyLike && !CompleteMapping) // No way to deduce the type from what we have. return InstructionMapping(); @@ -234,8 +238,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, ++NumPartialMappingsCreated; - const PartialMapping *&PartMapping = MapOfPartialMappings[Hash]; - PartMapping = new PartialMapping{StartIdx, Length, RegBank}; + auto &PartMapping = MapOfPartialMappings[Hash]; + PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank); return *PartMapping; } @@ -268,8 +272,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, ++NumValueMappingsCreated; - const ValueMapping *&ValMapping = MapOfValueMappings[Hash]; - ValMapping = new ValueMapping{BreakDown, NumBreakDowns}; + auto &ValMapping = MapOfValueMappings[Hash]; + ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns); return *ValMapping; } @@ -282,9 +286,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // The addresses of the value mapping are unique. // Therefore, we can use them directly to hash the operand mapping. 
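These mapping caches now hand ownership to the map through std::unique_ptr and build each entry at most once per hash, instead of the manual new/delete they replaced. A minimal sketch of that get-or-create shape (the Mapping struct, Cache and Key names are illustrative):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include <memory>

struct Mapping {
  unsigned StartIdx, Length;
  Mapping(unsigned S, unsigned L) : StartIdx(S), Length(L) {}
};

const Mapping &getOrCreate(
    llvm::DenseMap<unsigned, std::unique_ptr<Mapping>> &Cache,
    unsigned StartIdx, unsigned Length) {
  unsigned Key = static_cast<unsigned>(llvm::hash_combine(StartIdx, Length));
  auto &Entry = Cache[Key]; // default-constructs an empty slot on a miss
  if (!Entry)               // allocate the mapping only on first request
    Entry = llvm::make_unique<Mapping>(StartIdx, Length);
  return *Entry;
}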
hash_code Hash = hash_combine_range(Begin, End); - const auto &It = MapOfOperandsMappings.find(Hash); - if (It != MapOfOperandsMappings.end()) - return It->second; + auto &Res = MapOfOperandsMappings[Hash]; + if (Res) + return Res.get(); ++NumOperandsMappingsCreated; @@ -293,8 +297,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // mapping, because we use the pointer of the ValueMapping // to hash and we expect them to uniquely identify an instance // of value mapping. - ValueMapping *&Res = MapOfOperandsMappings[Hash]; - Res = new ValueMapping[std::distance(Begin, End)]; + Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End)); unsigned Idx = 0; for (Iterator It = Begin; It != End; ++It, ++Idx) { const ValueMapping *ValMap = *It; @@ -302,7 +305,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { continue; Res[Idx] = *ValMap; } - return Res; + return Res.get(); } const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( @@ -349,6 +352,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); DEBUG(dbgs() << "Applying default-like mapping\n"); for (unsigned OpIdx = 0, EndIdx = OpdMapper.getInstrMapping().getNumOperands(); @@ -359,6 +363,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { DEBUG(dbgs() << " is not a register, nothing to be done\n"); continue; } + if (!MO.getReg()) { + DEBUG(dbgs() << " is %%noreg, nothing to be done\n"); + continue; + } + assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns != + 0 && + "Invalid mapping"); assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns == 1 && "This mapping is too complex for this function"); @@ -368,9 +379,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); continue; } - DEBUG(dbgs() << " changed, replace " << MO.getReg()); - MO.setReg(*NewRegs.begin()); - DEBUG(dbgs() << " with " << MO.getReg()); + unsigned OrigReg = MO.getReg(); + unsigned NewReg = *NewRegs.begin(); + DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr)); + MO.setReg(NewReg); + DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr)); + + // The OperandsMapper creates plain scalar, we may have to fix that. + // Check if the types match and if not, fix that. + LLT OrigTy = MRI.getType(OrigReg); + LLT NewTy = MRI.getType(NewReg); + if (OrigTy != NewTy) { + assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() && + "Types with difference size cannot be handled by the default " + "mapping"); + DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to " + << OrigTy); + MRI.setType(NewReg, OrigTy); + } + DEBUG(dbgs() << '\n'); } } @@ -400,10 +427,12 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg, //------------------------------------------------------------------------------ // Helper classes implementation. 
//------------------------------------------------------------------------------ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif bool RegisterBankInfo::PartialMapping::verify() const { assert(RegBank && "Register bank not set"); @@ -451,10 +480,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const { OS << "#BreakDown: " << NumBreakDowns << " "; @@ -472,8 +503,7 @@ bool RegisterBankInfo::InstructionMapping::verify( // Check that all the register operands are properly mapped. // Check the constructor invariant. // For PHI, we only care about mapping the definition. - assert(NumOperands == - ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) && + assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) && "NumOperands must match, see constructor"); assert(MI.getParent() && MI.getParent()->getParent() && "MI must be connected to a MachineFunction"); @@ -503,10 +533,12 @@ bool RegisterBankInfo::InstructionMapping::verify( return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const { OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; @@ -576,6 +608,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) { for (unsigned &NewVReg : NewVRegsForOpIdx) { assert(PartMap != ValMapping.end() && "Out-of-bound access"); assert(NewVReg == 0 && "Register has already been created"); + // The new registers are always bound to scalar with the right size. + // The actual type has to be set when the target does the mapping + // of the instruction. + // The rationale is that this generic code cannot guess how the + // target plans to split the input type. 
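As the comment above says, the registers created here carry only a size, and applyDefaultMapping earlier in this file restores the real type with MRI.setType() once the target has decided on the mapping. A minimal sketch of that two-step typing, assuming the LLT::scalar/LLT::vector factories of this period (retypeExample and the concrete sizes are illustrative):

#include "llvm/CodeGen/MachineRegisterInfo.h"

void retypeExample(llvm::MachineRegisterInfo &MRI) {
  // A partial mapping only knows the width, so the vreg starts as a scalar...
  unsigned NewVReg = MRI.createGenericVirtualRegister(llvm::LLT::scalar(128));
  // ...and the target's mapping later re-applies the real type of the same
  // size, which is exactly what applyDefaultMapping does with setType().
  MRI.setType(NewVReg, llvm::LLT::vector(2, 64)); // <2 x s64>, still 128 bits
}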
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length)); MRI.setRegBank(NewVReg, *PartMap->RegBank); ++PartMap; @@ -619,10 +656,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx, return Res; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const { print(dbgs(), true); dbgs() << '\n'; } +#endif void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS, bool ForDebug) const { diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index e50091833c26d..606a59680a3d4 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -43,3 +46,50 @@ unsigned llvm::constrainOperandRegClass( return Reg; } + +bool llvm::isTriviallyDead(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + // If we can move an instruction, we can remove it. Otherwise, it has + // a side-effect of some sort. + bool SawStore = false; + if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore)) + return false; + + // Instructions without side-effects are dead iff they only define dead vregs. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI.use_nodbg_empty(Reg)) + return false; + } + return true; +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + MachineOptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + MORE.emit(R); +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + const char *PassName, StringRef Msg, + const MachineInstr &MI) { + MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ", + MI.getDebugLoc(), MI.getParent()); + R << Msg << ": " << ore::MNV("Inst", MI); + reportGISelFailure(MF, TPC, MORE, R); +} diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index b9f3d86eabd84..37fe41582333d 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, return TExit && TExit == FalseBBI.BB; } -/// Shrink the provided inclusive range by one instruction. -/// If the range was one instruction (\p It == \p Begin), It is not modified, -/// but \p Empty is set to true. 
-static inline void shrinkInclusiveRange( - MachineBasicBlock::iterator &Begin, - MachineBasicBlock::iterator &It, - bool &Empty) { - if (It == Begin) - Empty = true; - else - It--; -} - /// Count duplicated instructions and move the iterators to show where they /// are. /// @param TIB True Iterator Begin @@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions( while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count. TIB = skipDebugInstructionsForward(TIB, TIE); - if(TIB == TIE) - break; FIB = skipDebugInstructionsForward(FIB, FIE); - if(FIB == FIE) + if (TIB == TIE || FIB == FIE) break; if (!TIB->isIdenticalTo(*FIB)) break; @@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions( if (TIB == TIE || FIB == FIE) return true; // Now, in preparation for counting duplicate instructions at the ends of the - // blocks, move the end iterators up past any branch instructions. - --TIE; - --FIE; - - // After this point TIB and TIE define an inclusive range, which means that - // TIB == TIE is true when there is one more instruction to consider, not at - // the end. Because we may not be able to go before TIB, we need a flag to - // indicate a completely empty range. - bool TEmpty = false, FEmpty = false; - - // Upon exit TIE and FIE will both point at the last non-shared instruction. - // They need to be moved forward to point past the last non-shared - // instruction if the range they delimit is non-empty. - auto IncrementEndIteratorsOnExit = make_scope_exit([&]() { - if (!TEmpty) - ++TIE; - if (!FEmpty) - ++FIE; - }); + // blocks, switch to reverse_iterators. Note that getReverse() returns an + // iterator that points to the same instruction, unlike std::reverse_iterator. + // We have to do our own shifting so that we get the same range. + MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse()); + MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse()); + const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse()); + const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse()); if (!TBB.succ_empty() || !FBB.succ_empty()) { if (SkipUnconditionalBranches) { - while (!TEmpty && TIE->isUnconditionalBranch()) - shrinkInclusiveRange(TIB, TIE, TEmpty); - while (!FEmpty && FIE->isUnconditionalBranch()) - shrinkInclusiveRange(FIB, FIE, FEmpty); + while (RTIE != RTIB && RTIE->isUnconditionalBranch()) + ++RTIE; + while (RFIE != RFIB && RFIE->isUnconditionalBranch()) + ++RFIE; } } - // If Dups1 includes all of a block, then don't count duplicate - // instructions at the end of the blocks. - if (TEmpty || FEmpty) - return true; - // Count duplicate instructions at the ends of the blocks. - while (!TEmpty && !FEmpty) { + while (RTIE != RTIB && RFIE != RFIB) { // Skip dbg_value instructions. These do not count. - TIE = skipDebugInstructionsBackward(TIE, TIB); - FIE = skipDebugInstructionsBackward(FIE, FIB); - TEmpty = TIE == TIB && TIE->isDebugValue(); - FEmpty = FIE == FIB && FIE->isDebugValue(); - if (TEmpty || FEmpty) + // Note that these are reverse iterators going forward. + RTIE = skipDebugInstructionsForward(RTIE, RTIB); + RFIE = skipDebugInstructionsForward(RFIE, RFIB); + if (RTIE == RTIB || RFIE == RFIB) break; - if (!TIE->isIdenticalTo(*FIE)) + if (!RTIE->isIdenticalTo(*RFIE)) break; // We have to verify that any branch instructions are the same, and then we // don't count them toward the # of duplicate instructions. 
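The rewritten backward scan above leans on getReverse(), which, as the comment in the hunk notes, keeps pointing at the same instruction, unlike std::reverse_iterator, so explicit std::next() shifts are needed at both ends. A minimal sketch of the conversion (visitBackwards and the empty loop body are illustrative):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include <iterator>

void visitBackwards(llvm::MachineBasicBlock::iterator B,
                    llvm::MachineBasicBlock::iterator E) {
  auto RBegin = std::next(E.getReverse()); // last instruction inside [B, E)
  auto REnd = std::next(B.getReverse());   // one before B: the reverse end
  for (auto RI = RBegin; RI != REnd; ++RI) {
    // visit *RI, walking from std::prev(E) back down to B
  }
}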
- if (!TIE->isBranch()) + if (!RTIE->isBranch()) ++Dups2; - shrinkInclusiveRange(TIB, TIE, TEmpty); - shrinkInclusiveRange(FIB, FIE, FEmpty); + ++RTIE; + ++RFIE; } + TIE = std::next(RTIE.getReverse()); + FIE = std::next(RFIE.getReverse()); return true; } @@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions( static void verifySameBranchInstructions( MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { - MachineBasicBlock::iterator B1 = MBB1->begin(); - MachineBasicBlock::iterator B2 = MBB2->begin(); - MachineBasicBlock::iterator E1 = std::prev(MBB1->end()); - MachineBasicBlock::iterator E2 = std::prev(MBB2->end()); - bool Empty1 = false, Empty2 = false; - while (!Empty1 && !Empty2) { - E1 = skipDebugInstructionsBackward(E1, B1); - E2 = skipDebugInstructionsBackward(E2, B2); - Empty1 = E1 == B1 && E1->isDebugValue(); - Empty2 = E2 == B2 && E2->isDebugValue(); - - if (Empty1 && Empty2) + const MachineBasicBlock::reverse_iterator B1 = MBB1->rend(); + const MachineBasicBlock::reverse_iterator B2 = MBB2->rend(); + MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin(); + MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin(); + while (E1 != B1 && E2 != B2) { + skipDebugInstructionsForward(E1, B1); + skipDebugInstructionsForward(E2, B2); + if (E1 == B1 && E2 == B2) break; - if (Empty1) { + if (E1 == B1) { assert(!E2->isBranch() && "Branch mis-match, one block is empty."); break; } - if (Empty2) { + if (E2 == B2) { assert(!E1->isBranch() && "Branch mis-match, one block is empty."); break; } @@ -769,8 +734,8 @@ static void verifySameBranchInstructions( "Branch mis-match, branch instructions don't match."); else break; - shrinkInclusiveRange(B1, E1, Empty1); - shrinkInclusiveRange(B2, E2, Empty2); + ++E1; + ++E2; } } #endif @@ -2183,7 +2148,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // unknown probabilities into known ones. // FIXME: This usage is too tricky and in the future we would like to // eliminate all unknown probabilities in MBB. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(), FromMBB.succ_end()); @@ -2263,7 +2229,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // Normalize the probabilities of ToBBI.BB's successors with all adjustment // we've done above. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end()); FromBBI.Predicate.clear(); diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 9588dfb720586..920c2a372a9b8 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -22,6 +22,7 @@ // With the help of a runtime that understands the .fault_maps section, // faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs // a page fault. +// Store and LoadStore are also supported. 
// //===----------------------------------------------------------------------===// @@ -29,6 +30,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass { const TargetRegisterInfo *TRI = nullptr; AliasAnalysis *AA = nullptr; MachineModuleInfo *MMI = nullptr; + MachineFrameInfo *MFI = nullptr; bool analyzeBlockForNullChecks(MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList); - MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB, - MachineBasicBlock *HandlerMBB); + MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *HandlerMBB); void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList); - /// Is \p MI a memory operation that can be used to implicitly null check the - /// value in \p PointerReg? \p PrevInsts is the set of instruction seen since + enum AliasResult { + AR_NoAlias, + AR_MayAlias, + AR_WillAliasEverything + }; + /// Returns AR_NoAlias if \p MI memory operation does not alias with + /// \p PrevMI, AR_MayAlias if they may alias and AR_WillAliasEverything if + /// they may alias and any further memory operation may alias with \p PrevMI. + AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI); + + enum SuitabilityResult { + SR_Suitable, + SR_Unsuitable, + SR_Impossible + }; + /// Return SR_Suitable if \p MI a memory operation that can be used to + /// implicitly null check the value in \p PointerReg, SR_Unsuitable if + /// \p MI cannot be used to null check and SR_Impossible if there is + /// no sense to continue lookup due to any other instruction will not be able + /// to be used. \p PrevInsts is the set of instruction seen since /// the explicit null check on \p PointerReg. - bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, - ArrayRef<MachineInstr *> PrevInsts); + SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, + ArrayRef<MachineInstr *> PrevInsts); /// Return true if \p FaultingMI can be hoisted from after the the /// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a /// non-null value if we also need to (and legally can) hoist a depedency. 
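The three-valued SuitabilityResult lets the block scan distinguish a candidate that merely does not work (keep looking) from a state in which no later candidate can work either (stop immediately), for example once the base pointer has been redefined. A schematic version of such a scan, with an invented Candidate type and a caller-supplied classifier standing in for the real checks:

#include <functional>
#include <vector>

enum SuitabilityResult { SR_Suitable, SR_Unsuitable, SR_Impossible };

struct Candidate { int Id; };

// Scan candidates in order; return the first suitable one's Id, or -1.
// SR_Unsuitable skips a single candidate, SR_Impossible aborts the whole scan
// because no later candidate can succeed either.
int findFirstSuitable(
    const std::vector<Candidate> &Candidates,
    const std::function<SuitabilityResult(const Candidate &)> &Classify) {
  for (const Candidate &C : Candidates) {
    switch (Classify(C)) {
    case SR_Suitable:
      return C.Id;
    case SR_Unsuitable:
      continue;
    case SR_Impossible:
      return -1;
    }
  }
  return -1;
}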
- bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg, - ArrayRef<MachineInstr *> InstsSeenSoFar, - MachineBasicBlock *NullSucc, MachineInstr *&Dependence); + bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg, + ArrayRef<MachineInstr *> InstsSeenSoFar, + MachineBasicBlock *NullSucc, MachineInstr *&Dependence); public: static char ID; @@ -193,7 +214,7 @@ public: } bool ImplicitNullChecks::canHandle(const MachineInstr *MI) { - if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects()) + if (MI->isCall() || MI->hasUnmodeledSideEffects()) return false; auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); }; (void)IsRegMask; @@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A, unsigned RegB = MOB.getReg(); - if (TRI->regsOverlap(RegA, RegB)) + if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef())) return false; } } @@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getRegInfo().getTargetRegisterInfo(); MMI = &MF.getMMI(); + MFI = &MF.getFrameInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SmallVector<NullCheck, 16> NullCheckList; @@ -283,36 +305,91 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI, return false; } -bool ImplicitNullChecks::isSuitableMemoryOp( - MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) { +ImplicitNullChecks::AliasResult +ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI, + MachineInstr *PrevMI) { + // If it is not memory access, skip the check. + if (!(PrevMI->mayStore() || PrevMI->mayLoad())) + return AR_NoAlias; + // Load-Load may alias + if (!(MI.mayStore() || PrevMI->mayStore())) + return AR_NoAlias; + // We lost info, conservatively alias. If it was store then no sense to + // continue because we won't be able to check against it further. + if (MI.memoperands_empty()) + return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias; + if (PrevMI->memoperands_empty()) + return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias; + + for (MachineMemOperand *MMO1 : MI.memoperands()) { + // MMO1 should have a value due it comes from operation we'd like to use + // as implicit null check. + assert(MMO1->getValue() && "MMO1 should have a Value!"); + for (MachineMemOperand *MMO2 : PrevMI->memoperands()) { + if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) { + if (PSV->mayAlias(MFI)) + return AR_MayAlias; + continue; + } + llvm::AliasResult AAResult = AA->alias( + MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, + MMO1->getAAInfo()), + MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, + MMO2->getAAInfo())); + if (AAResult != NoAlias) + return AR_MayAlias; + } + } + return AR_NoAlias; +} + +ImplicitNullChecks::SuitabilityResult +ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg, + ArrayRef<MachineInstr *> PrevInsts) { int64_t Offset; unsigned BaseReg; if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) || BaseReg != PointerReg) - return false; - - // We want the load to be issued at a sane offset from PointerReg, so that - // if PointerReg is null then the load reliably page faults. - if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize)) - return false; - - // Finally, we need to make sure that the load instruction actually is - // loading from PointerReg, and there isn't some re-definition of PointerReg - // between the compare and the load. 
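The new alias classification stays deliberately conservative: loads never block other loads, an access whose memory operand info is missing may alias, and a store with missing info poisons every later candidate. The sketch below models only that decision tree; MemOp is a placeholder, and the final pointer comparison stands in for the AliasAnalysis query the pass actually performs.

enum AliasResult { AR_NoAlias, AR_MayAlias, AR_WillAliasEverything };

struct MemOp {
  bool MayLoad;
  bool MayStore;
  bool HasValueInfo;   // do we know which memory the access touches?
  const void *Value;   // identity of the accessed location when known
};

AliasResult classifyAlias(const MemOp &Cur, const MemOp &Prev) {
  // Previous instruction does not touch memory: nothing to alias with.
  if (!Prev.MayLoad && !Prev.MayStore)
    return AR_NoAlias;
  // Two loads can always be reordered.
  if (!Cur.MayStore && !Prev.MayStore)
    return AR_NoAlias;
  // Missing information: be conservative. A store we know nothing about
  // blocks every later candidate, not just this one.
  if (!Cur.HasValueInfo)
    return Cur.MayStore ? AR_WillAliasEverything : AR_MayAlias;
  if (!Prev.HasValueInfo)
    return Prev.MayStore ? AR_WillAliasEverything : AR_MayAlias;
  // With value info available, distinct locations do not alias in this toy
  // model; the real code consults AliasAnalysis instead of comparing pointers.
  return Cur.Value == Prev.Value ? AR_MayAlias : AR_NoAlias;
}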
+ return SR_Unsuitable; + + // We want the mem access to be issued at a sane offset from PointerReg, + // so that if PointerReg is null then the access reliably page faults. + if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() && + Offset < PageSize)) + return SR_Unsuitable; + + // Finally, we need to make sure that the access instruction actually is + // accessing from PointerReg, and there isn't some re-definition of PointerReg + // between the compare and the memory access. + // If PointerReg has been redefined before then there is no sense to continue + // lookup due to this condition will fail for any further instruction. + SuitabilityResult Suitable = SR_Suitable; for (auto *PrevMI : PrevInsts) - for (auto &PrevMO : PrevMI->operands()) - if (PrevMO.isReg() && PrevMO.getReg() && + for (auto &PrevMO : PrevMI->operands()) { + if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() && TRI->regsOverlap(PrevMO.getReg(), PointerReg)) - return false; - - return true; + return SR_Impossible; + + // Check whether the current memory access aliases with previous one. + // If we already found that it aliases then no need to continue. + // But we continue base pointer check as it can result in SR_Impossible. + if (Suitable == SR_Suitable) { + AliasResult AR = areMemoryOpsAliased(MI, PrevMI); + if (AR == AR_WillAliasEverything) + return SR_Impossible; + if (AR == AR_MayAlias) + Suitable = SR_Unsuitable; + } + } + return Suitable; } -bool ImplicitNullChecks::canHoistLoadInst( - MachineInstr *FaultingMI, unsigned PointerReg, - ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc, - MachineInstr *&Dependence) { +bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI, + unsigned PointerReg, + ArrayRef<MachineInstr *> InstsSeenSoFar, + MachineBasicBlock *NullSucc, + MachineInstr *&Dependence) { auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar); if (!DepResult.CanReorder) return false; @@ -359,7 +436,8 @@ bool ImplicitNullChecks::canHoistLoadInst( // The Dependency can't be re-defining the base register -- then we won't // get the memory operation on the address we want. This is already // checked in \c IsSuitableMemoryOp. - assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) && + assert(!(DependenceMO.isDef() && + TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) && "Should have been checked before!"); } @@ -481,9 +559,11 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( return false; MachineInstr *Dependence; - if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) && - canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, - Dependence)) { + SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar); + if (SR == SR_Impossible) + return false; + if (SR == SR_Suitable && + canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) { NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc, NullSucc, Dependence); return true; @@ -495,36 +575,42 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( return false; } -/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine -/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI -/// (defining the same register), and branches to HandlerMBB if the load -/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB. 
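The offset test above relies on the first page of the address space being unmapped: an access at null plus a small offset still faults, so it can stand in for the explicit null check. A minimal standalone version of that condition; the 4096-byte page size is an assumption for illustration, and the sketch additionally rejects negative offsets for clarity.

#include <cstdint>

// Assumed page size for illustration; the real pass uses its own constant.
constexpr int64_t PageSize = 4096;

// An access to (null + Offset) only faults reliably if the address still
// falls inside the unmapped page at address zero.
bool faultsWhenBaseIsNull(int64_t Offset) {
  return Offset >= 0 && Offset < PageSize;
}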
-MachineInstr * -ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI, - MachineBasicBlock *MBB, - MachineBasicBlock *HandlerMBB) { +/// Wrap a machine instruction, MI, into a FAULTING machine instruction. +/// The FAULTING instruction does the same load/store as MI +/// (defining the same register), and branches to HandlerMBB if the mem access +/// faults. The FAULTING instruction is inserted at the end of MBB. +MachineInstr *ImplicitNullChecks::insertFaultingInstr( + MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) { const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for // all targets. DebugLoc DL; - unsigned NumDefs = LoadMI->getDesc().getNumDefs(); + unsigned NumDefs = MI->getDesc().getNumDefs(); assert(NumDefs <= 1 && "other cases unhandled!"); unsigned DefReg = NoRegister; if (NumDefs != 0) { - DefReg = LoadMI->defs().begin()->getReg(); - assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && + DefReg = MI->defs().begin()->getReg(); + assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 && "expected exactly one def!"); } - auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg) + FaultMaps::FaultKind FK; + if (MI->mayLoad()) + FK = + MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad; + else + FK = FaultMaps::FaultingStore; + + auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg) + .addImm(FK) .addMBB(HandlerMBB) - .addImm(LoadMI->getOpcode()); + .addImm(MI->getOpcode()); - for (auto &MO : LoadMI->uses()) - MIB.addOperand(MO); + for (auto &MO : MI->uses()) + MIB.add(MO); - MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end()); + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); return MIB; } @@ -545,18 +631,18 @@ void ImplicitNullChecks::rewriteNullChecks( NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI); } - // Insert a faulting load where the conditional branch was originally. We - // check earlier ensures that this bit of code motion is legal. We do not - // touch the successors list for any basic block since we haven't changed - // control flow, we've just made it implicit. - MachineInstr *FaultingLoad = insertFaultingLoad( + // Insert a faulting instruction where the conditional branch was + // originally. We check earlier ensures that this bit of code motion + // is legal. We do not touch the successors list for any basic block + // since we haven't changed control flow, we've just made it implicit. + MachineInstr *FaultingInstr = insertFaultingInstr( NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc()); // Now the values defined by MemOperation, if any, are live-in of // the block of MemOperation. - // The original load operation may define implicit-defs alongside - // the loaded value. + // The original operation may define implicit-defs alongside + // the value. 
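The wrapper instruction now records which kind of access may fault, derived from the wrapped instruction's mayLoad/mayStore flags. A minimal sketch of that selection with a placeholder Instr struct:

enum FaultKind { FaultingLoad, FaultingStore, FaultingLoadStore };

struct Instr {
  bool MayLoad;
  bool MayStore;
};

// Mirror of the selection logic: an instruction that both loads and stores
// gets the combined kind, otherwise the kind follows the access direction.
FaultKind selectFaultKind(const Instr &I) {
  if (I.MayLoad)
    return I.MayStore ? FaultingLoadStore : FaultingLoad;
  return FaultingStore;
}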
MachineBasicBlock *MBB = NC.getMemOperation()->getParent(); - for (const MachineOperand &MO : FaultingLoad->operands()) { + for (const MachineOperand &MO : FaultingInstr->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 3d81184f774a3..a1cb0a0695bfa 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI); // We take the DebugLoc from MI, since OrigMI may be attributed to a - // different source location. + // different source location. auto *NewMI = LIS.getInstructionFromIndex(DefIdx); NewMI->setDebugLoc(MI.getDebugLoc()); @@ -686,7 +686,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) { return true; } -#if !defined(NDEBUG) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD // Dump the range of instructions from B to E with their slot indexes. static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index afd24067ace76..c6cc909e25d38 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) { Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::memmove: M.getOrInsertFunction("memmove", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::memset: M.getOrInsertFunction("memset", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt32Ty(M.getContext()), - DL.getIntPtrType(Context), nullptr); + DL.getIntPtrType(Context)); break; case Intrinsic::sqrt: EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl"); diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt index 86d3624a9d6e0..07ea9dcaea7ae 100644 --- a/lib/CodeGen/LLVMBuild.txt +++ b/lib/CodeGen/LLVMBuild.txt @@ -22,4 +22,4 @@ subdirectories = AsmPrinter SelectionDAG MIRParser GlobalISel type = Library name = CodeGen parent = Libraries -required_libraries = Analysis BitReader BitWriter Core MC Scalar Support Target TransformUtils +required_libraries = Analysis BitReader BitWriter Core MC ProfileData Scalar Support Target TransformUtils diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 26794e28020eb..7b1706f0f4ba9 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -42,8 +42,8 @@ static cl::opt<cl::boolOrDefault> EnableFastISelOption("fast-isel", cl::Hidden, cl::desc("Enable the \"fast\" instruction selector")); -static cl::opt<bool> - EnableGlobalISel("global-isel", cl::Hidden, cl::init(false), +static cl::opt<cl::boolOrDefault> + EnableGlobalISel("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector")); void LLVMTargetMachine::initAsmInfo() { @@ -85,7 +85,7 @@ void LLVMTargetMachine::initAsmInfo() { LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef DataLayoutString, const Triple &TT, StringRef CPU, - StringRef FS, TargetOptions Options, + StringRef FS, const TargetOptions 
&Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) { @@ -149,7 +149,9 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, TM->setFastISel(true); // Ask the target for an isel. - if (LLVM_UNLIKELY(EnableGlobalISel)) { + // Enable GlobalISel if the target wants to, but allow that to be overriden. + if (EnableGlobalISel == cl::BOU_TRUE || (EnableGlobalISel == cl::BOU_UNSET && + PassConfig->isGlobalISelEnabled())) { if (PassConfig->addIRTranslator()) return nullptr; @@ -172,11 +174,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, // Pass to reset the MachineFunction if the ISel failed. PM.add(createResetMachineFunctionPass( - PassConfig->reportDiagnosticWhenGlobalISelFallback())); + PassConfig->reportDiagnosticWhenGlobalISelFallback(), + PassConfig->isGlobalISelAbortEnabled())); // Provide a fallback path when we do not want to abort on // not-yet-supported input. - if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) && + if (!PassConfig->isGlobalISelAbortEnabled() && PassConfig->addInstSelector()) return nullptr; diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp new file mode 100644 index 0000000000000..996d40ca6e1ee --- /dev/null +++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -0,0 +1,97 @@ +///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The +/// difference is that with this pass the block frequencies are not computed +/// when the analysis pass is executed but rather when the BFI result is +/// explicitly requested by the analysis client. 
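The point of the lazy pass is that the block-frequency computation is only paid for when a client actually asks for the result, and the result is cached until releaseMemory(). The same compute-on-first-use pattern in a self-contained form; FrequencyInfo and computeExpensively() are placeholders for the real analysis.

#include <memory>

// Placeholder for an expensive analysis result.
struct FrequencyInfo {
  // ... frequencies ...
};

class LazyFrequencyInfo {
  // Owned result, built only when first requested.
  mutable std::unique_ptr<FrequencyInfo> Owned;

  static FrequencyInfo computeExpensively() {
    // Stand-in for the real calculation over the current function.
    return FrequencyInfo{};
  }

public:
  // The first call pays for the computation, later calls reuse the cache.
  const FrequencyInfo &get() const {
    if (!Owned)
      Owned = std::make_unique<FrequencyInfo>(computeExpensively());
    return *Owned;
  }

  // Analogue of releaseMemory(): drop the cached result between functions.
  void release() { Owned.reset(); }
};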
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "lazy-machine-block-freq" + +INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) + +char LazyMachineBlockFrequencyInfoPass::ID = 0; + +LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass() + : MachineFunctionPass(ID) { + initializeLazyMachineBlockFrequencyInfoPassPass( + *PassRegistry::getPassRegistry()); +} + +void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS, + const Module *M) const { + getBFI().print(OS, M); +} + +void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +void LazyMachineBlockFrequencyInfoPass::releaseMemory() { + OwnedMBFI.reset(); + OwnedMLI.reset(); + OwnedMDT.reset(); +} + +MachineBlockFrequencyInfo & +LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { + auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>(); + if (MBFI) { + DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n"); + return *MBFI; + } + + auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>(); + auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); + auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n"); + DEBUG(if (MLI) dbgs() << "LoopInfo is available\n"); + + if (!MLI) { + DEBUG(dbgs() << "Building LoopInfo on the fly\n"); + // First create a dominator tree. + DEBUG(if (MDT) dbgs() << "DominatorTree is available\n"); + + if (!MDT) { + DEBUG(dbgs() << "Building DominatorTree on the fly\n"); + OwnedMDT = make_unique<MachineDominatorTree>(); + OwnedMDT->getBase().recalculate(*MF); + MDT = OwnedMDT.get(); + } + + // Generate LoopInfo from it. 
+ OwnedMLI = make_unique<MachineLoopInfo>(); + OwnedMLI->getBase().analyze(MDT->getBase()); + MLI = OwnedMLI.get(); + } + + OwnedMBFI = make_unique<MachineBlockFrequencyInfo>(); + OwnedMBFI->calculate(*MF, MBPI, *MLI); + return *OwnedMBFI.get(); +} + +bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction( + MachineFunction &F) { + MF = &F; + return false; +} diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 834ed5f06c945..275d84e2c185f 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -14,14 +14,23 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <string> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "lexicalscopes" @@ -38,6 +47,10 @@ void LexicalScopes::reset() { /// initialize - Scan machine function and constuct lexical scope nest. void LexicalScopes::initialize(const MachineFunction &Fn) { + // Don't attempt any lexical scope creation for a NoDebug compile unit. + if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return; reset(); MF = &Fn; SmallVector<InsnRange, 4> MIRanges; @@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) { void LexicalScopes::extractLexicalScopes( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - // Scan each instruction and create scopes. First build working set of scopes. for (const auto &MBB : *MF) { const MachineInstr *RangeBeginMI = nullptr; @@ -127,6 +139,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) { LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA) { if (IA) { + // Skip scopes inlined from a NoDebug compile unit. + if (Scope->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return getOrCreateLexicalScope(IA); // Create an abstract scope for inlined function. getOrCreateAbstractScope(Scope); // Create an inlined scope for inlined function. 
@@ -181,10 +197,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope, else Parent = getOrCreateLexicalScope(InlinedAt); - I = InlinedLexicalScopeMap.emplace(std::piecewise_construct, - std::forward_as_tuple(P), - std::forward_as_tuple(Parent, Scope, - InlinedAt, false)) + I = InlinedLexicalScopeMap + .emplace(std::piecewise_construct, std::forward_as_tuple(P), + std::forward_as_tuple(Parent, Scope, InlinedAt, false)) .first; return &I->second; } @@ -241,7 +256,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) { void LexicalScopes::assignInstructionRanges( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - LexicalScope *PrevLexicalScope = nullptr; for (const auto &R : MIRanges) { LexicalScope *S = MI2ScopeMap.lookup(R.first); @@ -299,9 +313,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) { return Result; } -/// dump - Print data structures. -void LexicalScope::dump(unsigned Indent) const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const { raw_ostream &err = dbgs(); err.indent(Indent); err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n"; @@ -316,5 +329,5 @@ void LexicalScope::dump(unsigned Indent) const { for (unsigned i = 0, e = Children.size(); i != e; ++i) if (Children[i] != this) Children[i]->dump(Indent + 2); -#endif } +#endif diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index c945376560f72..f956974b1aafe 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -24,13 +24,16 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass { private: const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; + const TargetFrameLowering *TFI; LexicalScopes LS; /// Keeps track of lexical scopes associated with a user value's source @@ -127,11 +131,13 @@ private: if (int RegNo = isDbgValueDescribedByReg(MI)) { Kind = RegisterKind; Loc.RegisterLoc.RegNo = RegNo; - uint64_t Offset = + int64_t Offset = MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0; // We don't support offsets larger than 4GiB here. They are // slated to be replaced with DIExpressions anyway. - if (Offset >= (1ULL << 32)) + // With indirect debug values used for spill locations, Offset + // can be negative. + if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32)) Kind = InvalidKind; else Loc.RegisterLoc.Offset = Offset; @@ -150,7 +156,9 @@ private: /// dominates MBB. 
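The widened offset check above has one non-obvious detail: INT64_MIN must be rejected before calling std::abs, because negating the most negative 64-bit value overflows, which is undefined behaviour for signed integers. A standalone version of the same validity test:

#include <cstdint>
#include <cstdlib>

// Returns true if Offset can be encoded in the location (magnitude < 4GiB).
// INT64_MIN is rejected up front: std::abs(INT64_MIN) would overflow, which
// is undefined behaviour for signed integers.
bool isEncodableOffset(int64_t Offset) {
  if (Offset == INT64_MIN)
    return false;
  return std::abs(Offset) < (INT64_C(1) << 32);
}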
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } - void dump() const { MI.dump(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { MI.dump(); } +#endif bool operator==(const VarLoc &Other) const { return Var == Other.Var && Loc.Hash == Other.Loc.Hash; @@ -167,6 +175,11 @@ private: typedef UniqueVector<VarLoc> VarLocMap; typedef SparseBitVector<> VarLocSet; typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB; + struct SpillDebugPair { + MachineInstr *SpillInst; + MachineInstr *DebugInst; + }; + typedef SmallVector<SpillDebugPair, 4> SpillMap; /// This holds the working set of currently open ranges. For fast /// access, this is done both as a set of VarLocIDs, and a map of @@ -216,14 +229,21 @@ private: } }; + bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF, + unsigned &Reg); + int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg); + void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs); + void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, SpillMap &Spills); void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges, const VarLocMap &VarLocIDs); bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs); + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills, + bool transferSpills); bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs, const VarLocMap &VarLocIDs, @@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { // Debug Range Extension Implementation //===----------------------------------------------------------------------===// +#ifndef NDEBUG void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V, const VarLocMap &VarLocIDs, @@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, } Out << "\n"; } +#endif + +/// Given a spill instruction, extract the register and offset used to +/// address the spill location in a target independent way. +int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI, + unsigned &Reg) { + assert(MI.hasOneMemOperand() && + "Spill instruction does not have exactly one memory operand?"); + auto MMOI = MI.memoperands_begin(); + const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue(); + assert(PVal->kind() == PseudoSourceValue::FixedStack && + "Inconsistent memory operand in spill instruction"); + int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex(); + const MachineBasicBlock *MBB = MI.getParent(); + return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg); +} /// End all previous ranges related to @MI and start a new range from @MI /// if it is a DBG_VALUE instr. @@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI, unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); SparseBitVector<> KillSet; for (const MachineOperand &MO : MI.operands()) { + // Determine whether the operand is a register def. Assume that call + // instructions never clobber SP, because some backends (e.g., AArch64) + // never list SP in the regmask. 
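transferRegisterDef ends every open variable location whose register (or an alias of it) is redefined by the current instruction; the new special case keeps locations in SP alive across calls because some targets omit SP from the call's regmask. The sketch below shows only the basic clobber step, with a plain map for the open ranges and an explicit set of clobbered registers in place of alias iteration and regmasks.

#include <map>
#include <set>
#include <string>

// Open variable locations: variable name -> physical register describing it.
using OpenRanges = std::map<std::string, unsigned>;

// Close every location whose register is clobbered by the current
// instruction. Aliasing is modelled by listing all clobbered registers
// explicitly; the pass expands aliases with MCRegAliasIterator instead.
void clobberRegisters(OpenRanges &Open, const std::set<unsigned> &Clobbered) {
  for (auto I = Open.begin(); I != Open.end();) {
    if (Clobbered.count(I->second))
      I = Open.erase(I);
    else
      ++I;
  }
}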
if (MO.isReg() && MO.isDef() && MO.getReg() && - TRI->isPhysicalRegister(MO.getReg())) { + TRI->isPhysicalRegister(MO.getReg()) && + !(MI.isCall() && MO.getReg() == SP)) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) for (unsigned ID : OpenRanges.getVarLocs()) @@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI, OpenRanges.erase(KillSet, VarLocIDs); } +/// Decide if @MI is a spill instruction and return true if it is. We use 2 +/// criteria to make this decision: +/// - Is this instruction a store to a spill slot? +/// - Is there a register operand that is both used and killed? +/// TODO: Store optimization can fold spills into other stores (including +/// other spills). We do not handle this yet (more than one memory operand). +bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI, + MachineFunction *MF, unsigned &Reg) { + const MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + int FI; + const MachineMemOperand *MMO; + + // TODO: Handle multiple stores folded into one. + if (!MI.hasOneMemOperand()) + return false; + + // To identify a spill instruction, use the same criteria as in AsmPrinter. + if (!((TII->isStoreToStackSlotPostFE(MI, FI) || + TII->hasStoreToStackSlot(MI, MMO, FI)) && + FrameInfo.isSpillSlotObjectIndex(FI))) + return false; + + // In a spill instruction generated by the InlineSpiller the spilled register + // has its kill flag set. Return false if we don't find such a register. + Reg = 0; + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isUse() && MO.isKill()) { + Reg = MO.getReg(); + break; + } + } + return Reg != 0; +} + +/// A spilled register may indicate that we have to end the current range of +/// a variable and create a new one for the spill location. +/// We don't want to insert any instructions in transfer(), so we just create +/// the DBG_VALUE witout inserting it and keep track of it in @Spills. +/// It will be inserted into the BB when we're done iterating over the +/// instructions. +void LiveDebugValues::transferSpillInst(MachineInstr &MI, + OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, + SpillMap &Spills) { + unsigned Reg; + MachineFunction *MF = MI.getParent()->getParent(); + if (!isSpillInstruction(MI, MF, Reg)) + return; + + // Check if the register is the location of a debug value. + for (unsigned ID : OpenRanges.getVarLocs()) { + if (VarLocIDs[ID].isDescribedByReg() == Reg) { + DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '(' + << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); + + // Create a DBG_VALUE instruction to describe the Var in its spilled + // location, but don't insert it yet to avoid invalidating the + // iterator in our caller. + unsigned SpillBase; + int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase); + const MachineInstr *DMI = &VarLocIDs[ID].MI; + MachineInstr *SpDMI = + BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0, + DMI->getDebugVariable(), DMI->getDebugExpression()); + SpDMI->getOperand(1).setImm(SpillOffset); + DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; + SpDMI->print(dbgs(), false, TII)); + + // The newly created DBG_VALUE instruction SpDMI must be inserted after + // MI. Keep track of the pairing. + SpillDebugPair MIP = {&MI, SpDMI}; + Spills.push_back(MIP); + + // End all previous ranges of Var. + OpenRanges.erase(VarLocIDs[ID].Var); + + // Add the VarLoc to OpenRanges. 
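When a tracked register is spilled, the pass builds the replacement DBG_VALUE immediately but defers inserting it: the pairs collected in Spills are only spliced into the block after the walk, so the traversal never mutates the list it is iterating. The same record-then-apply pattern on a plain std::list of strings (the instruction names are made up; std::list iterators would in fact survive an insertion, the point is keeping the walk free of mutation):

#include <iterator>
#include <list>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::list<std::string> Block = {"cmp", "spill r5", "br"};

  // Pass 1: walk the block and only *record* what has to be inserted and
  // where, instead of mutating the block mid-walk.
  std::vector<std::pair<std::list<std::string>::iterator, std::string>> Pending;
  for (auto I = Block.begin(); I != Block.end(); ++I)
    if (*I == "spill r5")
      Pending.push_back({I, "DBG_VALUE <spill slot>"});

  // Pass 2: apply the recorded insertions after the iteration is done.
  for (auto &P : Pending)
    Block.insert(std::next(P.first), P.second);

  // Block is now: cmp, spill r5, DBG_VALUE <spill slot>, br
  return 0;
}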
+ VarLoc VL(*SpDMI, LS); + unsigned SpillLocID = VarLocIDs.insert(VL); + OpenRanges.insert(SpillLocID, VL.Var); + return; + } + } +} + /// Terminate all open ranges at the end of the current basic block. bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, @@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, /// This routine creates OpenRanges and OutLocs. bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) { + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, + SpillMap &Spills, bool transferSpills) { bool Changed = false; transferDebugValue(MI, OpenRanges, VarLocIDs); transferRegisterDef(MI, OpenRanges, VarLocIDs); + if (transferSpills) + transferSpillInst(MI, OpenRanges, VarLocIDs, Spills); Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs); return Changed; } @@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { bool OLChanged = false; bool MBBJoined = false; - VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. + VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. OpenRangesSet OpenRanges; // Ranges that are open until end of bb. - VarLocInMBB OutLocs; // Ranges that exist beyond bb. - VarLocInMBB InLocs; // Ranges that are incoming after joining. + VarLocInMBB OutLocs; // Ranges that exist beyond bb. + VarLocInMBB InLocs; // Ranges that are incoming after joining. + SpillMap Spills; // DBG_VALUEs associated with spills. DenseMap<unsigned int, MachineBasicBlock *> OrderToBB; DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; @@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { Pending; // Initialize every mbb with OutLocs. + // We are not looking at any spill instructions during the initial pass + // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE + // instructions for spills of registers that are known to be user variables + // within the BB in which the spill occurs. for (auto &MBB : MF) for (auto &MI : MBB) - transfer(MI, OpenRanges, OutLocs, VarLocIDs); + transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/false); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization", dbgs())); @@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { if (MBBJoined) { MBBJoined = false; Changed = true; + // Now that we have started to extend ranges across BBs we need to + // examine spill instructions to see whether they spill registers that + // correspond to user variables. for (auto &MI : *MBB) - OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs); + OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/true); + + // Add any DBG_VALUE instructions necessitated by spills. 
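ExtendRanges is an iterative dataflow solve: after the initial per-block pass, blocks whose inputs changed are re-queued until the out-sets stop changing. The fragment below is a generic sketch of such a fixpoint loop over a toy CFG, with locations as a bitmask and the join modelled as intersection over predecessors (the usual choice for an availability-style problem; the real pass's join and visitation order are more involved).

#include <cstdint>
#include <queue>
#include <vector>

// Each block's "locations" are a bitmask; block 0 is assumed to be the entry.
struct Block {
  std::vector<int> Preds, Succs;
  uint64_t Gen = 0;   // locations this block itself creates
};

std::vector<uint64_t> solve(const std::vector<Block> &CFG) {
  std::vector<uint64_t> Out(CFG.size(), ~UINT64_C(0));  // start at "top"
  Out[0] = CFG[0].Gen;                       // entry block: only its own defs
  std::queue<int> Work;
  for (unsigned I = 1; I < CFG.size(); ++I)
    Work.push(I);
  while (!Work.empty()) {
    int B = Work.front();
    Work.pop();
    uint64_t In = ~UINT64_C(0);
    for (int P : CFG[B].Preds)
      In &= Out[P];                          // join over predecessors
    uint64_t NewOut = In | CFG[B].Gen;       // transfer through the block
    if (NewOut != Out[B]) {                  // changed: revisit successors
      Out[B] = NewOut;
      for (int S : CFG[B].Succs)
        Work.push(S);
    }
  }
  return Out;
}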
+ for (auto &SP : Spills) + MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst), + SP.DebugInst); + Spills.clear(); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after propagating", dbgs())); @@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); TII = MF.getSubtarget().getInstrInfo(); + TFI = MF.getSubtarget().getFrameLowering(); LS.initialize(MF); bool Changed = ExtendRanges(MF); diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 0934d8cfeaa1f..bcf7c8e99c7ff 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, IsIndirect, Loc.getReg(), offset, Variable, Expression); else BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(Loc) + .add(Loc) .addImm(offset) .addMetadata(Variable) .addMetadata(Expression); @@ -1005,7 +1005,7 @@ bool LiveDebugVariables::doInitialization(Module &M) { return Pass::doInitialization(M); } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LiveDebugVariables::dump() { if (pImpl) static_cast<LDVImpl*>(pImpl)->print(dbgs()); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 623af492fcd4f..9ef9f238fdcea 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() { SubRanges = nullptr; } +void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator, + LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) { + + LaneBitmask ToApply = LaneMask; + for (SubRange &SR : subranges()) { + LaneBitmask SRMask = SR.LaneMask; + LaneBitmask Matching = SRMask & LaneMask; + if (Matching.none()) + continue; + + SubRange *MatchingRange; + if (SRMask == Matching) { + // The subrange fits (it does not cover bits outside \p LaneMask). + MatchingRange = &SR; + } else { + // We have to split the subrange into a matching and non-matching part. + // Reduce lanemask of existing lane to non-matching part. + SR.LaneMask = SRMask & ~Matching; + // Create a new subrange for the matching part + MatchingRange = createSubRangeFrom(Allocator, Matching, SR); + } + Apply(*MatchingRange); + ToApply &= ~Matching; + } + // Create a new subrange if there are uncovered bits left. + if (ToApply.any()) { + SubRange *NewRange = createSubRange(Allocator, ToApply); + Apply(*NewRange); + } +} + unsigned LiveInterval::getSize() const { unsigned Sum = 0; for (const Segment &S : segments) @@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { // When they exist, Spills.back().start <= LastStart, // and WriteI[-1].start <= LastStart. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LiveRangeUpdater::print(raw_ostream &OS) const { if (!isDirty()) { if (LR) @@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const { print(errs()); } +#endif // Determine if A and B should be coalesced. 
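refineSubRanges, added above, walks the existing subranges, splits any subrange that only partially overlaps the requested lane mask into a matching and a non-matching part, applies the callback to the matching part, and finally creates one extra subrange for lanes nothing covered. The same control flow over a toy subrange vector, with plain uint32_t bitmasks standing in for LaneBitmask and indices instead of the interval's allocator-backed list:

#include <cstdint>
#include <functional>
#include <vector>

struct SubRange {
  uint32_t LaneMask;
  // ... segments elided ...
};

void refineSubRanges(std::vector<SubRange> &SubRanges, uint32_t LaneMask,
                     const std::function<void(SubRange &)> &Apply) {
  uint32_t ToApply = LaneMask;
  // Iterate by index: new subranges may be appended while walking.
  for (size_t I = 0, E = SubRanges.size(); I != E; ++I) {
    uint32_t Matching = SubRanges[I].LaneMask & LaneMask;
    if (Matching == 0)
      continue;
    size_t MatchingIdx;
    if (SubRanges[I].LaneMask == Matching) {
      // Subrange lies entirely inside the mask: reuse it.
      MatchingIdx = I;
    } else {
      // Partial overlap: shrink the existing subrange to the non-matching
      // lanes and create a new one for the matching lanes.
      SubRanges[I].LaneMask &= ~Matching;
      SubRanges.push_back(SubRange{Matching});
      MatchingIdx = SubRanges.size() - 1;
    }
    Apply(SubRanges[MatchingIdx]);
    ToApply &= ~Matching;
  }
  // Any lanes still uncovered get a fresh subrange.
  if (ToApply != 0) {
    SubRanges.push_back(SubRange{ToApply});
    Apply(SubRanges.back());
  }
}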
static inline bool coalescable(const LiveRange::Segment &A, diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 70d34838b2374..3f5b8e19d1f0c 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -7,10 +7,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LiveInterval analysis pass which is used -// by the Linear Scan Register allocator. This pass linearizes the -// basic blocks of the function in DFS order and computes live intervals for -// each virtual and physical register. +/// \file This file implements the LiveInterval analysis pass which is used +/// by the Linear Scan Register allocator. This pass linearizes the +/// basic blocks of the function in DFS order and computes live intervals for +/// each virtual and physical register. // //===----------------------------------------------------------------------===// @@ -96,16 +96,14 @@ void LiveIntervals::releaseMemory() { RegMaskBits.clear(); RegMaskBlocks.clear(); - for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - delete RegUnitRanges[i]; + for (LiveRange *LR : RegUnitRanges) + delete LR; RegUnitRanges.clear(); // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. VNInfoAllocator.Reset(); } -/// runOnMachineFunction - calculates LiveIntervals -/// bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MF = &fn; MRI = &MF->getRegInfo(); @@ -135,14 +133,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { return true; } -/// print - Implement the dump method. void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; // Dump the regunits. - for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - if (LiveRange *LR = RegUnitRanges[i]) - OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n'; + for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit) + if (LiveRange *LR = RegUnitRanges[Unit]) + OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n'; // Dump the virtregs. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { @@ -152,8 +149,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const { } OS << "RegMasks:"; - for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i) - OS << ' ' << RegMaskSlots[i]; + for (SlotIndex Idx : RegMaskSlots) + OS << ' ' << Idx; OS << '\n'; printInstrs(OS); @@ -165,7 +162,7 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void LiveIntervals::dumpInstrs() const { +LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { printInstrs(dbgs()); } #endif @@ -177,8 +174,7 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { } -/// computeVirtRegInterval - Compute the live interval of a virtual register, -/// based on defs and uses. +/// Compute the live interval of a virtual register, based on defs and uses. void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); @@ -200,7 +196,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. 
- for (MachineBasicBlock &MBB : *MF) { + for (const MachineBasicBlock &MBB : *MF) { std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); @@ -210,7 +206,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBits.push_back(Mask); } - for (MachineInstr &MI : MBB) { + for (const MachineInstr &MI : MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; @@ -245,9 +241,9 @@ void LiveIntervals::computeRegMasks() { // interference. // -/// computeRegUnitInterval - Compute the live range of a register unit, based -/// on the uses and defs of aliasing registers. The range should be empty, -/// or contain only dead phi-defs from ABI blocks. +/// Compute the live range of a register unit, based on the uses and defs of +/// aliasing registers. The range should be empty, or contain only dead +/// phi-defs from ABI blocks. void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -257,22 +253,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { // may share super-registers. That's OK because createDeadDefs() is // idempotent. It is very rare for a register unit to have multiple roots, so // uniquing super-registers is probably not worthwhile. - for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - if (!MRI->reg_empty(*Supers)) - LRCalc->createDeadDefs(LR, *Supers); + bool IsReserved = true; + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->createDeadDefs(LR, Reg); + // A register unit is considered reserved if all its roots and all their + // super registers are reserved. + if (!MRI->isReserved(Reg)) + IsReserved = false; } } // Now extend LR to reach all uses. // Ignore uses of reserved registers. We only track defs of those. - for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - unsigned Reg = *Supers; - if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg)) - LRCalc->extendToUses(LR, Reg); + if (!IsReserved) { + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->extendToUses(LR, Reg); + } } } @@ -281,11 +285,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { LR.flushSegmentSet(); } - -/// computeLiveInRegUnits - Precompute the live ranges of any register units -/// that are live-in to an ABI block somewhere. Register values can appear -/// without a corresponding def when entering the entry block or a landing pad. -/// +/// Precompute the live ranges of any register units that are live-in to an ABI +/// block somewhere. Register values can appear without a corresponding def when +/// entering the entry block or a landing pad. 
void LiveIntervals::computeLiveInRegUnits() { RegUnitRanges.resize(TRI->getNumRegUnits()); DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); @@ -294,18 +296,15 @@ void LiveIntervals::computeLiveInRegUnits() { SmallVector<unsigned, 8> NewRanges; // Check all basic blocks for live-ins. - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - const MachineBasicBlock *MBB = &*MFI; - + for (const MachineBasicBlock &MBB : *MF) { // We only care about ABI blocks: Entry + landing pads. - if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) + if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. - SlotIndex Begin = Indexes->getMBBStartIdx(MBB); - DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); - for (const auto &LI : MBB->liveins()) { + SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); + DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber()); + for (const auto &LI : MBB.liveins()) { for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; @@ -324,16 +323,13 @@ void LiveIntervals::computeLiveInRegUnits() { DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. - for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) { - unsigned Unit = NewRanges[i]; + for (unsigned Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); - } } - static void createSegmentsForValues(LiveRange &LR, - iterator_range<LiveInterval::vni_iterator> VNIs) { - for (auto VNI : VNIs) { + iterator_range<LiveInterval::vni_iterator> VNIs) { + for (VNInfo *VNI : VNIs) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -349,7 +345,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, // Keep track of the PHIs that are in use. SmallPtrSet<VNInfo*, 8> UsedPHIs; // Blocks that have already been added to WorkList as live-out. - SmallPtrSet<MachineBasicBlock*, 16> LiveOut; + SmallPtrSet<const MachineBasicBlock*, 16> LiveOut; // Extend intervals to reach all uses in WorkList. while (!WorkList.empty()) { @@ -368,7 +364,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, !UsedPHIs.insert(VNI).second) continue; // The PHI is live, make sure the predecessors are live-out. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -384,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); // Make sure VNI is live-out from the predecessors. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -415,22 +411,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, ShrinkToUsesWorkList WorkList; // Visit all instructions reading li->reg. 
- for (MachineRegisterInfo::reg_instr_iterator - I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end(); - I != E; ) { - MachineInstr *UseMI = &*(I++); - if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg)) + unsigned Reg = li->reg; + for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) { + if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg)) continue; - SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); + SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot(); LiveQueryResult LRQ = li->Query(Idx); VNInfo *VNI = LRQ.valueIn(); if (!VNI) { // This shouldn't happen: readsVirtualRegister returns true, but there is // no live value. It is likely caused by a target getting <undef> flags // wrong. - DEBUG(dbgs() << Idx << '\t' << *UseMI + DEBUG(dbgs() << Idx << '\t' << UseMI << "Warning: Instr claims to read non-existent value in " - << *li << '\n'); + << *li << '\n'); continue; } // Special case: An early-clobber tied operand reads and writes the @@ -458,7 +452,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { bool MayHaveSplitComponents = false; - for (auto VNI : LI.valnos) { + for (VNInfo *VNI : LI.valnos) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -548,7 +542,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) { SR.segments.swap(NewLR.segments); // Remove dead PHI value numbers - for (auto VNI : SR.valnos) { + for (VNInfo *VNI : SR.valnos) { if (VNI->isUnused()) continue; const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def); @@ -571,8 +565,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR, ArrayRef<SlotIndex> Undefs) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); - for (unsigned i = 0, e = Indices.size(); i != e; ++i) - LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs); + for (SlotIndex Idx : Indices) + LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs); } void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, @@ -601,11 +595,9 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, // from each successor. typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy; VisitedTy Visited; - for (MachineBasicBlock::succ_iterator - SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end(); - SuccI != SuccE; ++SuccI) { + for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator<MachineBasicBlock*, VisitedTy> - I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited); + I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited); I != E;) { MachineBasicBlock *MBB = *I; @@ -657,9 +649,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. RU.clear(); - for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid(); - ++Units) { - const LiveRange &RURange = getRegUnit(*Units); + for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + ++Unit) { + const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) continue; RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); @@ -802,9 +794,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { // Conservatively return true instead of scanning huge predecessor lists. 
if (PHIMBB->pred_size() > 100) return true; - for (MachineBasicBlock::const_pred_iterator - PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI) - if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI))) + for (const MachineBasicBlock *Pred : PHIMBB->predecessors()) + if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred))) return true; } return false; @@ -895,7 +886,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, // IntervalUpdate class. //===----------------------------------------------------------------------===// -// HMEditor is a toolkit used by handleMove to trim or extend live intervals. +/// Toolkit used by handleMove to trim or extend live intervals. class LiveIntervals::HMEditor { private: LiveIntervals& LIS; @@ -1241,10 +1232,12 @@ private: LiveRange::iterator NewIdxIn = NewIdxOut; assert(NewIdxIn == LR.find(NewIdx.getBaseIndex())); const SlotIndex SplitPos = NewIdxDef; + OldIdxVNI = OldIdxIn->valno; // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. + OldIdxOut->valno->def = OldIdxIn->start; *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, - OldIdxIn->valno); + OldIdxOut->valno); // OldIdxIn and OldIdxVNI are now undef and can be overridden. // We Slide [NewIdxIn, OldIdxIn) down one position. // |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -| @@ -1514,8 +1507,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, } } - for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) { - unsigned Reg = OrigRegs[i]; + for (unsigned Reg : OrigRegs) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; @@ -1524,16 +1516,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, if (!LI.hasAtLeastOneValue()) continue; - for (LiveInterval::SubRange &S : LI.subranges()) { + for (LiveInterval::SubRange &S : LI.subranges()) repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask); - } + repairOldRegInRange(Begin, End, endIdx, LI, Reg); } } void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { - if (LiveRange *LR = getCachedRegUnit(*Units)) + for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { + if (LiveRange *LR = getCachedRegUnit(*Unit)) if (VNInfo *VNI = LR->getVNInfoAt(Pos)) LR->removeValNo(VNI); } diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index fc2f233f6d687..b4aa0dc326a58 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===// +//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===// // // The LLVM Compiler Infrastructure // @@ -13,19 +13,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalUnion.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SparseBitVector.h" -#include "llvm/Support/Debug.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" -#include <algorithm> +#include <cassert> +#include <cstdlib> using namespace llvm; #define DEBUG_TYPE "regalloc" - // Merge a LiveInterval's segments. Guarantee no overlaps. 
void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { if (Range.empty()) @@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) { LiveRange::const_iterator RegEnd = Range.end(); SegmentIter SegPos = Segments.find(RegPos->start); - for (;;) { + while (true) { assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval"); SegPos.erase(); if (!SegPos.valid()) @@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { CheckedFirstInterference = true; // Quickly skip interference check for empty sets. - if (VirtReg->empty() || LiveUnion->empty()) { + if (LR->empty() || LiveUnion->empty()) { SeenAllInterferences = true; return 0; } - // In most cases, the union will start before VirtReg. - VirtRegI = VirtReg->begin(); + // In most cases, the union will start before LR. + LRI = LR->begin(); LiveUnionI.setMap(LiveUnion->getMap()); - LiveUnionI.find(VirtRegI->start); + LiveUnionI.find(LRI->start); } - LiveInterval::iterator VirtRegEnd = VirtReg->end(); + LiveRange::const_iterator LREnd = LR->end(); LiveInterval *RecentReg = nullptr; while (LiveUnionI.valid()) { - assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg"); + assert(LRI != LREnd && "Reached end of LR"); // Check for overlapping interference. - while (VirtRegI->start < LiveUnionI.stop() && - VirtRegI->end > LiveUnionI.start()) { + while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) { // This is an overlap, record the interfering register. LiveInterval *VReg = LiveUnionI.value(); if (VReg != RecentReg && !isSeenInterference(VReg)) { @@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { } // The iterators are now not overlapping, LiveUnionI has been advanced - // beyond VirtRegI. - assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap"); + // beyond LRI. + assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap"); // Advance the iterator that ends first. - VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start()); - if (VirtRegI == VirtRegEnd) + LRI = LR->advanceTo(LRI, LiveUnionI.start()); + if (LRI == LREnd) break; // Detect overlap, handle above. - if (VirtRegI->start < LiveUnionI.stop()) + if (LRI->start < LiveUnionI.stop()) continue; // Still not overlapping. Catch up LiveUnionI. - LiveUnionI.advanceTo(VirtRegI->start); + LiveUnionI.advanceTo(LRI->start); } SeenAllInterferences = true; return InterferingVRegs.size(); diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index dcc41c1718a62..9f7d7cf548480 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const { OS << "\n"; } -/// Dumps the currently live registers to the debug output. 
-LLVM_DUMP_METHOD void LivePhysRegs::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LivePhysRegs::dump() const { dbgs() << " " << *this; -#endif } +#endif bool LivePhysRegs::available(const MachineRegisterInfo &MRI, unsigned Reg) const { @@ -161,7 +160,9 @@ void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, const MachineFrameInfo &MFI, const TargetRegisterInfo &TRI) { - for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; + ++CSR) LiveRegs.addReg(*CSR); for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveRegs.removeReg(Info.getReg()); @@ -180,7 +181,8 @@ void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { if (MBB.isReturnBlock()) { // The return block has no successors whose live-ins we could merge // below. So instead we add the callee saved registers manually. - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I) addReg(*I); } else { addPristines(*this, MF, MFI, *TRI); diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index 0128376086281..398066bf8903e 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -75,34 +75,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) { LI.createSubRangeFrom(*Alloc, ClassMask, LI); } - LaneBitmask Mask = SubMask; - for (LiveInterval::SubRange &S : LI.subranges()) { - // A Mask for subregs common to the existing subrange and current def. - LaneBitmask Common = S.LaneMask & Mask; - if (Common.none()) - continue; - LiveInterval::SubRange *CommonRange; - // A Mask for subregs covered by the subrange but not the current def. - LaneBitmask RM = S.LaneMask & ~Mask; - if (RM.any()) { - // Split the subrange S into two parts: one covered by the current - // def (CommonRange), and the one not affected by it (updated S). - S.LaneMask = RM; - CommonRange = LI.createSubRangeFrom(*Alloc, Common, S); - } else { - assert(Common == S.LaneMask); - CommonRange = &S; - } + LI.refineSubRanges(*Alloc, SubMask, + [&MO, this](LiveInterval::SubRange &SR) { if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *CommonRange, MO); - Mask &= ~Common; - } - // Create a new SubRange for subregs we did not cover yet. - if (Mask.any()) { - LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask); - if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *NewRange, MO); - } + createDeadDef(*Indexes, *Alloc, SR, MO); + }); } // Create the def in the main liverange. We do not have to do this if @@ -289,8 +266,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, if (UndefOnEntry[BN]) return false; - auto MarkDefined = - [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool { + auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool { for (MachineBasicBlock *S : B.successors()) DefOnEntry[S->getNumber()] = true; DefOnEntry[BN] = true; @@ -311,7 +287,12 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, return MarkDefined(B); SlotIndex Begin, End; std::tie(Begin, End) = Indexes->getMBBRange(&B); - LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End); + // Treat End as not belonging to B. 
+ // If LR has a segment S that starts at the next block, i.e. [End, ...), + // std::upper_bound will return the segment following S. Instead, + // S should be treated as the first segment that does not overlap B. + LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), + End.getPrevSlot()); if (UB != LR.begin()) { LiveRange::Segment &Seg = *std::prev(UB); if (Seg.end > Begin) { diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 7f1c69c0b4a2a..92cca1a54951e 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } LiveInterval &LI = LIS.createEmptyInterval(VReg); + if (Parent && !Parent->isSpillable()) + LI.markNotSpillable(); // Create empty subranges if the OldReg's interval has them. Do not create // the main range here---it will be constructed later after the subranges // have been finalized. @@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) { if (VRM) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } + // FIXME: Getting the interval here actually computes it. + // In theory, this may not be what we want, but in practice + // the createEmptyIntervalFrom API is used when this is not + // the case. Generally speaking we just want to annotate the + // LiveInterval when it gets created but we cannot do that at + // the moment. + if (Parent && !Parent->isSpillable()) + LIS.getInterval(VReg).markNotSpillable(); return VReg; } @@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg) if (VRM) VRM->grow(); - if (Parent && !Parent->isSpillable()) - LIS.getInterval(VReg).markNotSpillable(); - NewRegs.push_back(VReg); } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index 7a51386aa9caa..882de1a3fad96 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -1,4 +1,4 @@ -//===-- LiveRegMatrix.cpp - Track register interference -------------------===// +//===- LiveRegMatrix.cpp - Track register interference --------------------===// // // The LLVM Compiler Infrastructure // @@ -11,15 +11,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveRegMatrix.h" #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Pass.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> using namespace llvm; @@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix", "Live Register Matrix", false, false) -LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID), - UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {} +LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {} void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); @@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, return Result; } -LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, 
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
 unsigned RegUnit) {
 LiveIntervalUnion::Query &Q = Queries[RegUnit];
- Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+ Q.init(UserTag, LR, Matrix[RegUnit]);
 return Q;
 }
 
@@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
 return IK_RegUnit;
 
 // Check the matrix for virtual register interference.
- for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (query(VirtReg, *Units).checkInterference())
- return IK_VirtReg;
+ bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+ [&](unsigned Unit, const LiveRange &LR) {
+ return query(LR, Unit).checkInterference();
+ });
+ if (Interference)
+ return IK_VirtReg;
 
 return IK_Free;
 }
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 0000000000000..dff555f49565e
--- /dev/null
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,126 @@
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveRegUnits set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.reset(U);
+ }
+ }
+}
+
+void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.set(U);
+ }
+ }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (!O->isDef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ removeReg(Reg);
+ } else if (O->isRegMask())
+ removeRegsNotPreserved(O->getRegMask());
+ }
+
+ // Add uses to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ addReg(Reg);
+ }
+}
+
+void LiveRegUnits::accumulateBackward(const MachineInstr &MI) {
+ // Add defs, uses and regmask clobbers to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (!O->isDef() && !O->readsReg())
+ continue;
+ addReg(Reg);
+ } else if (O->isRegMask())
+ addRegsInMask(O->getRegMask());
+ }
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveUnits.
+static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { + for (const auto &LI : MBB.liveins()) + LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask); +} + +static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) { + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addLiveIns(LiveUnits, *Succ); +} + +/// Add pristine registers to the given \p LiveUnits. This function removes +/// actually saved callee save registers when \p InPrologueEpilogue is false. +static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF, + const MachineFrameInfo &MFI, + const TargetRegisterInfo &TRI) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + LiveUnits.removeReg(Info.getReg()); +} + +void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + if (!MBB.isReturnBlock()) + removeSavedRegs(*this, MF, MFI, *TRI); + } + ::addLiveOuts(*this, MBB); +} + +void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + if (&MBB != &MF.front()) + removeSavedRegs(*this, MF, MFI, *TRI); + } + ::addLiveIns(*this, MBB); +} diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 269b990a31493..3568b0294ad9a 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { return nullptr; } -LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << " Alive in blocks: "; for (SparseBitVector<>::iterator I = AliveBlocks.begin(), E = AliveBlocks.end(); I != E; ++I) @@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << "\n #" << i << ": " << *Kills[i]; dbgs() << "\n"; } -#endif } +#endif /// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg. LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) { diff --git a/lib/CodeGen/LowLevelType.cpp b/lib/CodeGen/LowLevelType.cpp index d74b7306e0f43..c4b9068fa905a 100644 --- a/lib/CodeGen/LowLevelType.cpp +++ b/lib/CodeGen/LowLevelType.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===// +//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -18,54 +18,21 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -LLT::LLT(Type &Ty, const DataLayout &DL) { +LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { if (auto VTy = dyn_cast<VectorType>(&Ty)) { - SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); - ElementsOrAddrSpace = VTy->getNumElements(); - Kind = ElementsOrAddrSpace == 1 ? 
Scalar : Vector; + auto NumElements = VTy->getNumElements(); + auto ScalarSizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); + if (NumElements == 1) + return LLT::scalar(ScalarSizeInBits); + return LLT::vector(NumElements, ScalarSizeInBits); } else if (auto PTy = dyn_cast<PointerType>(&Ty)) { - Kind = Pointer; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = PTy->getAddressSpace(); + return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty)); } else if (Ty.isSized()) { // Aggregates are no different from real scalars as far as GlobalISel is // concerned. - Kind = Scalar; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = 1; + auto SizeInBits = DL.getTypeSizeInBits(&Ty); assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; + return LLT::scalar(SizeInBits); } -} - -LLT::LLT(MVT VT) { - if (VT.isVector()) { - SizeInBits = VT.getVectorElementType().getSizeInBits(); - ElementsOrAddrSpace = VT.getVectorNumElements(); - Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; - } else if (VT.isValid()) { - // Aggregates are no different from real scalars as far as GlobalISel is - // concerned. - Kind = Scalar; - SizeInBits = VT.getSizeInBits(); - ElementsOrAddrSpace = 1; - assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; - } -} - -void LLT::print(raw_ostream &OS) const { - if (isVector()) - OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">"; - else if (isPointer()) - OS << "p" << getAddressSpace(); - else if (isValid()) { - assert(isScalar() && "unexpected type"); - OS << "s" << getScalarSizeInBits(); - } else - llvm_unreachable("trying to print an invalid type"); + return LLT(); } diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c8bed0890dd60..cac22af32956e 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -41,8 +41,11 @@ using namespace llvm; PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF, - SourceMgr &SM, const SlotMapping &IRSlots) - : MF(MF), SM(&SM), IRSlots(IRSlots) { + SourceMgr &SM, const SlotMapping &IRSlots, + const Name2RegClassMap &Names2RegClasses, + const Name2RegBankMap &Names2RegBanks) + : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses), + Names2RegBanks(Names2RegBanks) { } VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) { @@ -139,6 +142,7 @@ public: bool parseVirtualRegister(VRegInfo *&Info); bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo); bool parseRegisterFlag(unsigned &Flags); + bool parseRegisterClassOrBank(VRegInfo &RegInfo); bool parseSubRegisterIndex(unsigned &SubReg); bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx); bool parseRegisterOperand(MachineOperand &Dest, @@ -172,6 +176,7 @@ public: bool parseIntrinsicOperand(MachineOperand &Dest); bool parsePredicateOperand(MachineOperand &Dest); bool parseTargetIndexOperand(MachineOperand &Dest); + bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); bool parseMachineOperand(MachineOperand &Dest, Optional<unsigned> &TiedDefIdx); @@ -184,6 +189,7 @@ public: bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags); bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV); bool parseMachinePointerInfo(MachinePointerInfo &Dest); + bool parseOptionalAtomicOrdering(AtomicOrdering &Order); bool 
parseMachineMemoryOperand(MachineMemOperand *&Dest); private: @@ -878,6 +884,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) { } } +bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) { + if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore)) + return error("expected '_', register class, or register bank name"); + StringRef::iterator Loc = Token.location(); + StringRef Name = Token.stringValue(); + + // Was it a register class? + auto RCNameI = PFS.Names2RegClasses.find(Name); + if (RCNameI != PFS.Names2RegClasses.end()) { + lex(); + const TargetRegisterClass &RC = *RCNameI->getValue(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::NORMAL: + RegInfo.Kind = VRegInfo::NORMAL; + if (RegInfo.Explicit && RegInfo.D.RC != &RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + return error(Loc, Twine("conflicting register classes, previously: ") + + Twine(TRI.getRegClassName(RegInfo.D.RC))); + } + RegInfo.D.RC = &RC; + RegInfo.Explicit = true; + return false; + + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + return error(Loc, "register class specification on generic register"); + } + llvm_unreachable("Unexpected register kind"); + } + + // Should be a register bank or a generic register. + const RegisterBank *RegBank = nullptr; + if (Name != "_") { + auto RBNameI = PFS.Names2RegBanks.find(Name); + if (RBNameI == PFS.Names2RegBanks.end()) + return error(Loc, "expected '_', register class, or register bank name"); + RegBank = RBNameI->getValue(); + } + + lex(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC; + if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank) + return error(Loc, "conflicting generic register banks"); + RegInfo.D.RegBank = RegBank; + RegInfo.Explicit = true; + return false; + + case VRegInfo::NORMAL: + return error(Loc, "register bank specification on normal register"); + } + llvm_unreachable("Unexpected register kind"); +} + bool MIParser::parseRegisterFlag(unsigned &Flags) { const unsigned OldFlags = Flags; switch (Token.kind()) { @@ -1004,6 +1070,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (!TargetRegisterInfo::isVirtualRegister(Reg)) return error("subregister index expects a virtual register"); } + if (Token.is(MIToken::colon)) { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return error("register class specification expects a virtual register"); + lex(); + if (parseRegisterClassOrBank(*RegInfo)) + return true; + } MachineRegisterInfo &MRI = MF.getRegInfo(); if ((Flags & RegState::Define) == 0) { if (consumeIfPresent(MIToken::lparen)) { @@ -1598,6 +1671,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { return false; } +bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { + assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + + uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); + while (true) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg; + if (parseNamedRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + // TODO: Report an error if the same register is used more than once. 
+ if (Token.isNot(MIToken::comma)) + break; + lex(); + } + + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateRegMask(Mask); + return false; +} + bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { assert(Token.is(MIToken::kw_liveout)); const auto *TRI = MF.getSubtarget().getRegisterInfo(); @@ -1695,8 +1797,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, Dest = MachineOperand::CreateRegMask(RegMask); lex(); break; - } - LLVM_FALLTHROUGH; + } else + return parseCustomRegisterMaskOperand(Dest); default: // FIXME: Parse the MCSymbol machine operand. return error("expected a machine operand"); @@ -1969,6 +2071,28 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { return false; } +bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) { + Order = AtomicOrdering::NotAtomic; + if (Token.isNot(MIToken::Identifier)) + return false; + + Order = StringSwitch<AtomicOrdering>(Token.stringValue()) + .Case("unordered", AtomicOrdering::Unordered) + .Case("monotonic", AtomicOrdering::Monotonic) + .Case("acquire", AtomicOrdering::Acquire) + .Case("release", AtomicOrdering::Release) + .Case("acq_rel", AtomicOrdering::AcquireRelease) + .Case("seq_cst", AtomicOrdering::SequentiallyConsistent) + .Default(AtomicOrdering::NotAtomic); + + if (Order != AtomicOrdering::NotAtomic) { + lex(); + return false; + } + + return error("expected an atomic scope, ordering or a size integer literal"); +} + bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (expectAndConsume(MIToken::lparen)) return true; @@ -1986,6 +2110,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { Flags |= MachineMemOperand::MOStore; lex(); + // Optional "singlethread" scope. + SynchronizationScope Scope = SynchronizationScope::CrossThread; + if (Token.is(MIToken::Identifier) && Token.stringValue() == "singlethread") { + Scope = SynchronizationScope::SingleThread; + lex(); + } + + // Up to two atomic orderings (cmpxchg provides guarantees on failure). 
+ AtomicOrdering Order, FailureOrder; + if (parseOptionalAtomicOrdering(Order)) + return true; + + if (parseOptionalAtomicOrdering(FailureOrder)) + return true; + if (Token.isNot(MIToken::IntegerLiteral)) return error("expected the size integer literal after memory operation"); uint64_t Size; @@ -2040,8 +2179,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { } if (expectAndConsume(MIToken::rparen)) return true; - Dest = - MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); + Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range, + Scope, Order, FailureOrder); return false; } diff --git a/lib/CodeGen/MIRParser/MIParser.h b/lib/CodeGen/MIRParser/MIParser.h index 93a4d84ba62f6..9b3879cf83772 100644 --- a/lib/CodeGen/MIRParser/MIParser.h +++ b/lib/CodeGen/MIRParser/MIParser.h @@ -45,11 +45,16 @@ struct VRegInfo { unsigned PreferredReg = 0; }; +typedef StringMap<const TargetRegisterClass*> Name2RegClassMap; +typedef StringMap<const RegisterBank*> Name2RegBankMap; + struct PerFunctionMIParsingState { BumpPtrAllocator Allocator; MachineFunction &MF; SourceMgr *SM; const SlotMapping &IRSlots; + const Name2RegClassMap &Names2RegClasses; + const Name2RegBankMap &Names2RegBanks; DenseMap<unsigned, MachineBasicBlock *> MBBSlots; DenseMap<unsigned, VRegInfo*> VRegInfos; @@ -59,7 +64,9 @@ struct PerFunctionMIParsingState { DenseMap<unsigned, unsigned> JumpTableSlots; PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, - const SlotMapping &IRSlots); + const SlotMapping &IRSlots, + const Name2RegClassMap &Names2RegClasses, + const Name2RegBankMap &Names2RegBanks); VRegInfo &getVRegInfo(unsigned VReg); }; diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp index 3dff1147631b3..a2773cccc5dbd 100644 --- a/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/lib/CodeGen/MIRParser/MIRParser.cpp @@ -55,9 +55,9 @@ class MIRParserImpl { StringMap<std::unique_ptr<yaml::MachineFunction>> Functions; SlotMapping IRSlots; /// Maps from register class names to register classes. - StringMap<const TargetRegisterClass *> Names2RegClasses; + Name2RegClassMap Names2RegClasses; /// Maps from register bank names to register banks. - StringMap<const RegisterBank *> Names2RegBanks; + Name2RegBankMap Names2RegBanks; public: MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename, @@ -325,11 +325,15 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { return error(Twine("no machine function information for function '") + MF.getName() + "' in the MIR file"); // TODO: Recreate the machine function. 
+ initNames2RegClasses(MF); + initNames2RegBanks(MF); const yaml::MachineFunction &YamlMF = *It->getValue(); if (YamlMF.Alignment) MF.setAlignment(YamlMF.Alignment); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); + if (YamlMF.NoVRegs) + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); if (YamlMF.Legalized) MF.getProperties().set(MachineFunctionProperties::Property::Legalized); if (YamlMF.RegBankSelected) @@ -338,7 +342,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { if (YamlMF.Selected) MF.getProperties().set(MachineFunctionProperties::Property::Selected); - PerFunctionMIParsingState PFS(MF, SM, IRSlots); + PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses, + Names2RegBanks); if (parseRegisterInfo(PFS, YamlMF)) return true; if (!YamlMF.Constants.empty()) { @@ -362,9 +367,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { } PFS.SM = &SM; - if (MF.empty()) - return error(Twine("machine function '") + Twine(MF.getName()) + - "' requires at least one machine basic block in its body"); // Initialize the frame information after creating all the MBBs so that the // MBB references in the frame information can be resolved. if (initializeFrameInfo(PFS, YamlMF)) @@ -462,17 +464,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS, RegInfo.addLiveIn(Reg, VReg); } - // Parse the callee saved register mask. - BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size()); - if (!YamlMF.CalleeSavedRegisters) - return false; - for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { - unsigned Reg = 0; - if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) - return error(Error, RegSource.SourceRange); - CalleeSavedRegisterMask[Reg] = true; + // Parse the callee saved registers (Registers that will + // be saved for the caller). 
+ if (YamlMF.CalleeSavedRegisters) { + SmallVector<MCPhysReg, 16> CalleeSavedRegisters; + for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + unsigned Reg = 0; + if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) + return error(Error, RegSource.SourceRange); + CalleeSavedRegisters.push_back(Reg); + } + RegInfo.setCalleeSavedRegs(CalleeSavedRegisters); } - RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); + return false; } @@ -505,14 +509,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, } // Compute MachineRegisterInfo::UsedPhysRegMask - if (!YamlMF.CalleeSavedRegisters) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); - } + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); } } } @@ -818,7 +820,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) { const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, StringRef Name) { - initNames2RegClasses(MF); auto RegClassInfo = Names2RegClasses.find(Name); if (RegClassInfo == Names2RegClasses.end()) return nullptr; @@ -827,7 +828,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF, StringRef Name) { - initNames2RegBanks(MF); auto RegBankInfo = Names2RegBanks.find(Name); if (RegBankInfo == Names2RegBanks.end()) return nullptr; diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index db87092177cad..6da174a536666 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -175,6 +175,8 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); + YamlMF.NoVRegs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs); YamlMF.Legalized = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Legalized); YamlMF.RegBankSelected = MF.getProperties().hasProperty( @@ -205,6 +207,25 @@ void MIRPrinter::print(const MachineFunction &MF) { Out << YamlMF; } +static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + assert(RegMask && "Can't print an empty register mask"); + OS << StringRef("CustomRegMask("); + + bool IsRegInRegMaskFound = false; + for (int I = 0, E = TRI->getNumRegs(); I < E; I++) { + // Check whether the register is asserted in regmask. + if (RegMask[I / 32] & (1u << (I % 32))) { + if (IsRegInRegMaskFound) + OS << ','; + printReg(I, OS, TRI); + IsRegInRegMaskFound = true; + } + } + + OS << ')'; +} + void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo, const TargetRegisterInfo *TRI) { @@ -239,20 +260,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, printReg(I->second, LiveIn.VirtualRegister, TRI); MF.LiveIns.push_back(LiveIn); } - // The used physical register mask is printed as an inverted callee saved - // register mask. 
- const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); - if (UsedPhysRegMask.none()) - return; - std::vector<yaml::FlowStringValue> CalleeSavedRegisters; - for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { - if (!UsedPhysRegMask[I]) { + + // Prints the callee saved registers. + if (RegInfo.isUpdatedCSRsInitialized()) { + const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs(); + std::vector<yaml::FlowStringValue> CalleeSavedRegisters; + for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) { yaml::FlowStringValue Reg; - printReg(I, Reg, TRI); + printReg(*I, Reg, TRI); CalleeSavedRegisters.push_back(Reg); } + MF.CalleeSavedRegisters = CalleeSavedRegisters; } - MF.CalleeSavedRegisters = CalleeSavedRegisters; } void MIRPrinter::convert(ModuleSlotTracker &MST, @@ -860,7 +879,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, if (RegMaskInfo != RegisterMaskIds.end()) OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower(); else - llvm_unreachable("Can't print this machine register mask yet."); + printCustomRegMask(Op.getRegMask(), OS, TRI); break; } case MachineOperand::MO_RegisterLiveOut: { @@ -906,6 +925,9 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, << CmpInst::getPredicateName(Pred) << ')'; break; } + case MachineOperand::MO_Placeholder: + OS << "<placeholder>"; + break; } } @@ -926,6 +948,15 @@ void MIPrinter::print(const MachineMemOperand &Op) { assert(Op.isStore() && "Non load machine operand must be a store"); OS << "store "; } + + if (Op.getSynchScope() == SynchronizationScope::SingleThread) + OS << "singlethread "; + + if (Op.getOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getOrdering()) << ' '; + if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getFailureOrdering()) << ' '; + OS << Op.getSize(); if (const Value *Val = Op.getValue()) { OS << (Op.isLoad() ? " from " : " into "); diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 3869f976854d0..06112723497b0 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition())) + while (I != E && (I->isPHI() || I->isPosition() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels // inside the bundle. @@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue())) + while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels / dbg_values // inside the bundle. 
@@ -225,7 +232,7 @@ StringRef MachineBasicBlock::getName() const { if (const BasicBlock *LBB = getBasicBlock()) return LBB->getName(); else - return "(null)"; + return StringRef("", 0); } /// Return a hopefully unique identifier for this block. @@ -417,7 +424,7 @@ void MachineBasicBlock::updateTerminator() { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc DL; // FIXME: this is nowhere + DebugLoc DL = findBranchDebugLoc(); bool B = TII->analyzeBranch(*this, TBB, FBB, Cond); (void) B; assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -485,7 +492,7 @@ void MachineBasicBlock::updateTerminator() { // FIXME: This does not seem like a reasonable pattern to support, but it // has been seen in the wild coming out of degenerate ARM test cases. TII->removeBranch(*this); - + // Finally update the unconditional successor to be reached via a branch if // it would not be reached by fallthrough. if (!isLayoutSuccessor(TBB)) @@ -681,16 +688,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { return std::next(I) == MachineFunction::const_iterator(MBB); } -bool MachineBasicBlock::canFallThrough() { +MachineBasicBlock *MachineBasicBlock::getFallThrough() { MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; // If FallthroughBlock is off the end of the function, it can't fall through. if (Fallthrough == getParent()->end()) - return false; + return nullptr; // If FallthroughBlock isn't a successor, no fallthrough is possible. if (!isSuccessor(&*Fallthrough)) - return false; + return nullptr; // Analyze the branches, if any, at the end of the block. MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -702,25 +709,31 @@ bool MachineBasicBlock::canFallThrough() { // is possible. The isPredicated check is needed because this code can be // called during IfConversion, where an instruction which is normally a // Barrier is predicated and thus no longer an actual control barrier. - return empty() || !back().isBarrier() || TII->isPredicated(back()); + return (empty() || !back().isBarrier() || TII->isPredicated(back())) + ? &*Fallthrough + : nullptr; } // If there is no branch, control always falls through. - if (!TBB) return true; + if (!TBB) return &*Fallthrough; // If there is some explicit branch to the fallthrough block, it can obviously // reach, even though the branch should get folded to fall through implicitly. if (MachineFunction::iterator(TBB) == Fallthrough || MachineFunction::iterator(FBB) == Fallthrough) - return true; + return &*Fallthrough; // If it's an unconditional branch to some block not the fall through, it // doesn't fall through. - if (Cond.empty()) return false; + if (Cond.empty()) return nullptr; // Otherwise, if it is conditional and has no explicit false block, it falls // through. - return FBB == nullptr; + return (FBB == nullptr) ? &*Fallthrough : nullptr; +} + +bool MachineBasicBlock::canFallThrough() { + return getFallThrough() != nullptr; } MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, @@ -1144,6 +1157,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return {}; } +/// Find and return the merged DebugLoc of the branch instructions of the block. +/// Return UnknownLoc if there is none. 
+DebugLoc +MachineBasicBlock::findBranchDebugLoc() { + DebugLoc DL; + auto TI = getFirstTerminator(); + while (TI != end() && !TI->isBranch()) + ++TI; + + if (TI != end()) { + DL = TI->getDebugLoc(); + for (++TI ; TI != end() ; ++TI) + if (TI->isBranch()) + DL = DILocation::getMergedLocation(DL, TI->getDebugLoc()); + } + return DL; +} + /// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 7d5124d30a04e..9c7367b4c7802 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -28,7 +28,6 @@ using namespace llvm; #define DEBUG_TYPE "block-freq" -#ifndef NDEBUG static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG( "view-machine-block-freq-propagation-dags", cl::Hidden, @@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG( "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available."))); +// Similar option above, but used to control BFI display only after MBP pass +cl::opt<GVDAGType> ViewBlockLayoutWithBFI( + "view-block-layout-with-bfi", cl::Hidden, + cl::desc( + "Pop up a window to show a dag displaying MBP layout and associated " + "block frequencies of the CFG."), + cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), + clEnumValN(GVDT_Fraction, "fraction", + "display a graph using the " + "fractional block frequency representation."), + clEnumValN(GVDT_Integer, "integer", + "display a graph using the raw " + "integer fractional block frequency representation."), + clEnumValN(GVDT_Count, "count", + "display a graph using the real " + "profile count if available."))); +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; +// Command line option to specify hot frequency threshold. +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc= extern cl::opt<unsigned> ViewHotFreqPercent; +static GVDAGType getGVDT() { + if (ViewBlockLayoutWithBFI != GVDT_None) + return ViewBlockLayoutWithBFI; + + return ViewMachineBlockFreqPropagationDAG; +} + namespace llvm { template <> struct GraphTraits<MachineBlockFrequencyInfo *> { @@ -80,12 +106,32 @@ template <> struct DOTGraphTraits<MachineBlockFrequencyInfo *> : public MBFIDOTGraphTraitsBase { explicit DOTGraphTraits(bool isSimple = false) - : MBFIDOTGraphTraitsBase(isSimple) {} + : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {} + + const MachineFunction *CurFunc; + DenseMap<const MachineBasicBlock *, int> LayoutOrderMap; std::string getNodeLabel(const MachineBasicBlock *Node, const MachineBlockFrequencyInfo *Graph) { - return MBFIDOTGraphTraitsBase::getNodeLabel( - Node, Graph, ViewMachineBlockFreqPropagationDAG); + + int layout_order = -1; + // Attach additional ordering information if 'isSimple' is false. 
+ if (!isSimple()) { + const MachineFunction *F = Node->getParent(); + if (!CurFunc || F != CurFunc) { + if (CurFunc) + LayoutOrderMap.clear(); + + CurFunc = F; + int O = 0; + for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) { + LayoutOrderMap[&*MBI] = O; + } + } + layout_order = LayoutOrderMap[Node]; + } + return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(), + layout_order); } std::string getNodeAttributes(const MachineBasicBlock *Node, @@ -102,7 +148,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *> }; } // end namespace llvm -#endif INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", "Machine Block Frequency Analysis", true, true) @@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { - MachineBranchProbabilityInfo &MBPI = - getAnalysis<MachineBranchProbabilityInfo>(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); +void MachineBlockFrequencyInfo::calculate( + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI) { if (!MBFI) MBFI.reset(new ImplType); MBFI->calculate(F, MBPI, MLI); -#ifndef NDEBUG if (ViewMachineBlockFreqPropagationDAG != GVDT_None && (ViewBlockFreqFuncName.empty() || F.getName().equals(ViewBlockFreqFuncName))) { - view(); + view("MachineBlockFrequencyDAGS." + F.getName()); } -#endif +} + +bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { + MachineBranchProbabilityInfo &MBPI = + getAnalysis<MachineBranchProbabilityInfo>(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + calculate(F, MBPI, MLI); return false; } @@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); } /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. -void MachineBlockFrequencyInfo::view() const { -// This code is only for debugging. -#ifndef NDEBUG - ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), - "MachineBlockFrequencyDAGs"); -#else - errs() << "MachineBlockFrequencyInfo::view is only available in debug builds " - "on systems with Graphviz or gv!\n"; -#endif // NDEBUG +void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const { + // This code is only for debugging. 
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple); } BlockFrequency diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 40e3840e6b0b3..4cfc128a8c1d6 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -32,14 +32,15 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" @@ -49,6 +50,8 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <functional> +#include <utility> using namespace llvm; #define DEBUG_TYPE "block-placement" @@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias( // Definition: // - Outlining: placement of a basic block outside the chain or hot path. -static cl::opt<bool> OutlineOptionalBranches( - "outline-optional-branches", - cl::desc("Outlining optional branches will place blocks that are optional " - "branches, i.e. branches with a common post dominator, outside " - "the hot path or chain"), - cl::init(false), cl::Hidden); - -static cl::opt<unsigned> OutlineOptionalThreshold( - "outline-optional-threshold", - cl::desc("Don't outline optional branches that are a single block with an " - "instruction count below this threshold"), - cl::init(4), cl::Hidden); - static cl::opt<unsigned> LoopToColdBlockRatio( "loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " @@ -136,20 +126,47 @@ BranchFoldPlacement("branch-fold-placement", cl::init(true), cl::Hidden); // Heuristic for tail duplication. -static cl::opt<unsigned> TailDuplicatePlacementThreshold( +static cl::opt<unsigned> TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden); +// Heuristic for tail duplication. +static cl::opt<unsigned> TailDupPlacementPenalty( + "tail-dup-placement-penalty", + cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " + "Copying can increase fallthrough, but it also increases icache " + "pressure. This parameter controls the penalty to account for that. " + "Percent as integer."), + cl::init(2), + cl::Hidden); + +// Heuristic for triangle chains. +static cl::opt<unsigned> TriangleChainCount( + "triangle-chain-count", + cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " + "triangle tail duplication heuristic to kick in. 0 to disable."), + cl::init(2), + cl::Hidden); + extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; +// Internal option used to control BFI display only after MBP pass. 
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
 namespace {
 class BlockChain;
 /// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
 }
 
 namespace {
@@ -193,12 +210,15 @@ public:
 /// \brief Iterator over blocks within the chain.
 typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+ typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
 
 /// \brief Beginning of blocks within the chain.
 iterator begin() { return Blocks.begin(); }
+ const_iterator begin() const { return Blocks.begin(); }
 
 /// \brief End of blocks within the chain.
 iterator end() { return Blocks.end(); }
+ const_iterator end() const { return Blocks.end(); }
 
 bool remove(MachineBasicBlock* BB) {
 for(iterator i = begin(); i != end(); ++i) {
@@ -264,12 +284,28 @@ public:
 namespace {
 class MachineBlockPlacement : public MachineFunctionPass {
 /// \brief A typedef for a block filter set.
- typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+ typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+ /// Pair struct containing basic block and taildup profitability
+ struct BlockAndTailDupResult {
+ MachineBasicBlock *BB;
+ bool ShouldTailDup;
+ };
+
+ /// Triple struct containing edge weight and the edge.
+ struct WeightedEdge {
+ BlockFrequency Weight;
+ MachineBasicBlock *Src;
+ MachineBasicBlock *Dest;
+ };
 
 /// \brief work lists of blocks that are ready to be laid out
 SmallVector<MachineBasicBlock *, 16> BlockWorkList;
 SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
 
+ /// Edges that have already been computed as optimal.
+ DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
 /// \brief Machine Function
 MachineFunction *F;
 
@@ -294,7 +330,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
 const TargetLoweringBase *TLI;
 
 /// \brief A handle to the post dominator tree.
- MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
 
 /// \brief Duplicator used to duplicate tails during placement.
 ///
@@ -303,10 +339,6 @@ class MachineBlockPlacement : public MachineFunctionPass {
 /// must be done inline.
 TailDuplicator TailDup;
 
- /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
- /// all terminators of the MachineFunction.
- SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
-
 /// \brief Allocator and owner of BlockChain structures.
 ///
 /// We build BlockChains lazily while processing the loop structure of
@@ -322,7 +354,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
 /// BlockChain it participates in, if any. We use it to, among other things,
 /// allow implicitly defining edges between chains as the existing edges
 /// between basic blocks.
- DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; + DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain; #ifndef NDEBUG /// The set of basic blocks that have terminators that cannot be fully @@ -334,75 +366,107 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. - void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markChainSuccessors( + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); /// Decrease the UnscheduledPredecessors count for a single block, and /// if the count goes to 0, add them to the appropriate work list. void markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *BB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter = nullptr); - BranchProbability - collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter, - SmallVector<MachineBasicBlock *, 4> &Successors); - bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &Chain, - const BlockFilterSet *BlockFilter, - BranchProbability SuccProb, - BranchProbability HotProb); + collectViableSuccessors( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector<MachineBasicBlock *, 4> &Successors); + bool shouldPredBlockBeOutlined( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter, + BranchProbability SuccProb, BranchProbability HotProb); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, - MachineBasicBlock *LoopHeaderBB, + const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt); - bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, - const BlockChain &Chain, - BlockFilterSet *BlockFilter, - MachineFunction::iterator &PrevUnplacedBlockIt, - bool &DuplicatedToPred); - bool - hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &SuccChain, BranchProbability SuccProb, - BranchProbability RealSuccProb, BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock * - selectBestCandidateBlock(BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &WorkList); - MachineBasicBlock * - getFirstUnplacedBlock(const BlockChain &PlacedChain, - MachineFunction::iterator &PrevUnplacedBlockIt, - const BlockFilterSet *BlockFilter); + bool maybeTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *LPred, + BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &DuplicatedToPred); + bool hasBetterLayoutPredecessor( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet 
*BlockFilter); + MachineBasicBlock *selectBestCandidateBlock( + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList); + MachineBasicBlock *getFirstUnplacedBlock( + const BlockChain &PlacedChain, + MachineFunction::iterator &PrevUnplacedBlockIt, + const BlockFilterSet *BlockFilter); /// \brief Add a basic block to the work list if it is appropriate. /// /// If the optional parameter BlockFilter is provided, only MBB /// present in the set will be added to the worklist. If nullptr /// is provided, no filtering occurs. - void fillWorkLists(MachineBasicBlock *MBB, + void fillWorkLists(const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter); - void buildChain(MachineBasicBlock *BB, BlockChain &Chain, + void buildChain(const MachineBasicBlock *BB, BlockChain &Chain, BlockFilterSet *BlockFilter = nullptr); - MachineBasicBlock *findBestLoopTop(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - BlockFilterSet collectLoopBlockSet(MachineLoop &L); - void buildLoopChains(MachineLoop &L); - void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, - const BlockFilterSet &LoopBlockSet); - void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - void collectMustExecuteBBs(); + MachineBasicBlock *findBestLoopTop( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(const MachineLoop &L); + void buildLoopChains(const MachineLoop &L); + void rotateLoop( + BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile( + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Returns true if a block should be tail-duplicated to increase fallthrough + /// opportunities. + bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Check the edge frequencies to see if tail duplication will increase + /// fallthroughs. + bool isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability AdjustedSumProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Check for a trellis layout. + bool isTrellis(const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Get the best successor given a trellis layout. + BlockAndTailDupResult getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + /// Get the best pair of non-conflicting edges. + static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges); + /// Returns true if a block can tail duplicate into all unplaced + /// predecessors. Filters based on loop. 
+ bool canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. + void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -415,7 +479,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineBlockFrequencyInfo>(); - AU.addRequired<MachineDominatorTree>(); + if (TailDupPlacement) + AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -429,7 +494,7 @@ INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", "Branch Probability Basic Block Placement", false, false) @@ -438,7 +503,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", /// \brief Helper to print the name of a MBB. /// /// Only used by debug logging. -static std::string getBlockName(MachineBasicBlock *BB) { +static std::string getBlockName(const MachineBasicBlock *BB) { std::string Result; raw_string_ostream OS(Result); OS << "BB#" << BB->getNumber(); @@ -455,7 +520,7 @@ static std::string getBlockName(MachineBasicBlock *BB) { /// having one fewer active predecessor. It also adds any successors of this /// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( - BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. @@ -471,8 +536,8 @@ void MachineBlockPlacement::markChainSuccessors( /// and was duplicated into the chain end, we need to redo markBlockSuccessors /// for just that block. void MachineBlockPlacement::markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter) { + const BlockChain &Chain, const MachineBasicBlock *MBB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Add any successors for which this is the only un-placed in-loop // predecessor to the worklist as a viable candidate for CFG-neutral // placement. No subsequent placement of this block will violate the CFG @@ -504,7 +569,8 @@ void MachineBlockPlacement::markBlockSuccessors( /// the total branch probability of edges from \p BB to those /// blocks. BranchProbability MachineBlockPlacement::collectViableSuccessors( - MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, SmallVector<MachineBasicBlock *, 4> &Successors) { // Adjust edge probabilities by excluding edges pointing to blocks that is // either not in BlockFilter or is already in the current chain. 
Consider the @@ -561,46 +627,573 @@ getAdjustedProbability(BranchProbability OrigProb, return SuccProb; } -/// When the option OutlineOptionalBranches is on, this method -/// checks if the fallthrough candidate block \p Succ (of block -/// \p BB) also has other unscheduled predecessor blocks which -/// are also successors of \p BB (forming triangular shape CFG). -/// If none of such predecessors are small, it returns true. -/// The caller can choose to select \p Succ as the layout successors -/// so that \p Succ's predecessors (optional branches) can be -/// outlined. -/// FIXME: fold this with more general layout cost analysis. -bool MachineBlockPlacement::shouldPredBlockBeOutlined( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, - const BlockFilterSet *BlockFilter, BranchProbability SuccProb, - BranchProbability HotProb) { - if (!OutlineOptionalBranches) +/// Check if \p BB has exactly the successors in \p Successors. +static bool +hasSameSuccessors(MachineBasicBlock &BB, + SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { + if (BB.succ_size() != Successors.size()) return false; - // If we outline optional branches, look whether Succ is unavoidable, i.e. - // dominates all terminators of the MachineFunction. If it does, other - // successors must be optional. Don't do this for cold branches. - if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { - for (MachineBasicBlock *Pred : Succ->predecessors()) { - // Check whether there is an unplaced optional branch. - if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + // We don't want to count self-loops + if (Successors.count(&BB)) + return false; + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// Check if a block should be tail duplicated to increase fallthrough +/// opportunities. +/// \p BB Block to check. +bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + bool IsSimple = TailDup.isSimpleBB(BB); + + if (BB->succ_size() == 1) + return false; + return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Compare 2 BlockFrequency's with a small penalty for \p A. +/// In order to be conservative, we apply a X% penalty to account for +/// increased icache pressure and static heuristics. For small frequencies +/// we use only the numerators to improve accuracy. For simplicity, we assume the +/// penalty is less than 100% +/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. +static bool greaterWithBias(BlockFrequency A, BlockFrequency B, + uint64_t EntryFreq) { + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + BlockFrequency Gain = A - B; + return (Gain / ThresholdProb).getFrequency() >= EntryFreq; +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. It only makes sense to call this function when +/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is +/// always locally profitable if we would have picked \p Succ without +/// considering duplication. 
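// ---- Editor's aside (illustration only, not part of this patch) ----
// The greaterWithBias() gate above reduces to: prefer layout A over layout B
// only when the gain (A - B) is at least TailDupPlacementPenalty percent of
// the function's entry frequency. A standalone restatement with plain
// unsigned integers standing in for BlockFrequency:
//
//   static bool greaterWithBiasSketch(uint64_t A, uint64_t B,
//                                     uint64_t EntryFreq,
//                                     uint64_t PenaltyPercent) {
//     if (A < B)
//       return false;              // guard the unsigned subtraction
//     uint64_t Gain = A - B;
//     // (Gain / (PenaltyPercent/100)) >= EntryFreq
//     //   <=>  Gain * 100 >= EntryFreq * PenaltyPercent
//     return Gain * 100 >= EntryFreq * PenaltyPercent;
//   }
//
// So with a penalty of, say, 2% and an entry frequency of 1000, duplication
// must buy at least 20 units of fallthrough frequency before the comparison
// below reports it as profitable.
// ---------------------------------------------------------------------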
+bool MachineBlockPlacement::isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability QProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // We need to do a probability calculation to make sure this is profitable. + // First: does succ have a successor that post-dominates? This affects the + // calculation. The 2 relevant cases are: + // BB BB + // | \Qout | \Qout + // P| C |P C + // = C' = C' + // | /Qin | /Qin + // | / | / + // Succ Succ + // / \ | \ V + // U/ =V |U \ + // / \ = D + // D E | / + // | / + // |/ + // PDom + // '=' : Branch taken for that CFG edge + // In the second case, Placing Succ while duplicating it into C prevents the + // fallthrough of Succ into either D or PDom, because they now have C as an + // unplaced predecessor + + // Start by figuring out which case we fall into + MachineBasicBlock *PDom = nullptr; + SmallVector<MachineBasicBlock *, 4> SuccSuccs; + // Only scan the relevant successors + auto AdjustedSuccSumProb = + collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs); + BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ); + auto BBFreq = MBFI->getBlockFreq(BB); + auto SuccFreq = MBFI->getBlockFreq(Succ); + BlockFrequency P = BBFreq * PProb; + BlockFrequency Qout = BBFreq * QProb; + uint64_t EntryFreq = MBFI->getEntryFreq(); + // If there are no more successors, it is profitable to copy, as it strictly + // increases fallthrough. + if (SuccSuccs.size() == 0) + return greaterWithBias(P, Qout, EntryFreq); + + auto BestSuccSucc = BranchProbability::getZero(); + // Find the PDom or the best Succ if no PDom exists. + for (MachineBasicBlock *SuccSucc : SuccSuccs) { + auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); + if (Prob > BestSuccSucc) + BestSuccSucc = Prob; + if (PDom == nullptr) + if (MPDT->dominates(SuccSucc, Succ)) { + PDom = SuccSucc; + break; + } + } + // For the comparisons, we need to know Succ's best incoming edge that isn't + // from BB. + auto SuccBestPred = BlockFrequency(0); + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + if (SuccPred == Succ || SuccPred == BB + || BlockToChain[SuccPred] == &Chain + || (BlockFilter && !BlockFilter->count(SuccPred))) + continue; + auto Freq = MBFI->getBlockFreq(SuccPred) + * MBPI->getEdgeProbability(SuccPred, Succ); + if (Freq > SuccBestPred) + SuccBestPred = Freq; + } + // Qin is Succ's best unplaced incoming edge that isn't BB + BlockFrequency Qin = SuccBestPred; + // If it doesn't have a post-dominating successor, here is the calculation: + // BB BB + // | \Qout | \ + // P| C | = + // = C' | C + // | /Qin | | + // | / | C' (+Succ) + // Succ Succ /| + // / \ | \/ | + // U/ =V | == | + // / \ | / \| + // D E D E + // '=' : Branch taken for that CFG edge + // Cost in the first case is: P + V + // For this calculation, we always assume P > Qout. If Qout > P + // The result of this function will be ignored at the caller. 
+ // Let F = SuccFreq - Qin + // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V + + if (PDom == nullptr || !Succ->isSuccessor(PDom)) { + BranchProbability UProb = BestSuccSucc; + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency F = SuccFreq - Qin; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency QinU = std::min(Qin, F) * UProb; + BlockFrequency BaseCost = P + V; + BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; + return greaterWithBias(BaseCost, DupCost, EntryFreq); + } + BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency U = SuccFreq * UProb; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency F = SuccFreq - Qin; + // If there is a post-dominating successor, here is the calculation: + // BB BB BB BB + // | \Qout | \ | \Qout | \ + // |P C | = |P C | = + // = C' |P C = C' |P C + // | /Qin | | | /Qin | | + // | / | C' (+Succ) | / | C' (+Succ) + // Succ Succ /| Succ Succ /| + // | \ V | \/ | | \ V | \/ | + // |U \ |U /\ =? |U = |U /\ | + // = D = = =?| | D | = =| + // | / |/ D | / |/ D + // | / | / | = | / + // |/ | / |/ | = + // Dom Dom Dom Dom + // '=' : Branch taken for that CFG edge + // The cost for taken branches in the first case is P + U + // Let F = SuccFreq - Qin + // The cost in the second case (assuming independence), given the layout: + // BB, Succ, (C+Succ), D, Dom or the layout: + // BB, Succ, D, Dom, (C+Succ) + // is Qout + max(F, Qin) * U + min(F, Qin) + // compare P + U vs Qout + P * U + Qin. + // + // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. + // + // For the 3rd case, the cost is P + 2 * V + // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V + // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V + if (UProb > AdjustedSuccSumProb / 2 && + !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, + Chain, BlockFilter)) + // Cases 3 & 4 + return greaterWithBias( + (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), + EntryFreq); + // Cases 1 & 2 + return greaterWithBias((P + U), + (Qout + std::min(Qin, F) * AdjustedSuccSumProb + + std::max(Qin, F) * UProb), + EntryFreq); +} + +/// Check for a trellis layout. \p BB is the upper part of a trellis if its +/// successors form the lower part of a trellis. A successor set S forms the +/// lower part of a trellis if all of the predecessors of S are either in S or +/// have all of S as successors. We ignore trellises where BB doesn't have 2 +/// successors because for fewer than 2, it's trivial, and for 3 or greater they +/// are very uncommon and complex to compute optimally. Allowing edges within S +/// is not strictly a trellis, but the same algorithm works, so we allow it. +bool MachineBlockPlacement::isTrellis( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // Technically BB could form a trellis with branching factor higher than 2. + // But that's extremely uncommon. + if (BB->succ_size() != 2 || ViableSuccs.size() != 2) + return false; + + SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(), + BB->succ_end()); + // To avoid reviewing the same predecessors twice. 
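  // ---- Editor's aside (illustration only, not part of this patch) ----
  // The 2-successor trellis shape being detected by this function:
  //
  //     BB   C        BB and every other predecessor C of {S1, S2}
  //      |\ /|        branch to both S1 and S2 (the 'X' is the pair of
  //      | X |        crossing edges), so at most two of the four edges
  //      |/ \|        can become fallthroughs, and those two must leave
  //     S1   S2       different source blocks.
  // --------------------------------------------------------------------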
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds; + + for (MachineBasicBlock *Succ : ViableSuccs) { + int PredCount = 0; + for (auto SuccPred : Succ->predecessors()) { + // Allow triangle successors, but don't count them. + if (Successors.count(SuccPred)) { + // Make sure that it is actually a triangle. + for (MachineBasicBlock *CheckSucc : SuccPred->successors()) + if (!Successors.count(CheckSucc)) + return false; continue; - // Check whether the optional branch has exactly one BB. - if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) + } + const BlockChain *PredChain = BlockToChain[SuccPred]; + if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) || + PredChain == &Chain || PredChain == BlockToChain[Succ]) continue; - // Check whether the optional branch is small. - if (Pred->size() < OutlineOptionalThreshold) + ++PredCount; + // Perform the successor check only once. + if (!SeenPreds.insert(SuccPred).second) + continue; + if (!hasSameSuccessors(*SuccPred, Successors)) return false; } - return true; - } else + // If one of the successors has only BB as a predecessor, it is not a + // trellis. + if (PredCount < 1) + return false; + } + return true; +} + +/// Pick the highest total weight pair of edges that can both be laid out. +/// The edges in \p Edges[0] are assumed to have a different destination than +/// the edges in \p Edges[1]. Simple counting shows that the best pair is either +/// the individual highest weight edges to the 2 different destinations, or in +/// case of a conflict, one of them should be replaced with a 2nd best edge. +std::pair<MachineBlockPlacement::WeightedEdge, + MachineBlockPlacement::WeightedEdge> +MachineBlockPlacement::getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> + Edges) { + // Sort the edges, and then for each successor, find the best incoming + // predecessor. If the best incoming predecessors aren't the same, + // then that is clearly the best layout. If there is a conflict, one of the + // successors will have to fallthrough from the second best predecessor. We + // compare which combination is better overall. + + // Sort for highest frequency. + auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; }; + + std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp); + std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp); + auto BestA = Edges[0].begin(); + auto BestB = Edges[1].begin(); + // Arrange for the correct answer to be in BestA and BestB + // If the 2 best edges don't conflict, the answer is already there. + if (BestA->Src == BestB->Src) { + // Compare the total fallthrough of (Best + Second Best) for both pairs + auto SecondBestA = std::next(BestA); + auto SecondBestB = std::next(BestB); + BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight; + BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight; + if (BestAScore < BestBScore) + BestA = SecondBestA; + else + BestB = SecondBestB; + } + // Arrange for the BB edge to be in BestA if it exists. + if (BestB->Src == BB) + std::swap(BestA, BestB); + return std::make_pair(*BestA, *BestB); +} + +/// Get the best successor from \p BB based on \p BB being part of a trellis. +/// We only handle trellises with 2 successors, so the algorithm is +/// straightforward: Find the best pair of edges that don't conflict. We find +/// the best incoming edge for each successor in the trellis. 
If those conflict, +/// we consider which of them should be replaced with the second best. +/// Upon return the two best edges will be in \p BestEdges. If one of the edges +/// comes from \p BB, it will be in \p BestEdges[0] +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + + BlockAndTailDupResult Result = {nullptr, false}; + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + + // We assume size 2 because it's common. For general n, we would have to do + // the Hungarian algorithm, but it's not worth the complexity because more + // than 2 successors is fairly uncommon, and a trellis even more so. + if (Successors.size() != 2 || ViableSuccs.size() != 2) + return Result; + + // Collect the edge frequencies of all edges that form the trellis. + SmallVector<WeightedEdge, 8> Edges[2]; + int SuccIndex = 0; + for (auto Succ : ViableSuccs) { + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + // Skip any placed predecessors that are not BB + if (SuccPred != BB) + if ((BlockFilter && !BlockFilter->count(SuccPred)) || + BlockToChain[SuccPred] == &Chain || + BlockToChain[SuccPred] == BlockToChain[Succ]) + continue; + BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) * + MBPI->getEdgeProbability(SuccPred, Succ); + Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ}); + } + ++SuccIndex; + } + + // Pick the best combination of 2 edges from all the edges in the trellis. + WeightedEdge BestA, BestB; + std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); + + if (BestA.Src != BB) { + // If we have a trellis, and BB doesn't have the best fallthrough edges, + // we shouldn't choose any successor. We've already looked and there's a + // better fallthrough edge for all the successors. + DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n"); + return Result; + } + + // Did we pick the triangle edge? If tail-duplication is profitable, do + // that instead. Otherwise merge the triangle edge now while we know it is + // optimal. + if (BestA.Dest == BestB.Src) { + // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2 + // would be better. + MachineBasicBlock *Succ1 = BestA.Dest; + MachineBasicBlock *Succ2 = BestB.Dest; + // Check to see if tail-duplication would be profitable. + if (TailDupPlacement && shouldTailDuplicate(Succ2) && + canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && + isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), + Chain, BlockFilter)) { + DEBUG(BranchProbability Succ2Prob = getAdjustedProbability( + MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(Succ2) + << ", probability: " << Succ2Prob << " (Tail Duplicate)\n"); + Result.BB = Succ2; + Result.ShouldTailDup = true; + return Result; + } + } + // We have already computed the optimal edge for the other side of the + // trellis. 
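  // ---- Editor's aside (worked example, not part of this patch) ----
  // How getBestNonConflictingEdges() above resolves a conflict, with made-up
  // frequencies. Suppose the trellis successors are S1 and S2 and the
  // candidate edges are:
  //   into S1:  A->S1 = 100,  B->S1 = 90
  //   into S2:  A->S2 =  80,  B->S2 = 30
  // The individually best edges are A->S1 and A->S2, but they share the
  // source A, so only one of them can be a fallthrough. The two resolutions
  // are scored as "best kept" plus "the other side's second best":
  //   keep A->S1:  100 + 30 (B->S2) = 130
  //   keep A->S2:   80 + 90 (B->S1) = 170
  // so the pair {B->S1, A->S2} is returned; the caller then arranges for the
  // edge that leaves BB, if there is one, to be in BestA.
  // -----------------------------------------------------------------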
+ ComputedEdges[BestB.Src] = { BestB.Dest, false }; + + auto TrellisSucc = BestA.Dest; + DEBUG(BranchProbability SuccProb = getAdjustedProbability( + MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(TrellisSucc) + << ", probability: " << SuccProb << " (Trellis)\n"); + Result.BB = TrellisSucc; + return Result; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. +bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + if (!shouldTailDuplicate(Succ)) return false; + + // For CFG checking. + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors can be + // tail-duplicated into. + // Skip any blocks that are already placed or not in this loop. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) + // This will result in a trellis after tail duplication, so we don't + // need to copy Succ into this predecessor. In the presence + // of a trellis tail duplication can continue to be profitable. + // For example: + // A A + // |\ |\ + // | \ | \ + // | C | C+BB + // | / | | + // |/ | | + // BB => BB | + // |\ |\/| + // | \ |/\| + // | D | D + // | / | / + // |/ |/ + // Succ Succ + // + // After BB was duplicated into C, the layout looks like the one on the + // right. BB and C now have the same successors. When considering + // whether Succ can be duplicated into all its unplaced predecessors, we + // ignore C. + // We can do this because C already has a profitable fallthrough, namely + // D. TODO(iteratee): ignore sufficiently cold predecessors for + // duplication and for this test. + // + // This allows trellises to be laid out in 2 separate chains + // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic + // because it allows the creation of 2 fallthrough paths with links + // between them, and we correctly identify the best layout for these + // CFGs. We want to extend trellises that the user created in addition + // to trellises created by tail-duplication, so we just look for the + // CFG. + continue; + return false; + } + } + return true; +} + +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +/// longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +/// U-shaped distribution. +/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +/// If the branches in a chain are likely to be from the same side of the +/// distribution as their predecessor, but are independent at runtime, this +/// transformation is profitable. (Because the cost of being wrong is a small +/// fixed cost, unlike the standard triangle layout where the cost of being +/// wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. 
If the probability that a previous +/// branch was taken positively influences whether the next branch will be +/// taken +/// We believe that 2 and 3 are common enough to justify the small margin in 1. +void MachineBlockPlacement::precomputeTriangleChains() { + struct TriangleChain { + std::vector<MachineBasicBlock *> Edges; + TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst) + : Edges({src, dst}) {} + + void append(MachineBasicBlock *dst) { + assert(getKey()->isSuccessor(dst) && + "Attempting to append a block that is not a successor."); + Edges.push_back(dst); + } + + unsigned count() const { return Edges.size() - 1; } + + MachineBasicBlock *getKey() const { + return Edges.back(); + } + }; + + if (TriangleChainCount == 0) + return; + + DEBUG(dbgs() << "Pre-computing triangle chains.\n"); + // Map from last block to the chain that contains it. This allows us to extend + // chains as we find new triangles. + DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap; + for (MachineBasicBlock &BB : *F) { + // If BB doesn't have 2 successors, it doesn't start a triangle. + if (BB.succ_size() != 2) + continue; + MachineBasicBlock *PDom = nullptr; + for (MachineBasicBlock *Succ : BB.successors()) { + if (!MPDT->dominates(Succ, &BB)) + continue; + PDom = Succ; + break; + } + // If BB doesn't have a post-dominating successor, it doesn't form a + // triangle. + if (PDom == nullptr) + continue; + // If PDom has a hint that it is low probability, skip this triangle. + if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100)) + continue; + // If PDom isn't eligible for duplication, this isn't the kind of triangle + // we're looking for. + if (!shouldTailDuplicate(PDom)) + continue; + bool CanTailDuplicate = true; + // If PDom can't tail-duplicate into it's non-BB predecessors, then this + // isn't the kind of triangle we're looking for. + for (MachineBasicBlock* Pred : PDom->predecessors()) { + if (Pred == &BB) + continue; + if (!TailDup.canTailDuplicate(PDom, Pred)) { + CanTailDuplicate = false; + break; + } + } + // If we can't tail-duplicate PDom to its predecessors, then skip this + // triangle. + if (!CanTailDuplicate) + continue; + + // Now we have an interesting triangle. Insert it if it's not part of an + // existing chain + // Note: This cannot be replaced with a call insert() or emplace() because + // the find key is BB, but the insert/emplace key is PDom. + auto Found = TriangleChainMap.find(&BB); + // If it is, remove the chain from the map, grow it, and put it back in the + // map with the end as the new key. + if (Found != TriangleChainMap.end()) { + TriangleChain Chain = std::move(Found->second); + TriangleChainMap.erase(Found); + Chain.append(PDom); + TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain))); + } else { + auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom); + assert(InsertResult.second && "Block seen twice."); + (void)InsertResult; + } + } + + // Iterating over a DenseMap is safe here, because the only thing in the body + // of the loop is inserting into another DenseMap (ComputedEdges). + // ComputedEdges is never iterated, so this doesn't lead to non-determinism. + for (auto &ChainPair : TriangleChainMap) { + TriangleChain &Chain = ChainPair.second; + // Benchmarking has shown that due to branch correlation duplicating 2 or + // more triangles is profitable, despite the calculations assuming + // independence. 
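    // ---- Editor's aside (illustration only, not part of this patch) ----
    // Chain.count() is Edges.size() - 1, i.e. the number of stacked
    // triangles. For example, if A's post-dominating successor is B and B's
    // is C, the chain's Edges are {A, B, C} and count() == 2, so the check
    // below keeps it when TriangleChainCount is 2 but would drop a lone
    // triangle (count() == 1).
    // --------------------------------------------------------------------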
+ if (Chain.count() < TriangleChainCount) + continue; + MachineBasicBlock *dst = Chain.Edges.back(); + Chain.Edges.pop_back(); + for (MachineBasicBlock *src : reverse(Chain.Edges)) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed based on triangles.\n"); + + auto InsertResult = ComputedEdges.insert({src, {dst, true}}); + assert(InsertResult.second && "Block seen twice."); + (void)InsertResult; + + dst = src; + } + } } // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( - MachineBasicBlock *BB) { + const MachineBasicBlock *BB) { if (!BB->getParent()->getFunction()->getEntryCount()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { @@ -609,11 +1202,11 @@ static BranchProbability getLayoutSuccessorProbThreshold( if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { /* See case 1 below for the cost analysis. For BB->Succ to * be taken with smaller cost, the following needs to hold: - * Prob(BB->Succ) > 2* Prob(BB->Pred) - * So the threshold T - * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, - * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified - * branch bias, we have + * Prob(BB->Succ) > 2 * Prob(BB->Pred) + * So the threshold T in the calculation below + * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) + * So T / (1 - T) = 2, Yielding T = 2/3 + * Also adding user specified branch bias, we have * T = (2/3)*(ProfileLikelyProb/50) * = (2*ProfileLikelyProb)/150) */ @@ -625,10 +1218,17 @@ static BranchProbability getLayoutSuccessorProbThreshold( /// Checks to see if the layout candidate block \p Succ has a better layout /// predecessor than \c BB. If yes, returns true. +/// \p SuccProb: The probability adjusted for only remaining blocks. +/// Only used for logging +/// \p RealSuccProb: The un-adjusted probability. +/// \p Chain: The chain that BB belongs to and Succ is being considered for. +/// \p BlockFilter: if non-null, the set of blocks that make up the loop being +/// considered bool MachineBlockPlacement::hasBetterLayoutPredecessor( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, - BranchProbability SuccProb, BranchProbability RealSuccProb, - BlockChain &Chain, const BlockFilterSet *BlockFilter) { + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { // There isn't a better layout when there are no unscheduled predecessors. if (SuccChain.UnscheduledPredecessors == 0) @@ -734,11 +1334,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( // | Pred----| | S1---- // | | | | // --(S1 or S2) ---Pred-- + // | + // S2 // // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2) // + min(freq(Pred->S1), freq(Pred->S2)) // Non-topo-order cost: - // In the worst case, S2 will not get laid out after Pred. // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2). // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2)) // is 0. 
Then the non topo layout is better when @@ -756,13 +1357,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( for (MachineBasicBlock *Pred : Succ->predecessors()) { if (Pred == Succ || BlockToChain[Pred] == &SuccChain || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + BlockToChain[Pred] == &Chain || + // This check is redundant except for look ahead. This function is + // called for lookahead by isProfitableToTailDup when BB hasn't been + // placed yet. + (Pred == BB)) continue; // Do backward checking. // For all cases above, we need a backward checking to filter out edges that - // are not 'strongly' biased. With profile data available, the check is - // mostly redundant for case 2 (when threshold prob is set at 50%) unless S - // has more than two successors. + // are not 'strongly' biased. // BB Pred // \ / // Succ @@ -798,14 +1401,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( /// breaking CFG structure, but cave and break such structures in the case of /// very hot successor edges. /// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * -MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter) { +/// \returns The best successor block found, or null if none are viable, along +/// with a boolean indicating if tail duplication is necessary. +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - MachineBasicBlock *BestSucc = nullptr; + BlockAndTailDupResult BestSucc = { nullptr, false }; auto BestProb = BranchProbability::getZero(); SmallVector<MachineBasicBlock *, 4> Successors; @@ -813,22 +1417,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, collectViableSuccessors(BB, Chain, BlockFilter, Successors); DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); + + // if we already precomputed the best successor for BB, return that if still + // applicable. + auto FoundEdge = ComputedEdges.find(BB); + if (FoundEdge != ComputedEdges.end()) { + MachineBasicBlock *Succ = FoundEdge->second.BB; + ComputedEdges.erase(FoundEdge); + BlockChain *SuccChain = BlockToChain[Succ]; + if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && + SuccChain != &Chain && Succ == *SuccChain->begin()) + return FoundEdge->second; + } + + // if BB is part of a trellis, Use the trellis to determine the optimal + // fallthrough edges + if (isTrellis(BB, Successors, Chain, BlockFilter)) + return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain, + BlockFilter); + + // For blocks with CFG violations, we may be able to lay them out anyway with + // tail-duplication. We keep this vector so we can perform the probability + // calculations the minimum number of times. + SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> + DupCandidates; for (MachineBasicBlock *Succ : Successors) { auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); - // This heuristic is off by default. 
- if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, - HotProb)) - return Succ; - BlockChain &SuccChain = *BlockToChain[Succ]; // Skip the edge \c BB->Succ if block \c Succ has a better layout // predecessor that yields lower global cost. if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, - Chain, BlockFilter)) + Chain, BlockFilter)) { + // If tail duplication would make Succ profitable, place it. + if (TailDupPlacement && shouldTailDuplicate(Succ)) + DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); continue; + } DEBUG( dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " @@ -836,17 +1463,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) { + if (BestSucc.BB && BestProb >= SuccProb) { DEBUG(dbgs() << " Not the best candidate, continuing\n"); continue; } DEBUG(dbgs() << " Setting it as best candidate\n"); - BestSucc = Succ; + BestSucc.BB = Succ; BestProb = SuccProb; } - if (BestSucc) - DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n"); + // Handle the tail duplication candidates in order of decreasing probability. + // Stop at the first one that is profitable. Also stop if they are less + // profitable than BestSucc. Position is important because we preserve it and + // prefer first best match. Here we aren't comparing in order, so we capture + // the position instead. + if (DupCandidates.size() != 0) { + auto cmp = + [](const std::tuple<BranchProbability, MachineBasicBlock *> &a, + const std::tuple<BranchProbability, MachineBasicBlock *> &b) { + return std::get<0>(a) > std::get<0>(b); + }; + std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp); + } + for(auto &Tup : DupCandidates) { + BranchProbability DupProb; + MachineBasicBlock *Succ; + std::tie(DupProb, Succ) = Tup; + if (DupProb < BestProb) + break; + if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) + && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { + DEBUG( + dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " + << DupProb + << " (Tail Duplicate)\n"); + BestSucc.BB = Succ; + BestSucc.ShouldTailDup = true; + break; + } + } + + if (BestSucc.BB) + DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n"); return BestSucc; } @@ -862,7 +1520,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, /// /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( - BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { // Once we need to walk the worklist looking for a candidate, cleanup the // worklist of already placed entries. 
// FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -948,7 +1606,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( } void MachineBlockPlacement::fillWorkLists( - MachineBasicBlock *MBB, + const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { BlockChain &Chain = *BlockToChain[MBB]; @@ -970,23 +1628,23 @@ void MachineBlockPlacement::fillWorkLists( if (Chain.UnscheduledPredecessors != 0) return; - MBB = *Chain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); + MachineBasicBlock *BB = *Chain.begin(); + if (BB->isEHPad()) + EHPadWorkList.push_back(BB); else - BlockWorkList.push_back(MBB); + BlockWorkList.push_back(BB); } void MachineBlockPlacement::buildChain( - MachineBasicBlock *BB, BlockChain &Chain, + const MachineBasicBlock *HeadBB, BlockChain &Chain, BlockFilterSet *BlockFilter) { - assert(BB && "BB must not be null.\n"); - assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); + assert(HeadBB && "BB must not be null.\n"); + assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); - MachineBasicBlock *LoopHeaderBB = BB; + const MachineBasicBlock *LoopHeaderBB = HeadBB; markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); - BB = *std::prev(Chain.end()); + MachineBasicBlock *BB = *std::prev(Chain.end()); for (;;) { assert(BB && "null block found at end of chain in loop."); assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); @@ -995,7 +1653,11 @@ void MachineBlockPlacement::buildChain( // Look for the best viable successor if there is one to place immediately // after this block. - MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + auto Result = selectBestSuccessor(BB, Chain, BlockFilter); + MachineBasicBlock* BestSucc = Result.BB; + bool ShouldTailDup = Result.ShouldTailDup; + if (TailDupPlacement) + ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc)); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1016,7 +1678,7 @@ void MachineBlockPlacement::buildChain( // Placement may have changed tail duplication opportunities. // Check for that now. - if (TailDupPlacement && BestSucc) { + if (TailDupPlacement && BestSucc && ShouldTailDup) { // If the chosen successor was duplicated into all its predecessors, // don't bother laying it out, just go round the loop again with BB as // the chain end. @@ -1052,7 +1714,7 @@ void MachineBlockPlacement::buildChain( /// unconditional jump (for the backedge) rotating it in front of the loop /// header is always profitable. MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineLoop &L, +MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // Placing the latch block before the header may introduce an extra branch // that skips this block the first time the loop is executed, which we want @@ -1116,7 +1778,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * -MachineBlockPlacement::findBestLoopExit(MachineLoop &L, +MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // We don't want to layout the loop linearly in all cases. 
If the loop header // is just a normal basic block in the loop, we want to look for what block @@ -1235,7 +1897,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L, /// branches. For example, if the loop has fallthrough into its header and out /// of its bottom already, don't rotate it. void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, - MachineBasicBlock *ExitingBB, + const MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet) { if (!ExitingBB) return; @@ -1285,7 +1947,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, /// Therefore, the cost for a given rotation is the sum of costs listed above. /// We select the best rotation with the smallest cost. void MachineBlockPlacement::rotateLoopWithProfile( - BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet) { auto HeaderBB = L.getHeader(); auto HeaderIter = find(LoopChain, HeaderBB); auto RotationPos = LoopChain.end(); @@ -1422,7 +2085,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( /// When profile data is available, exclude cold blocks from the returned set; /// otherwise, collect all blocks in the loop. MachineBlockPlacement::BlockFilterSet -MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { +MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { BlockFilterSet LoopBlockSet; // Filter cold blocks off from LoopBlockSet when profile data is available. @@ -1459,10 +2122,10 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { /// as much as possible. We can then stitch the chains together in a way which /// both preserves the topological structure and minimizes taken conditional /// branches. -void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { +void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { // First recurse through any nested loops, building chains for those inner // loops. - for (MachineLoop *InnerLoop : L) + for (const MachineLoop *InnerLoop : L) buildLoopChains(*InnerLoop); assert(BlockWorkList.empty()); @@ -1499,7 +2162,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { assert(LoopChain.UnscheduledPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet); buildChain(LoopTop, LoopChain, &LoopBlockSet); @@ -1533,7 +2196,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { if (!LoopBlockSet.empty()) { BadLoop = true; - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) dbgs() << "Loop contains blocks never placed into a chain!\n" << " Loop header: " << getBlockName(*L.block_begin()) << "\n" << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" @@ -1546,31 +2209,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { EHPadWorkList.clear(); } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. 
- MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : *F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. - UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : *F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } -} - void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. @@ -1605,9 +2243,6 @@ void MachineBlockPlacement::buildCFGChains() { } } - // Turned on with OutlineOptionalBranches option - collectMustExecuteBBs(); - // Build any loop-based chains. PreferredLoopExit = nullptr; for (MachineLoop *L : *MLI) @@ -1839,7 +2474,7 @@ void MachineBlockPlacement::alignBlocks() { /// @return true if \p BB was removed. bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, - MachineBasicBlock *LoopHeaderBB, + const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt) { bool Removed, DuplicatedToLPred; @@ -1901,21 +2536,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( /// \return - True if the block was duplicated into all preds and removed. bool MachineBlockPlacement::maybeTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *LPred, - const BlockChain &Chain, BlockFilterSet *BlockFilter, + BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, bool &DuplicatedToLPred) { - DuplicatedToLPred = false; + if (!shouldTailDuplicate(BB)) + return false; + DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber() << "\n"); - bool IsSimple = TailDup.isSimpleBB(BB); - // Blocks with single successors don't create additional fallthrough - // opportunities. Don't duplicate them. TODO: When conditional exits are - // analyzable, allow them to be duplicated. - if (!IsSimple && BB->succ_size() == 1) - return false; - if (!TailDup.shouldTailDuplicate(IsSimple, *BB)) - return false; + // This has to be a callback because none of it can be done after // BB is deleted. bool Removed = false; @@ -1967,6 +2597,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback); SmallVector<MachineBasicBlock *, 8> DuplicatedPreds; + bool IsSimple = TailDup.isSimpleBB(BB); TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, &RemovalCallbackRef); @@ -2006,21 +2637,24 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis<MachineLoopInfo>(); TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); - MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = nullptr; // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. 
PreferredLoopExit = nullptr; + assert(BlockToChain.empty()); + assert(ComputedEdges.empty()); + if (TailDupPlacement) { - unsigned TailDupSize = TailDuplicatePlacementThreshold; + MPDT = &getAnalysis<MachinePostDominatorTree>(); + unsigned TailDupSize = TailDupPlacementThreshold; if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } - assert(BlockToChain.empty()); - buildCFGChains(); // Changing the layout can create new tail merging opportunities. @@ -2032,7 +2666,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1; + unsigned TailMergeSize = TailDupPlacementThreshold + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); @@ -2041,8 +2675,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); - // Must redo the dominator tree if blocks were changed. - MDT->runOnMachineFunction(MF); + ComputedEdges.clear(); + // Must redo the post-dominator tree if blocks were changed. + if (MPDT) + MPDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } @@ -2052,6 +2688,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { alignBlocks(); BlockToChain.clear(); + ComputedEdges.clear(); ChainAllocator.DestroyAll(); if (AlignAllBlock) @@ -2067,6 +2704,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MBI->setAlignment(AlignAllNonFallThruBlocks); } } + if (ViewBlockLayoutWithBFI != GVDT_None && + (ViewBlockFreqFuncName.empty() || + F->getFunction()->getName().equals(ViewBlockFreqFuncName))) { + MBFI->view("MBP." + MF.getName(), false); + } + // We always return true as we have no way to track whether the final order // differs from the original order. diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 5beed5f5dd087..50e453e4067cc 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // The machine combiner pass uses machine trace metrics to ensure the combined -// instructions does not lengthen the critical path or the resource depth. +// instructions do not lengthen the critical path or the resource depth. //===----------------------------------------------------------------------===// #define DEBUG_TYPE "machine-combiner" @@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth for (auto *InstrPtr : InsInstrs) { // for each Use unsigned IDepth = 0; - DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";); + DEBUG(dbgs() << "NEW INSTR "; + InstrPtr->print(dbgs(), TII); + dbgs() << "\n";); for (const MachineOperand &MO : InstrPtr->operands()) { // Check for virtual register operand. 
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) @@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { return false; } +static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, + SmallVector<MachineInstr *, 16> InsInstrs, + SmallVector<MachineInstr *, 16> DelInstrs, + MachineTraceMetrics *Traces) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + ++NumInstCombined; + Traces->invalidate(MBB); + Traces->verifyAnalysis(); +} + /// Substitute a slow code sequence with a faster one by /// evaluating instruction combining pattern. /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction @@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { DenseMap<unsigned, unsigned> InstrIdxForVirtReg; if (!MinInstr) MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); Traces->verifyAnalysis(); TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, InstrIdxForVirtReg); @@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || - (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - DelInstrs, InstrIdxForVirtReg, P) && - preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { - for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - - Changed = true; - ++NumInstCombined; - - Traces->invalidate(MBB); - Traces->verifyAnalysis(); + if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); // Eagerly stop after the first pattern fires. + Changed = true; break; } else { + // Calculating the trace metrics may be expensive, + // so only do this when necessary. + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, + InstrIdxForVirtReg, P) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + // Eagerly stop after the first pattern fires. + Changed = true; + break; + } // Cleanup instructions of the alternative code sequence. There is no // use for them. MachineFunction *MF = MBB->getParent(); diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 5de6dec29fb9d..7312dc5e94bdd 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { if (MO.isDef()) { Defs.push_back(Reg); - } else { + continue; + } else if (MO.readsReg()) ReadRegister(Reg); - } - // Treat undef use like defs for copy propagation but not for - // dead copy. We would need to do a liveness check to be sure the copy - // is dead for undef uses. 
- // The backends are allowed to do whatever they want with undef value - // and we cannot be sure this register will not be rewritten to break - // some false dependencies for the hardware for instance. - if (MO.isUndef()) - Defs.push_back(Reg); } // The instruction has a register mask operand which means that it clobbers diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 303a6a9263be7..e3a6c51c47ad5 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); + DT.reset(new DominatorTreeBase<MachineBasicBlock>(false)); DT->recalculate(F); - return false; } MachineDominatorTree::MachineDominatorTree() : MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase<MachineBasicBlock>(false); -} - -MachineDominatorTree::~MachineDominatorTree() { - delete DT; } void MachineDominatorTree::releaseMemory() { - DT->releaseMemory(); + CriticalEdgesToSplit.clear(); + DT.reset(nullptr); } void MachineDominatorTree::verifyAnalysis() const { - if (VerifyMachineDomInfo) + if (DT && VerifyMachineDomInfo) verifyDomTree(); } void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { - DT->print(OS); + if (DT) + DT->print(OS); } void MachineDominatorTree::applySplitCriticalEdges() const { @@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const { } void MachineDominatorTree::verifyDomTree() const { + if (!DT) + return; MachineFunction &F = *getRoot()->getParent(); - MachineDominatorTree OtherDT; - OtherDT.DT->recalculate(F); - if (compare(OtherDT)) { + DominatorTreeBase<MachineBasicBlock> OtherDT(false); + OtherDT.recalculate(F); + if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || + DT->compare(OtherDT)) { errs() << "MachineDominatorTree is not up to date!\nComputed:\n"; - print(errs(), nullptr); + DT->print(errs()); errs() << "\nActual:\n"; - OtherDT.print(errs(), nullptr); + OtherDT.print(errs()); abort(); } } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index c1d5ea96cd17a..c9767a25e908d 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -169,6 +169,7 @@ void MachineFunction::clear() { InstructionRecycler.clear(Allocator); OperandRecycler.clear(Allocator); BasicBlockRecycler.clear(Allocator); + VariableDbgInfos.clear(); if (RegInfo) { RegInfo->~MachineRegisterInfo(); Allocator.Deallocate(RegInfo); @@ -859,7 +860,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { if (!isCalleeSavedInfoValid()) return BV; - for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; + ++CSR) BV.set(*CSR); // Saved CSRs are not pristine. 
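Editor's note: both the old TRI-based call and the new MRI-based one above return the callee-saved set as a zero-terminated array of MCPhysReg, which is why the loop tests `CSR && *CSR`. A minimal sketch of consuming such a list (illustration only; countCalleeSaves is a hypothetical helper, not an LLVM API):

#include "llvm/MC/MCRegisterInfo.h"   // MCPhysReg

static unsigned countCalleeSaves(const llvm::MCPhysReg *CSRegs) {
  unsigned N = 0;
  // A null pointer means "no list"; otherwise the array ends at the 0 entry.
  if (CSRegs)
    while (CSRegs[N] != 0)
      ++N;
  return N;
}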
@@ -956,7 +959,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MachineFrameInfo::dump(const MachineFunction &MF) const { +LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const { print(MF, dbgs()); } #endif diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 2f2e3b3d8e9f1..c0a8b95ed8a06 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -262,8 +262,21 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getBlockAddress() == Other.getBlockAddress() && getOffset() == Other.getOffset(); case MachineOperand::MO_RegisterMask: - case MachineOperand::MO_RegisterLiveOut: - return getRegMask() == Other.getRegMask(); + case MachineOperand::MO_RegisterLiveOut: { + // Shallow compare of the two RegMasks + const uint32_t *RegMask = getRegMask(); + const uint32_t *OtherRegMask = Other.getRegMask(); + if (RegMask == OtherRegMask) + return true; + + // Calculate the size of the RegMask + const MachineFunction *MF = getParent()->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + + // Deep compare of the two RegMasks + return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask); + } case MachineOperand::MO_MCSymbol: return getMCSymbol() == Other.getMCSymbol(); case MachineOperand::MO_CFIIndex: @@ -274,6 +287,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getIntrinsicID() == Other.getIntrinsicID(); case MachineOperand::MO_Predicate: return getPredicate() == Other.getPredicate(); + case MachineOperand::MO_Placeholder: + return true; } llvm_unreachable("Invalid machine operand type"); } @@ -322,6 +337,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID()); case MachineOperand::MO_Predicate: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate()); + case MachineOperand::MO_Placeholder: + return hash_combine(); } llvm_unreachable("Invalid machine operand type"); } @@ -403,6 +420,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, bool Unused; APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused); OS << "half " << APF.convertToFloat(); + } else if (getFPImm()->getType()->isFP128Ty()) { + APFloat APF = getFPImm()->getValueAPF(); + SmallString<16> Str; + getFPImm()->getValueAPF().toString(Str); + OS << "quad " << Str; } else { OS << getFPImm()->getValueAPF().convertToDouble(); } @@ -491,7 +513,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred") << CmpInst::getPredicateName(Pred) << '>'; + break; } + case MachineOperand::MO_Placeholder: + OS << "<placeholder>"; + break; } if (unsigned TF = getTargetFlags()) OS << "[TF=" << TF << ']'; @@ -1571,6 +1597,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const { return true; } +bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, + bool UseTBAA) { + const MachineFunction *MF = getParent()->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + // If neither instruction stores to memory, they can't alias in any + // meaningful way, even if they read from the same address. 
+ if (!mayStore() && !Other.mayStore()) + return false; + + // Let the target decide if memory accesses cannot possibly overlap. + if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA)) + return false; + + if (!AA) + return true; + + // FIXME: Need to handle multiple memory operands to support all targets. + if (!hasOneMemOperand() || !Other.hasOneMemOperand()) + return true; + + MachineMemOperand *MMOa = *memoperands_begin(); + MachineMemOperand *MMOb = *Other.memoperands_begin(); + + if (!MMOa->getValue() || !MMOb->getValue()) + return true; + + // The following interface to AA is fashioned after DAGCombiner::isAlias + // and operates with MachineMemOperand offset with some important + // assumptions: + // - LLVM fundamentally assumes flat address spaces. + // - MachineOperand offset can *only* result from legalization and + // cannot affect queries other than the trivial case of overlap + // checking. + // - These offsets never wrap and never step outside + // of allocated objects. + // - There should never be any negative offsets here. + // + // FIXME: Modify API to hide this math from "user" + // FIXME: Even before we go to AA we can reason locally about some + // memory objects. It can save compile time, and possibly catch some + // corner cases not currently covered. + + assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + + int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); + int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; + int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; + + AliasResult AAResult = + AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, + UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), + MemoryLocation(MMOb->getValue(), Overlapb, + UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); + + return (AAResult != NoAlias); +} + /// hasOrderedMemoryRef - Return true if this instruction may have an ordered /// or volatile memory reference, or if the information describing the memory /// reference is not available. Return false if it is known to have no ordered @@ -1692,14 +1777,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF, } } -LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineInstr::dump() const { dbgs() << " "; - print(dbgs(), false /* SkipOpers */, TII); -#endif + print(dbgs()); } +#endif -void MachineInstr::print(raw_ostream &OS, bool SkipOpers, +void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc, const TargetInstrInfo *TII) const { const Module *M = nullptr; if (const MachineBasicBlock *MBB = getParent()) @@ -1707,11 +1792,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers, M = MF->getFunction()->getParent(); ModuleSlotTracker MST(M); - print(OS, MST, SkipOpers, TII); + print(OS, MST, SkipOpers, SkipDebugLoc, TII); } void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, - bool SkipOpers, const TargetInstrInfo *TII) const { + bool SkipOpers, bool SkipDebugLoc, + const TargetInstrInfo *TII) const { // We can be a bit tidier if we know the MachineFunction. 
const MachineFunction *MF = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -1987,6 +2073,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } if (isIndirectDebugValue()) OS << " indirect"; + } else if (SkipDebugLoc) { + return; } else if (debugLoc && MF) { if (!HaveSemi) OS << ";"; diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp index fdeaf7b711611..a9aa1d954e70e 100644 --- a/lib/CodeGen/MachineLoopInfo.cpp +++ b/lib/CodeGen/MachineLoopInfo.cpp @@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() { return nullptr; } +DebugLoc MachineLoop::getStartLoc() const { + // Try the pre-header first. + if (MachineBasicBlock *PHeadMBB = getLoopPreheader()) + if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock()) + if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc()) + return DL; + + // If we have no pre-header or there are no instructions with debug + // info in it, try the header. + if (MachineBasicBlock *HeadMBB = getHeader()) + if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock()) + return HeadBB->getTerminator()->getDebugLoc(); + + return DebugLoc(); +} + MachineBasicBlock * MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader) const { diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 6618857477ed4..2f0f4297ef5c5 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -306,6 +306,10 @@ public: MMI.deleteMachineFunctionFor(F); return true; } + + StringRef getPassName() const override { + return "Free MachineFunction"; + } }; char FreeMachineFunction::ID; } // end anonymous namespace diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp index 22d519e5d88fa..4c81fd91cb829 100644 --- a/lib/CodeGen/MachineModuleInfoImpls.cpp +++ b/lib/CodeGen/MachineModuleInfoImpls.cpp @@ -23,6 +23,7 @@ using namespace llvm; // Out of line virtual method. void MachineModuleInfoMachO::anchor() {} void MachineModuleInfoELF::anchor() {} +void MachineModuleInfoWasm::anchor() {} static int SortSymbolPair(const void *LHS, const void *RHS) { typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy; diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp new file mode 100644 index 0000000000000..6b6b5f2814a90 --- /dev/null +++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -0,0 +1,100 @@ +///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// Optimization diagnostic interfaces for machine passes. It's packaged as an +/// analysis pass so that by using this service passes become dependent on MBFI +/// as well. MBFI is used to compute the "hotness" of the diagnostic message. 
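///
/// The hotness of a remark is simply the profile count of the basic block it
/// is attached to, which lets consumers sort remarks so that the hottest code
/// is inspected first.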
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( + StringRef MKey, const MachineInstr &MI) + : Argument() { + Key = MKey; + + raw_string_ostream OS(Val); + MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true); +} + +Optional<uint64_t> +MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) { + if (!MBFI) + return None; + + return MBFI->getBlockProfileCount(&MBB); +} + +void MachineOptimizationRemarkEmitter::computeHotness( + DiagnosticInfoMIROptimization &Remark) { + const MachineBasicBlock *MBB = Remark.getBlock(); + if (MBB) + Remark.setHotness(computeHotness(*MBB)); +} + +void MachineOptimizationRemarkEmitter::emit( + DiagnosticInfoOptimizationBase &OptDiagCommon) { + auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon); + computeHotness(OptDiag); + + LLVMContext &Ctx = MF.getFunction()->getContext(); + yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); + if (Out) { + auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); + *Out << P; + } + // FIXME: now that IsVerbose is part of DI, filtering for this will be moved + // from here to clang. + if (!OptDiag.isVerbose() || shouldEmitVerbose()) + Ctx.diagnose(OptDiag); +} + +MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass() + : MachineFunctionPass(ID) { + initializeMachineOptimizationRemarkEmitterPassPass( + *PassRegistry::getPassRegistry()); +} + +bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( + MachineFunction &MF) { + MachineBlockFrequencyInfo *MBFI; + + if (MF.getFunction()->getContext().getDiagnosticHotnessRequested()) + MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI(); + else + MBFI = nullptr; + + ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); + return false; +} + +void MachineOptimizationRemarkEmitterPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +char MachineOptimizationRemarkEmitterPass::ID = 0; +static const char ore_name[] = "Machine Optimization Remark Emitter"; +#define ORE_NAME "machine-opt-remark-emitter" + +INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, + false, true) +INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass) +INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name, + false, true) diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp new file mode 100644 index 0000000000000..581a8ad811497 --- /dev/null +++ b/lib/CodeGen/MachineOutliner.cpp @@ -0,0 +1,1251 @@ +//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Replaces repeated sequences of instructions with function calls. 
+/// +/// This works by placing every instruction from every basic block in a +/// suffix tree, and repeatedly querying that tree for repeated sequences of +/// instructions. If a sequence of instructions appears often, then it ought +/// to be beneficial to pull out into a function. +/// +/// This was originally presented at the 2016 LLVM Developers' Meeting in the +/// talk "Reducing Code Size Using Outlining". For a high-level overview of +/// how this pass works, the talk is available on YouTube at +/// +/// https://www.youtube.com/watch?v=yorld-WSOeU +/// +/// The slides for the talk are available at +/// +/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf +/// +/// The talk provides an overview of how the outliner finds candidates and +/// ultimately outlines them. It describes how the main data structure for this +/// pass, the suffix tree, is queried and purged for candidates. It also gives +/// a simplified suffix tree construction algorithm for suffix trees based off +/// of the algorithm actually used here, Ukkonen's algorithm. +/// +/// For the original RFC for this pass, please see +/// +/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html +/// +/// For more information on the suffix tree data structure, please see +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +/// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <functional> +#include <map> +#include <sstream> +#include <tuple> +#include <vector> + +#define DEBUG_TYPE "machine-outliner" + +using namespace llvm; + +STATISTIC(NumOutlined, "Number of candidates outlined"); +STATISTIC(FunctionsCreated, "Number of functions created"); + +namespace { + +/// \brief An individual sequence of instructions to be replaced with a call to +/// an outlined function. +struct Candidate { + + /// Set to false if the candidate overlapped with another candidate. + bool InCandidateList = true; + + /// The start index of this \p Candidate. + size_t StartIdx; + + /// The number of instructions in this \p Candidate. + size_t Len; + + /// The index of this \p Candidate's \p OutlinedFunction in the list of + /// \p OutlinedFunctions. + size_t FunctionIdx; + + /// \brief The number of instructions that would be saved by outlining every + /// candidate of this type. + /// + /// This is a fixed value which is not updated during the candidate pruning + /// process. It is only used for deciding which candidate to keep if two + /// candidates overlap. The true benefit is stored in the OutlinedFunction + /// for some given candidate. + unsigned Benefit = 0; + + Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx) + : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {} + + Candidate() {} + + /// \brief Used to ensure that \p Candidates are outlined in an order that + /// preserves the start and end indices of other \p Candidates. 
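  ///
  /// Sorting by decreasing start index means candidates are outlined from the
  /// back of the instruction mapping towards the front, so cutting one out
  /// never disturbs the indices of candidates that have yet to be visited.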
+ bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; } +}; + +/// \brief The information necessary to create an outlined function for some +/// class of candidate. +struct OutlinedFunction { + + /// The actual outlined function created. + /// This is initialized after we go through and create the actual function. + MachineFunction *MF = nullptr; + + /// A number assigned to this function which appears at the end of its name. + size_t Name; + + /// The number of candidates for this OutlinedFunction. + size_t OccurrenceCount = 0; + + /// \brief The sequence of integers corresponding to the instructions in this + /// function. + std::vector<unsigned> Sequence; + + /// The number of instructions this function would save. + unsigned Benefit = 0; + + /// \brief Set to true if candidates for this outlined function should be + /// replaced with tail calls to this OutlinedFunction. + bool IsTailCall = false; + + OutlinedFunction(size_t Name, size_t OccurrenceCount, + const std::vector<unsigned> &Sequence, + unsigned Benefit, bool IsTailCall) + : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence), + Benefit(Benefit), IsTailCall(IsTailCall) + {} +}; + +/// Represents an undefined index in the suffix tree. +const size_t EmptyIdx = -1; + +/// A node in a suffix tree which represents a substring or suffix. +/// +/// Each node has either no children or at least two children, with the root +/// being a exception in the empty tree. +/// +/// Children are represented as a map between unsigned integers and nodes. If +/// a node N has a child M on unsigned integer k, then the mapping represented +/// by N is a proper prefix of the mapping represented by M. Note that this, +/// although similar to a trie is somewhat different: each node stores a full +/// substring of the full mapping rather than a single character state. +/// +/// Each internal node contains a pointer to the internal node representing +/// the same string, but with the first character chopped off. This is stored +/// in \p Link. Each leaf node stores the start index of its respective +/// suffix in \p SuffixIdx. +struct SuffixTreeNode { + + /// The children of this node. + /// + /// A child existing on an unsigned integer implies that from the mapping + /// represented by the current node, there is a way to reach another + /// mapping by tacking that character on the end of the current string. + DenseMap<unsigned, SuffixTreeNode *> Children; + + /// A flag set to false if the node has been pruned from the tree. + bool IsInTree = true; + + /// The start index of this node's substring in the main string. + size_t StartIdx = EmptyIdx; + + /// The end index of this node's substring in the main string. + /// + /// Every leaf node must have its \p EndIdx incremented at the end of every + /// step in the construction algorithm. To avoid having to update O(N) + /// nodes individually at the end of every step, the end index is stored + /// as a pointer. + size_t *EndIdx = nullptr; + + /// For leaves, the start index of the suffix represented by this node. + /// + /// For all other nodes, this is ignored. + size_t SuffixIdx = EmptyIdx; + + /// \brief For internal nodes, a pointer to the internal node representing + /// the same sequence with the first character chopped off. + /// + /// This has two major purposes in the suffix tree. The first is as a + /// shortcut in Ukkonen's construction algorithm. 
One of the things that + /// Ukkonen's algorithm does to achieve linear-time construction is + /// keep track of which node the next insert should be at. This makes each + /// insert O(1), and there are a total of O(N) inserts. The suffix link + /// helps with inserting children of internal nodes. + /// + /// Say we add a child to an internal node with associated mapping S. The + /// next insertion must be at the node representing S - its first character. + /// This is given by the way that we iteratively build the tree in Ukkonen's + /// algorithm. The main idea is to look at the suffixes of each prefix in the + /// string, starting with the longest suffix of the prefix, and ending with + /// the shortest. Therefore, if we keep pointers between such nodes, we can + /// move to the next insertion point in O(1) time. If we don't, then we'd + /// have to query from the root, which takes O(N) time. This would make the + /// construction algorithm O(N^2) rather than O(N). + /// + /// The suffix link is also used during the tree pruning process to let us + /// quickly throw out a bunch of potential overlaps. Say we have a sequence + /// S we want to outline. Then each of its suffixes contribute to at least + /// one overlapping case. Therefore, we can follow the suffix links + /// starting at the node associated with S to the root and "delete" those + /// nodes, save for the root. For each candidate, this removes + /// O(|candidate|) overlaps from the search space. We don't actually + /// completely invalidate these nodes though; doing that is far too + /// aggressive. Consider the following pathological string: + /// + /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 + /// + /// If we, for the sake of example, outlined 1 2 3, then we would throw + /// out all instances of 2 3. This isn't desirable. To get around this, + /// when we visit a link node, we decrement its occurrence count by the + /// number of sequences we outlined in the current step. In the pathological + /// example, the 2 3 node would have an occurrence count of 8, while the + /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node + /// would survive to the next round allowing us to outline the extra + /// instances of 2 3. + SuffixTreeNode *Link = nullptr; + + /// The parent of this node. Every node except for the root has a parent. + SuffixTreeNode *Parent = nullptr; + + /// The number of times this node's string appears in the tree. + /// + /// This is equal to the number of leaf children of the string. It represents + /// the number of suffixes that the node's string is a prefix of. + size_t OccurrenceCount = 0; + + /// The length of the string formed by concatenating the edge labels from the + /// root to this node. + size_t ConcatLen = 0; + + /// Returns true if this node is a leaf. + bool isLeaf() const { return SuffixIdx != EmptyIdx; } + + /// Returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const { return StartIdx == EmptyIdx; } + + /// Return the number of elements in the substring associated with this node. + size_t size() const { + + // Is it the root? If so, it's the empty string so return 0. + if (isRoot()) + return 0; + + assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + + // Size = the number of elements in the string. + // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. 
+ return *EndIdx - StartIdx + 1; + } + + SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link, + SuffixTreeNode *Parent) + : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} + + SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. +/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. +/// +/// The suffix tree is implemented using Ukkonen's algorithm for linear-time +/// suffix tree construction. Ukkonen's algorithm is explained in more detail +/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The +/// paper is available at +/// +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +class SuffixTree { +private: + /// Each element is an integer representing an instruction in the module. + ArrayRef<unsigned> Str; + + /// Maintains each node in the tree. + SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; + + /// The root of the suffix tree. + /// + /// The root represents the empty string. It is maintained by the + /// \p NodeAllocator like every other node in the tree. + SuffixTreeNode *Root = nullptr; + + /// Stores each leaf node in the tree. + /// + /// This is used for finding outlining candidates. + std::vector<SuffixTreeNode *> LeafVector; + + /// Maintains the end indices of the internal nodes in the tree. + /// + /// Each internal node is guaranteed to never have its end index change + /// during the construction algorithm; however, leaves must be updated at + /// every step. Therefore, we need to store leaf end indices by reference + /// to avoid updating O(N) leaves at every step of construction. Thus, + /// every internal node must be allocated its own end index. + BumpPtrAllocator InternalEndIdxAllocator; + + /// The end index of each leaf in the tree. + size_t LeafEndIdx = -1; + + /// \brief Helper struct which keeps track of the next insertion point in + /// Ukkonen's algorithm. + struct ActiveState { + /// The next node to insert at. + SuffixTreeNode *Node; + + /// The index of the first character in the substring currently being added. + size_t Idx = EmptyIdx; + + /// The length of the substring we have to add at the current step. + size_t Len = 0; + }; + + /// \brief The point the next insertion will take place at in the + /// construction algorithm. + ActiveState Active; + + /// Allocate a leaf node and add it to the tree. + /// + /// \param Parent The parent of this node. + /// \param StartIdx The start index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated leaf node. 
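  ///
  /// Note that every leaf shares the single \p LeafEndIdx rather than owning
  /// its own end index, so all leaves can be extended in one step of the
  /// construction algorithm by bumping \p LeafEndIdx once.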
+ SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx, + unsigned Edge) { + + assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); + + SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, + &LeafEndIdx, + nullptr, + &Parent); + Parent.Children[Edge] = N; + + return N; + } + + /// Allocate an internal node and add it to the tree. + /// + /// \param Parent The parent of this node. Only null when allocating the root. + /// \param StartIdx The start index of this node's associated string. + /// \param EndIdx The end index of this node's associated string. + /// \param Edge The label on the edge leaving \p Parent to this node. + /// + /// \returns A pointer to the allocated internal node. + SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx, + size_t EndIdx, unsigned Edge) { + + assert(StartIdx <= EndIdx && "String can't start after it ends!"); + assert(!(!Parent && StartIdx != EmptyIdx) && + "Non-root internal nodes must have parents!"); + + size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx); + SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, + E, + Root, + Parent); + if (Parent) + Parent->Children[Edge] = N; + + return N; + } + + /// \brief Set the suffix indices of the leaves to the start indices of their + /// respective suffixes. Also stores each leaf in \p LeafVector at its + /// respective suffix index. + /// + /// \param[in] CurrNode The node currently being visited. + /// \param CurrIdx The current index of the string being visited. + void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) { + + bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot(); + + // Store the length of the concatenation of all strings from the root to + // this node. + if (!CurrNode.isRoot()) { + if (CurrNode.ConcatLen == 0) + CurrNode.ConcatLen = CurrNode.size(); + + if (CurrNode.Parent) + CurrNode.ConcatLen += CurrNode.Parent->ConcatLen; + } + + // Traverse the tree depth-first. + for (auto &ChildPair : CurrNode.Children) { + assert(ChildPair.second && "Node had a null child!"); + setSuffixIndices(*ChildPair.second, + CurrIdx + ChildPair.second->size()); + } + + // Is this node a leaf? + if (IsLeaf) { + // If yes, give it a suffix index and bump its parent's occurrence count. + CurrNode.SuffixIdx = Str.size() - CurrIdx; + assert(CurrNode.Parent && "CurrNode had no parent!"); + CurrNode.Parent->OccurrenceCount++; + + // Store the leaf in the leaf vector for pruning later. + LeafVector[CurrNode.SuffixIdx] = &CurrNode; + } + } + + /// \brief Construct the suffix tree for the prefix of the input ending at + /// \p EndIdx. + /// + /// Used to construct the full suffix tree iteratively. At the end of each + /// step, the constructed suffix tree is either a valid suffix tree, or a + /// suffix tree with implicit suffixes. At the end of the final step, the + /// suffix tree is a valid tree. + /// + /// \param EndIdx The end index of the current prefix in the main string. + /// \param SuffixesToAdd The number of suffixes that must be added + /// to complete the suffix tree at the current phase. + /// + /// \returns The number of suffixes that have not been added at the end of + /// this step. + unsigned extend(size_t EndIdx, size_t SuffixesToAdd) { + SuffixTreeNode *NeedsLink = nullptr; + + while (SuffixesToAdd > 0) { + + // Are we waiting to add anything other than just the last character? + if (Active.Len == 0) { + // If not, then say the active index is the end index. 
+ Active.Idx = EndIdx; + } + + assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); + + // The first character in the current substring we're looking at. + unsigned FirstChar = Str[Active.Idx]; + + // Have we inserted anything starting with FirstChar at the current node? + if (Active.Node->Children.count(FirstChar) == 0) { + // If not, then we can just insert a leaf and move too the next step. + insertLeaf(*Active.Node, EndIdx, FirstChar); + + // The active node is an internal node, and we visited it, so it must + // need a link if it doesn't have one. + if (NeedsLink) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + } else { + // There's a match with FirstChar, so look for the point in the tree to + // insert a new node. + SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; + + size_t SubstringLen = NextNode->size(); + + // Is the current suffix we're trying to insert longer than the size of + // the child we want to move to? + if (Active.Len >= SubstringLen) { + // If yes, then consume the characters we've seen and move to the next + // node. + Active.Idx += SubstringLen; + Active.Len -= SubstringLen; + Active.Node = NextNode; + continue; + } + + // Otherwise, the suffix we're trying to insert must be contained in the + // next node we want to move to. + unsigned LastChar = Str[EndIdx]; + + // Is the string we're trying to insert a substring of the next node? + if (Str[NextNode->StartIdx + Active.Len] == LastChar) { + // If yes, then we're done for this step. Remember our insertion point + // and move to the next end index. At this point, we have an implicit + // suffix tree. + if (NeedsLink && !Active.Node->isRoot()) { + NeedsLink->Link = Active.Node; + NeedsLink = nullptr; + } + + Active.Len++; + break; + } + + // The string we're trying to insert isn't a substring of the next node, + // but matches up to a point. Split the node. + // + // For example, say we ended our search at a node n and we're trying to + // insert ABD. Then we'll create a new node s for AB, reduce n to just + // representing C, and insert a new leaf node l to represent d. This + // allows us to ensure that if n was a leaf, it remains a leaf. + // + // | ABC ---split---> | AB + // n s + // C / \ D + // n l + + // The node s from the diagram + SuffixTreeNode *SplitNode = + insertInternalNode(Active.Node, + NextNode->StartIdx, + NextNode->StartIdx + Active.Len - 1, + FirstChar); + + // Insert the new node representing the new substring into the tree as + // a child of the split node. This is the node l from the diagram. + insertLeaf(*SplitNode, EndIdx, LastChar); + + // Make the old node a child of the split node and update its start + // index. This is the node n from the diagram. + NextNode->StartIdx += Active.Len; + NextNode->Parent = SplitNode; + SplitNode->Children[Str[NextNode->StartIdx]] = NextNode; + + // SplitNode is an internal node, update the suffix link. + if (NeedsLink) + NeedsLink->Link = SplitNode; + + NeedsLink = SplitNode; + } + + // We've added something new to the tree, so there's one less suffix to + // add. + SuffixesToAdd--; + + if (Active.Node->isRoot()) { + if (Active.Len > 0) { + Active.Len--; + Active.Idx = EndIdx - SuffixesToAdd + 1; + } + } else { + // Start the next phase at the next smallest suffix. + Active.Node = Active.Node->Link; + } + } + + return SuffixesToAdd; + } + +public: + + /// Find all repeated substrings that satisfy \p BenefitFn. 
+ /// + /// If a substring appears at least twice, then it must be represented by + /// an internal node which appears in at least two suffixes. Each suffix is + /// represented by a leaf node. To do this, we visit each internal node in + /// the tree, using the leaf children of each internal node. If an internal + /// node represents a beneficial substring, then we use each of its leaf + /// children to find the locations of its substring. + /// + /// \param[out] CandidateList Filled with candidates representing each + /// beneficial substring. + /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions each + /// type of candidate. + /// \param BenefitFn The function to satisfy. + /// + /// \returns The length of the longest candidate found. + size_t findCandidates(std::vector<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)> + &BenefitFn) { + + CandidateList.clear(); + FunctionList.clear(); + size_t FnIdx = 0; + size_t MaxLen = 0; + + for (SuffixTreeNode* Leaf : LeafVector) { + assert(Leaf && "Leaves in LeafVector cannot be null!"); + if (!Leaf->IsInTree) + continue; + + assert(Leaf->Parent && "All leaves must have parents!"); + SuffixTreeNode &Parent = *(Leaf->Parent); + + // If it doesn't appear enough, or we already outlined from it, skip it. + if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree) + continue; + + size_t StringLen = Leaf->ConcatLen - Leaf->size(); + + // How many instructions would outlining this string save? + unsigned Benefit = BenefitFn(Parent, + StringLen, Str[Leaf->SuffixIdx + StringLen - 1]); + + // If it's not beneficial, skip it. + if (Benefit < 1) + continue; + + if (StringLen > MaxLen) + MaxLen = StringLen; + + unsigned OccurrenceCount = 0; + for (auto &ChildPair : Parent.Children) { + SuffixTreeNode *M = ChildPair.second; + + // Is it a leaf? If so, we have an occurrence of this candidate. + if (M && M->IsInTree && M->isLeaf()) { + OccurrenceCount++; + CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx); + CandidateList.back().Benefit = Benefit; + M->IsInTree = false; + } + } + + // Save the function for the new candidate sequence. + std::vector<unsigned> CandidateSequence; + for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++) + CandidateSequence.push_back(Str[i]); + + FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence, + Benefit, false); + + // Move to the next function. + FnIdx++; + Parent.IsInTree = false; + } + + return MaxLen; + } + + /// Construct a suffix tree from a sequence of unsigned integers. + /// + /// \param Str The string to construct the suffix tree for. + SuffixTree(const std::vector<unsigned> &Str) : Str(Str) { + Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); + Root->IsInTree = true; + Active.Node = Root; + LeafVector = std::vector<SuffixTreeNode*>(Str.size()); + + // Keep track of the number of suffixes we have to add of the current + // prefix. + size_t SuffixesToAdd = 0; + Active.Node = Root; + + // Construct the suffix tree iteratively on each prefix of the string. + // PfxEndIdx is the end index of the current prefix. + // End is one past the last element in the string. + for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { + SuffixesToAdd++; + LeafEndIdx = PfxEndIdx; // Extend each of the leaves. + SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); + } + + // Set the suffix indices of each leaf. 
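    // This final walk also fills LeafVector and the per-node occurrence
    // counts that findCandidates relies on later.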
+ assert(Root && "Root node can't be nullptr!"); + setSuffixIndices(*Root, 0); + } +}; + +/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings. +struct InstructionMapper { + + /// \brief The next available integer to assign to a \p MachineInstr that + /// cannot be outlined. + /// + /// Set to -3 for compatability with \p DenseMapInfo<unsigned>. + unsigned IllegalInstrNumber = -3; + + /// \brief The next available integer to assign to a \p MachineInstr that can + /// be outlined. + unsigned LegalInstrNumber = 0; + + /// Correspondence from \p MachineInstrs to unsigned integers. + DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait> + InstructionIntegerMap; + + /// Corresponcence from unsigned integers to \p MachineInstrs. + /// Inverse of \p InstructionIntegerMap. + DenseMap<unsigned, MachineInstr *> IntegerInstructionMap; + + /// The vector of unsigned integers that the module is mapped to. + std::vector<unsigned> UnsignedVec; + + /// \brief Stores the location of the instruction associated with the integer + /// at index i in \p UnsignedVec for each index i. + std::vector<MachineBasicBlock::iterator> InstrList; + + /// \brief Maps \p *It to a legal integer. + /// + /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap, + /// \p IntegerInstructionMap, and \p LegalInstrNumber. + /// + /// \returns The integer that \p *It was mapped to. + unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) { + + // Get the integer for this instruction or give it the current + // LegalInstrNumber. + InstrList.push_back(It); + MachineInstr &MI = *It; + bool WasInserted; + DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator + ResultIt; + std::tie(ResultIt, WasInserted) = + InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber)); + unsigned MINumber = ResultIt->second; + + // There was an insertion. + if (WasInserted) { + LegalInstrNumber++; + IntegerInstructionMap.insert(std::make_pair(MINumber, &MI)); + } + + UnsignedVec.push_back(MINumber); + + // Make sure we don't overflow or use any integers reserved by the DenseMap. + if (LegalInstrNumber >= IllegalInstrNumber) + report_fatal_error("Instruction mapping overflow!"); + + assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() + && "Tried to assign DenseMap tombstone or empty key to instruction."); + assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() + && "Tried to assign DenseMap tombstone or empty key to instruction."); + + return MINumber; + } + + /// Maps \p *It to an illegal integer. + /// + /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber. + /// + /// \returns The integer that \p *It was mapped to. + unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) { + unsigned MINumber = IllegalInstrNumber; + + InstrList.push_back(It); + UnsignedVec.push_back(IllegalInstrNumber); + IllegalInstrNumber--; + + assert(LegalInstrNumber < IllegalInstrNumber && + "Instruction mapping overflow!"); + + assert(IllegalInstrNumber != + DenseMapInfo<unsigned>::getEmptyKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + assert(IllegalInstrNumber != + DenseMapInfo<unsigned>::getTombstoneKey() && + "IllegalInstrNumber cannot be DenseMap tombstone or empty key!"); + + return MINumber; + } + + /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds + /// and appends it to \p UnsignedVec and \p InstrList. + /// + /// Two instructions are assigned the same integer if they are identical. 
+ /// If an instruction is deemed unsafe to outline, then it will be assigned an + /// unique integer. The resulting mapping is placed into a suffix tree and + /// queried for candidates. + /// + /// \param MBB The \p MachineBasicBlock to be translated into integers. + /// \param TRI \p TargetRegisterInfo for the module. + /// \param TII \p TargetInstrInfo for the module. + void convertToUnsignedVec(MachineBasicBlock &MBB, + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { + for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et; + It++) { + + // Keep track of where this instruction is in the module. + switch(TII.getOutliningType(*It)) { + case TargetInstrInfo::MachineOutlinerInstrType::Illegal: + mapToIllegalUnsigned(It); + break; + + case TargetInstrInfo::MachineOutlinerInstrType::Legal: + mapToLegalUnsigned(It); + break; + + case TargetInstrInfo::MachineOutlinerInstrType::Invisible: + break; + } + } + + // After we're done every insertion, uniquely terminate this part of the + // "string". This makes sure we won't match across basic block or function + // boundaries since the "end" is encoded uniquely and thus appears in no + // repeated substring. + InstrList.push_back(MBB.end()); + UnsignedVec.push_back(IllegalInstrNumber); + IllegalInstrNumber--; + } + + InstructionMapper() { + // Make sure that the implementation of DenseMapInfo<unsigned> hasn't + // changed. + assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 && + "DenseMapInfo<unsigned>'s empty key isn't -1!"); + assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 && + "DenseMapInfo<unsigned>'s tombstone key isn't -2!"); + } +}; + +/// \brief An interprocedural pass which finds repeated sequences of +/// instructions and replaces them with calls to functions. +/// +/// Each instruction is mapped to an unsigned integer and placed in a string. +/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree +/// is then repeatedly queried for repeated sequences of instructions. Each +/// non-overlapping repeated sequence is then placed in its own +/// \p MachineFunction and each instance is then replaced with a call to that +/// function. +struct MachineOutliner : public ModulePass { + + static char ID; + + StringRef getPassName() const override { return "Machine Outliner"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineModuleInfo>(); + AU.addPreserved<MachineModuleInfo>(); + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } + + MachineOutliner() : ModulePass(ID) { + initializeMachineOutlinerPass(*PassRegistry::getPassRegistry()); + } + + /// \brief Replace the sequences of instructions represented by the + /// \p Candidates in \p CandidateList with calls to \p MachineFunctions + /// described in \p FunctionList. + /// + /// \param M The module we are outlining from. + /// \param CandidateList A list of candidates to be outlined. + /// \param FunctionList A list of functions to be inserted into the module. + /// \param Mapper Contains the instruction mappings for the module. + bool outline(Module &M, const ArrayRef<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + InstructionMapper &Mapper); + + /// Creates a function for \p OF and inserts it into the module. + MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF, + InstructionMapper &Mapper); + + /// Find potential outlining candidates and store them in \p CandidateList. 
+ /// + /// For each type of potential candidate, also build an \p OutlinedFunction + /// struct containing the information to build the function for that + /// candidate. + /// + /// \param[out] CandidateList Filled with outlining candidates for the module. + /// \param[out] FunctionList Filled with functions corresponding to each type + /// of \p Candidate. + /// \param ST The suffix tree for the module. + /// \param TII TargetInstrInfo for the module. + /// + /// \returns The length of the longest candidate found. 0 if there are none. + unsigned buildCandidateList(std::vector<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + SuffixTree &ST, + InstructionMapper &Mapper, + const TargetInstrInfo &TII); + + /// \brief Remove any overlapping candidates that weren't handled by the + /// suffix tree's pruning method. + /// + /// Pruning from the suffix tree doesn't necessarily remove all overlaps. + /// If a short candidate is chosen for outlining, then a longer candidate + /// which has that short candidate as a suffix is chosen, the tree's pruning + /// method will not find it. Thus, we need to prune before outlining as well. + /// + /// \param[in,out] CandidateList A list of outlining candidates. + /// \param[in,out] FunctionList A list of functions to be outlined. + /// \param MaxCandidateLen The length of the longest candidate. + /// \param TII TargetInstrInfo for the module. + void pruneOverlaps(std::vector<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + unsigned MaxCandidateLen, + const TargetInstrInfo &TII); + + /// Construct a suffix tree on the instructions in \p M and outline repeated + /// strings from that tree. + bool runOnModule(Module &M) override; +}; + +} // Anonymous namespace. + +char MachineOutliner::ID = 0; + +namespace llvm { +ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); } +} + +INITIALIZE_PASS(MachineOutliner, "machine-outliner", + "Machine Function Outliner", false, false) + +void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + unsigned MaxCandidateLen, + const TargetInstrInfo &TII) { + // TODO: Experiment with interval trees or other interval-checking structures + // to lower the time complexity of this function. + // TODO: Can we do better than the simple greedy choice? + // Check for overlaps in the range. + // This is O(MaxCandidateLen * CandidateList.size()). + for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et; + It++) { + Candidate &C1 = *It; + OutlinedFunction &F1 = FunctionList[C1.FunctionIdx]; + + // If we removed this candidate, skip it. + if (!C1.InCandidateList) + continue; + + // Is it still worth it to outline C1? + if (F1.Benefit < 1 || F1.OccurrenceCount < 2) { + assert(F1.OccurrenceCount > 0 && + "Can't remove OutlinedFunction with no occurrences!"); + F1.OccurrenceCount--; + C1.InCandidateList = false; + continue; + } + + // The minimum start index of any candidate that could overlap with this + // one. + unsigned FarthestPossibleIdx = 0; + + // Either the index is 0, or it's at most MaxCandidateLen indices away. + if (C1.StartIdx > MaxCandidateLen) + FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen; + + // Compare against the candidates in the list that start at at most + // FarthestPossibleIdx indices away from C1. There are at most + // MaxCandidateLen of these. 
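    // For example, with MaxCandidateLen == 10 and C1.StartIdx == 25, only
    // candidates starting at indices 15 through 25 can possibly overlap C1,
    // and the descending sort lets the loop below stop once C2.StartIdx drops
    // below 15.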
+ for (auto Sit = It + 1; Sit != Et; Sit++) { + Candidate &C2 = *Sit; + OutlinedFunction &F2 = FunctionList[C2.FunctionIdx]; + + // Is this candidate too far away to overlap? + if (C2.StartIdx < FarthestPossibleIdx) + break; + + // Did we already remove this candidate in a previous step? + if (!C2.InCandidateList) + continue; + + // Is the function beneficial to outline? + if (F2.OccurrenceCount < 2 || F2.Benefit < 1) { + // If not, remove this candidate and move to the next one. + assert(F2.OccurrenceCount > 0 && + "Can't remove OutlinedFunction with no occurrences!"); + F2.OccurrenceCount--; + C2.InCandidateList = false; + continue; + } + + size_t C2End = C2.StartIdx + C2.Len - 1; + + // Do C1 and C2 overlap? + // + // Not overlapping: + // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices + // + // We sorted our candidate list so C2Start <= C1Start. We know that + // C2End > C2Start since each candidate has length >= 2. Therefore, all we + // have to check is C2End < C2Start to see if we overlap. + if (C2End < C1.StartIdx) + continue; + + // C1 and C2 overlap. + // We need to choose the better of the two. + // + // Approximate this by picking the one which would have saved us the + // most instructions before any pruning. + if (C1.Benefit >= C2.Benefit) { + + // C1 is better, so remove C2 and update C2's OutlinedFunction to + // reflect the removal. + assert(F2.OccurrenceCount > 0 && + "Can't remove OutlinedFunction with no occurrences!"); + F2.OccurrenceCount--; + F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(), + F2.OccurrenceCount, + F2.IsTailCall + ); + + C2.InCandidateList = false; + + DEBUG ( + dbgs() << "- Removed C2. \n"; + dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n"; + dbgs() << "--- C2's benefit: " << F2.Benefit << "\n"; + ); + + } else { + // C2 is better, so remove C1 and update C1's OutlinedFunction to + // reflect the removal. + assert(F1.OccurrenceCount > 0 && + "Can't remove OutlinedFunction with no occurrences!"); + F1.OccurrenceCount--; + F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(), + F1.OccurrenceCount, + F1.IsTailCall + ); + C1.InCandidateList = false; + + DEBUG ( + dbgs() << "- Removed C1. \n"; + dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n"; + dbgs() << "--- C1's benefit: " << F1.Benefit << "\n"; + ); + + // C1 is out, so we don't have to compare it against anyone else. + break; + } + } + } +} + +unsigned +MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + SuffixTree &ST, + InstructionMapper &Mapper, + const TargetInstrInfo &TII) { + + std::vector<unsigned> CandidateSequence; // Current outlining candidate. + size_t MaxCandidateLen = 0; // Length of the longest candidate. + + // Function for maximizing query in the suffix tree. + // This allows us to define more fine-grained types of things to outline in + // the target without putting target-specific info in the suffix tree. + auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr, + size_t StringLen, unsigned EndVal) { + + // The root represents the empty string. + if (Curr.isRoot()) + return 0u; + + // Is this long enough to outline? + // TODO: Let the target decide how "long" a string is in terms of the sizes + // of the instructions in the string. For example, if a call instruction + // is smaller than a one instruction string, we should outline that string. 
+ if (StringLen < 2) + return 0u; + + size_t Occurrences = Curr.OccurrenceCount; + + // Anything we want to outline has to appear at least twice. + if (Occurrences < 2) + return 0u; + + // Check if the last instruction in the sequence is a return. + MachineInstr *LastInstr = + Mapper.IntegerInstructionMap[EndVal]; + assert(LastInstr && "Last instruction in sequence was unmapped!"); + + // The only way a terminator could be mapped as legal is if it was safe to + // tail call. + bool IsTailCall = LastInstr->isTerminator(); + return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall); + }; + + MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn); + + for (auto &OF : FunctionList) + OF.IsTailCall = Mapper. + IntegerInstructionMap[OF.Sequence.back()]->isTerminator(); + + // Sort the candidates in decending order. This will simplify the outlining + // process when we have to remove the candidates from the mapping by + // allowing us to cut them out without keeping track of an offset. + std::stable_sort(CandidateList.begin(), CandidateList.end()); + + return MaxCandidateLen; +} + +MachineFunction * +MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF, + InstructionMapper &Mapper) { + + // Create the function name. This should be unique. For now, just hash the + // module name and include it in the function name plus the number of this + // function. + std::ostringstream NameStream; + NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name; + + // Create the function using an IR-level function. + LLVMContext &C = M.getContext(); + Function *F = dyn_cast<Function>( + M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C))); + assert(F && "Function was null!"); + + // NOTE: If this is linkonceodr, then we can take advantage of linker deduping + // which gives us better results when we outline from linkonceodr functions. + F->setLinkage(GlobalValue::PrivateLinkage); + F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); + IRBuilder<> Builder(EntryBB); + Builder.CreateRetVoid(); + + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); + MachineFunction &MF = MMI.getMachineFunction(*F); + MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + // Insert the new function into the module. + MF.insert(MF.begin(), &MBB); + + TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall); + + // Copy over the instructions for the function using the integer mappings in + // its sequence. + for (unsigned Str : OF.Sequence) { + MachineInstr *NewMI = + MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second); + NewMI->dropMemRefs(); + + // Don't keep debug information for outlined instructions. + // FIXME: This means outlined functions are currently undebuggable. + NewMI->setDebugLoc(DebugLoc()); + MBB.insert(MBB.end(), NewMI); + } + + TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall); + + return &MF; +} + +bool MachineOutliner::outline(Module &M, + const ArrayRef<Candidate> &CandidateList, + std::vector<OutlinedFunction> &FunctionList, + InstructionMapper &Mapper) { + + bool OutlinedSomething = false; + + // Replace the candidates with calls to their respective outlined functions. + for (const Candidate &C : CandidateList) { + + // Was the candidate removed during pruneOverlaps? + if (!C.InCandidateList) + continue; + + // If not, then look at its OutlinedFunction. 
+ OutlinedFunction &OF = FunctionList[C.FunctionIdx]; + + // Was its OutlinedFunction made unbeneficial during pruneOverlaps? + if (OF.OccurrenceCount < 2 || OF.Benefit < 1) + continue; + + // If not, then outline it. + assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent(); + MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx]; + unsigned EndIdx = C.StartIdx + C.Len - 1; + + assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; + assert(EndIt != MBB->end() && "EndIt out of bounds!"); + + EndIt++; // Erase needs one past the end index. + + // Does this candidate have a function yet? + if (!OF.MF) { + OF.MF = createOutlinedFunction(M, OF, Mapper); + FunctionsCreated++; + } + + MachineFunction *MF = OF.MF; + const TargetSubtargetInfo &STI = MF->getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + // Insert a call to the new function and erase the old sequence. + TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall); + StartIt = Mapper.InstrList[C.StartIdx]; + MBB->erase(StartIt, EndIt); + + OutlinedSomething = true; + + // Statistics. + NumOutlined++; + } + + DEBUG ( + dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n"; + ); + + return OutlinedSomething; +} + +bool MachineOutliner::runOnModule(Module &M) { + + // Is there anything in the module at all? + if (M.empty()) + return false; + + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); + const TargetSubtargetInfo &STI = MMI.getMachineFunction(*M.begin()) + .getSubtarget(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + + InstructionMapper Mapper; + + // Build instruction mappings for each function in the module. + for (Function &F : M) { + MachineFunction &MF = MMI.getMachineFunction(F); + + // Is the function empty? Safe to outline from? + if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF)) + continue; + + // If it is, look at each MachineBasicBlock in the function. + for (MachineBasicBlock &MBB : MF) { + + // Is there anything in MBB? + if (MBB.empty()) + continue; + + // If yes, map it. + Mapper.convertToUnsignedVec(MBB, *TRI, *TII); + } + } + + // Construct a suffix tree, use it to find candidates, and then outline them. + SuffixTree ST(Mapper.UnsignedVec); + std::vector<Candidate> CandidateList; + std::vector<OutlinedFunction> FunctionList; + + // Find all of the outlining candidates. + unsigned MaxCandidateLen = + buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII); + + // Remove candidates that overlap with other candidates. + pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII); + + // Outline each of the candidates and return true if something was outlined. + return outline(M, CandidateList, FunctionList, Mapper); +} diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 43a18099d39aa..d06c38cf4ed81 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -552,7 +552,9 @@ public: os << "\n"; } - void dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; /// This class repesents the scheduled code. The main data structure is a @@ -593,7 +595,7 @@ private: /// Virtual register information. 
MachineRegisterInfo &MRI; - DFAPacketizer *Resources; + std::unique_ptr<DFAPacketizer> Resources; public: SMSchedule(MachineFunction *mf) @@ -604,13 +606,6 @@ public: InitiationInterval = 0; } - ~SMSchedule() { - ScheduledInstrs.clear(); - InstrToCycle.clear(); - RegToStageDiff.clear(); - delete Resources; - } - void reset() { ScheduledInstrs.clear(); InstrToCycle.clear(); @@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { return false; if (mf.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + AttributeList::FunctionIndex, Attribute::OptimizeForSize) && !EnableSWPOptSize.getPosition()) return false; @@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) if (Phi.getOperand(i + 1).getMBB() != Loop) InitVal = Phi.getOperand(i).getReg(); - else if (Phi.getOperand(i + 1).getMBB() == Loop) + else LoopVal = Phi.getOperand(i).getReg(); assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); @@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis( MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, bool IsLast) { - // Compute the stage number for the inital value of the Phi, which + // Compute the stage number for the initial value of the Phi, which // comes from the prolog. The prolog to use depends on to which kernel/ // epilog that we're adding the Phi. unsigned PrologStage = 0; @@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep, // increment value to determine if the accesses may be loop carried. if (OffsetS >= OffsetD) return OffsetS + AccessSizeS > DeltaS; - else if (OffsetS < OffsetD) + else return OffsetD + AccessSizeD > DeltaD; return true; @@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Utility function used for debugging to print the schedule. 
-void SMSchedule::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); } +#endif diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp index fc32183c7f639..71ad4e6aa7f52 100644 --- a/lib/CodeGen/MachineRegionInfo.cpp +++ b/lib/CodeGen/MachineRegionInfo.cpp @@ -1,10 +1,9 @@ - #include "llvm/CodeGen/MachineRegionInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" -#define DEBUG_TYPE "region" +#define DEBUG_TYPE "machine-region-info" using namespace llvm; @@ -86,6 +85,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { auto DF = &getAnalysis<MachineDominanceFrontier>(); RI.recalculate(F, DT, PDT, DF); + + DEBUG(RI.dump()); + return false; } @@ -103,9 +105,10 @@ void MachineRegionInfoPass::verifyAnalysis() const { void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequiredTransitive<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<DominanceFrontierWrapperPass>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + MachineFunctionPass::getAnalysisUsage(AU); } void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const { @@ -119,14 +122,15 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; +char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; -INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) -INITIALIZE_PASS_END(MachineRegionInfoPass, "regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) // Create methods available outside of this file, to use them // "include/llvm/LinkAllPasses.h". 
Otherwise the pass would be deleted by diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 242cb0b809536..128910f8eb2aa 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===// +//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,13 +11,27 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/Support/raw_os_ostream.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> using namespace llvm; @@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden, void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) - : MF(MF), TheDelegate(nullptr), - TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && - EnableSubRegLiveness) { + : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && + EnableSubRegLiveness), + IsUpdatedCSRsInitialized(false) { unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); @@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { return TRC.getLaneMask(); } -#ifndef NDEBUG -void MachineRegisterInfo::dumpUses(unsigned Reg) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const { for (MachineInstr &I : use_instructions(Reg)) I.dump(); } @@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const { } return false; } + +void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) { + + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + assert(Reg && (Reg < TRI->getNumRegs()) && + "Trying to disable an invalid register"); + + if (!IsUpdatedCSRsInitialized) { + const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); + for (const MCPhysReg *I = CSR; *I; ++I) + UpdatedCSRs.push_back(*I); + + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + + IsUpdatedCSRsInitialized = true; + } + + // Remove the register (and its aliases from the list). 
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI), + UpdatedCSRs.end()); +} + +const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const { + if (IsUpdatedCSRsInitialized) + return UpdatedCSRs.data(); + + return getTargetRegisterInfo()->getCalleeSavedRegs(MF); +} + +void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { + if (IsUpdatedCSRsInitialized) + UpdatedCSRs.clear(); + + for (MCPhysReg Reg : CSRs) + UpdatedCSRs.push_back(Reg); + + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + IsUpdatedCSRsInitialized = true; +} diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index e06bc517fa916..41e161f71e532 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -12,30 +12,67 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/PriorityQueue.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <string> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; #define DEBUG_TYPE "misched" namespace llvm { + cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, cl::desc("Force top-down list scheduling")); cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, @@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, cl::opt<bool> DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); -} + +} // end namespace llvm #ifndef NDEBUG static cl::opt<bool> 
ViewMISchedDAGs("view-misched-dags", cl::Hidden, @@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, cl::desc("Enable memop clustering."), cl::init(true)); -// Experimental heuristics -static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); @@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8; // Pin the vtables to this file. void MachineSchedStrategy::anchor() {} + void ScheduleDAGMutation::anchor() {} //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// -MachineSchedContext::MachineSchedContext(): - MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) { +MachineSchedContext::MachineSchedContext() { RegClassInfo = new RegisterClassInfo(); } @@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() { } namespace { + /// Base class for a machine scheduler class that can run at any point. class MachineSchedulerBase : public MachineSchedContext, public MachineFunctionPass { @@ -149,7 +184,8 @@ public: protected: ScheduleDAGInstrs *createPostMachineScheduler(); }; -} // namespace + +} // end anonymous namespace char MachineScheduler::ID = 0; @@ -158,6 +194,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID; INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", @@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { /// MachineSchedOpt allows command line selection of the scheduler. static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, - RegisterPassParser<MachineSchedRegistry> > + RegisterPassParser<MachineSchedRegistry>> MachineSchedOpt("misched", cl::init(&useDefaultMachineSched), cl::Hidden, cl::desc("Machine instruction scheduler to use")); @@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // instruction stream until we find the nearest boundary. unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; - for (;I != MBB->begin(); --I) { + for (; I != MBB->begin(); --I) { MachineInstr &MI = *std::prev(I); if (isSchedBoundary(&MI, &*MBB, MF, TII)) break; @@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { // unimplemented } -LLVM_DUMP_METHOD -void ReadyQueue::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ReadyQueue::dump() { dbgs() << "Queue " << Name << ": "; for (unsigned i = 0, e = Queue.size(); i < e; ++i) dbgs() << Queue[i]->NodeNum << " "; dbgs() << "\n"; } +#endif //===----------------------------------------------------------------------===// // ScheduleDAGMI - Basic machine instruction scheduling. This is @@ -519,8 +557,7 @@ void ReadyQueue::dump() { // ===----------------------------------------------------------------------===/ // Provide a vtable anchor. 
-ScheduleDAGMI::~ScheduleDAGMI() { -} +ScheduleDAGMI::~ScheduleDAGMI() = default; bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); @@ -825,7 +862,7 @@ void ScheduleDAGMI::placeDebugValues() { RegionBegin = FirstDbgValue; } - for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator + for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI); MachineInstr *DbgValue = P.first; @@ -841,7 +878,7 @@ void ScheduleDAGMI::placeDebugValues() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void ScheduleDAGMI::dumpSchedule() const { +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { if (SUnit *SU = getSUnit(&(*MI))) SU->dump(this); @@ -1012,7 +1049,7 @@ updateScheduledPressure(const SUnit *SU, ++CritIdx; if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() - && NewMaxPressure[ID] <= INT16_MAX) + && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max()) RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); } unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); @@ -1136,6 +1173,12 @@ void ScheduleDAGMILive::schedule() { dbgs() << " Pressure Diff : "; getPressureDiff(&SU).dump(*TRI); } + dbgs() << " Single Issue : "; + if (SchedModel.mustBeginGroup(SU.getInstr()) && + SchedModel.mustEndGroup(SU.getInstr())) + dbgs() << "true;"; + else + dbgs() << "false;"; dbgs() << '\n'; } if (ExitSU.getInstr() != nullptr) @@ -1396,6 +1439,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create cluster edges between neighboring /// loads or between neighboring stores. class BaseMemOpClusterMutation : public ScheduleDAGMutation { @@ -1403,6 +1447,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation { SUnit *SU; unsigned BaseReg; int64_t Offset; + MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) : SU(su), BaseReg(reg), Offset(ofs) {} @@ -1439,25 +1484,26 @@ public: LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri) : BaseMemOpClusterMutation(tii, tri, true) {} }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI) : nullptr; } std::unique_ptr<ScheduleDAGMutation> createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI) : nullptr; } -} // namespace llvm +} // end namespace llvm void BaseMemOpClusterMutation::clusterNeighboringMemOps( ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { @@ -1543,80 +1589,11 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { } //===----------------------------------------------------------------------===// -// MacroFusion - DAG post-processing to encourage fusion of macro ops. 
-//===----------------------------------------------------------------------===// - -namespace { -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo &TII; -public: - MacroFusion(const TargetInstrInfo &TII) - : TII(TII) {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; -} // anonymous - -namespace llvm { - -std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII) { - return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr; -} - -} // namespace llvm - -/// \brief Callback from DAG postProcessing to create cluster edges to encourage -/// fused operations. -void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!TII.shouldScheduleAdjacent(Pred, *Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - } - for (SDep &SuccDep : SU.Succs) { - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - } - - DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); - break; - } -} - -//===----------------------------------------------------------------------===// // CopyConstrain - DAG post-processing to encourage copy elimination. //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create weak edges from all uses of a copy to /// the one use that defines the copy's source vreg, most likely an induction /// variable increment. @@ -1626,6 +1603,7 @@ class CopyConstrain : public ScheduleDAGMutation { // RegionEndIdx is the slot index of the last non-debug instruction in the // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. 
SlotIndex RegionEndIdx; + public: CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} @@ -1634,17 +1612,18 @@ public: protected: void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createCopyConstrainDAGMutation(const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) { - return make_unique<CopyConstrain>(TII, TRI); + const TargetRegisterInfo *TRI) { + return llvm::make_unique<CopyConstrain>(TII, TRI); } -} // namespace llvm +} // end namespace llvm /// constrainLocalCopy handles two possibilities: /// 1) Local src: @@ -1836,7 +1815,7 @@ void SchedBoundary::reset() { CheckPending = false; CurrCycle = 0; CurrMOps = 0; - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); ExpectedLatency = 0; DependentLatency = 0; RetiredMOps = 0; @@ -1937,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) { && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { return true; } + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; } + + if (CurrMOps > 0 && + ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { + DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " + << (isTop()? "begin" : "end") << " group\n"); + return true; + } + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { const MCSchedClassDesc *SC = DAG->getSchedClass(SU); for (TargetSchedModel::ProcResIter @@ -2039,7 +2028,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { /// Move the boundary of scheduled code by one cycle. void SchedBoundary::bumpCycle(unsigned NextCycle) { if (SchedModel->getMicroOpBufferSize() == 0) { - assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); + assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && + "MinReadyCycle uninitialized"); if (MinReadyCycle > NextCycle) NextCycle = MinReadyCycle; } @@ -2237,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) { // one cycle. Since we commonly reach the max MOps here, opportunistically // bump the cycle to avoid uselessly checking everything in the readyQ. CurrMOps += IncMOps; + + // Bump the cycle count for issue group constraints. + // This must be done after NextCycle has been adjust for all other stalls. + // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set + // currCycle to X. + if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { + DEBUG(dbgs() << " Bump cycle to " + << (isTop() ? "end" : "begin") << " group\n"); + bumpCycle(++NextCycle); + } + while (CurrMOps >= SchedModel->getIssueWidth()) { DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " << CurrCycle << '\n'); @@ -2250,7 +2252,7 @@ void SchedBoundary::bumpNode(SUnit *SU) { void SchedBoundary::releasePending() { // If the available queue is empty, it is safe to reset MinReadyCycle. if (Available.empty()) - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); // Check to see if any of the pending instructions are ready to issue. If // so, add them to the available queue. 
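The new mustBeginGroup/mustEndGroup logic above affects SchedBoundary in two places: checkHazard() refuses to place a group-opening instruction (top-down) or group-closing instruction (bottom-up) into a cycle that already has pending micro-ops, and bumpNode() advances the cycle once such an instruction has issued. The sketch below mirrors that bookkeeping with simplified stand-in types; Instr, Boundary, and issue() are illustrative names, not the LLVM SUnit/TargetSchedModel/SchedBoundary API.

#include <cstdio>

// Simplified stand-ins; each instruction carries its micro-op count and
// whether it must open or close an issue group.
struct Instr {
  unsigned MicroOps;
  bool MustBeginGroup;
  bool MustEndGroup;
};

struct Boundary {
  bool IsTop;           // top-down (true) or bottom-up (false) zone
  unsigned IssueWidth;  // micro-ops per cycle
  unsigned CurrCycle;
  unsigned CurrMOps;    // micro-ops already issued in CurrCycle

  // Mirrors the hazard test: a "begin group" instruction scheduled top-down
  // (or an "end group" instruction scheduled bottom-up) cannot share a
  // partially filled cycle.
  bool checkHazard(const Instr &I) const {
    if (CurrMOps > 0 && CurrMOps + I.MicroOps > IssueWidth)
      return true;
    if (CurrMOps > 0 &&
        ((IsTop && I.MustBeginGroup) || (!IsTop && I.MustEndGroup)))
      return true;
    return false;
  }

  void bumpCycle() {
    ++CurrCycle;
    CurrMOps = 0;
  }

  // Mirrors bumpNode: after issuing, a group-closing instruction (top-down)
  // or group-opening instruction (bottom-up) forces the next cycle, as does
  // filling the issue width.
  void issue(const Instr &I) {
    CurrMOps += I.MicroOps;
    if ((IsTop && I.MustEndGroup) || (!IsTop && I.MustBeginGroup))
      bumpCycle();
    else if (CurrMOps >= IssueWidth)
      bumpCycle();
  }
};

int main() {
  Boundary B{/*IsTop=*/true, /*IssueWidth=*/4, /*CurrCycle=*/0, /*CurrMOps=*/0};
  Instr Add{1, false, false};
  Instr GroupStart{1, true, false};

  B.issue(Add);                                           // cycle 0, 1 mop pending
  std::printf("hazard=%d\n", B.checkHazard(GroupStart));  // 1: needs a fresh cycle
  B.bumpCycle();
  std::printf("hazard=%d\n", B.checkHazard(GroupStart));  // 0: cycle is empty
  return 0;
}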
@@ -2323,10 +2325,10 @@ SUnit *SchedBoundary::pickOnlyChoice() { return nullptr; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // This is useful information to dump after bumpNode. // Note that the Queue contents are more useful before pickNodeFromQueue. -void SchedBoundary::dumpScheduledState() { +LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() { unsigned ResFactor; unsigned ResCount; if (ZoneCritResIdx) { @@ -2666,11 +2668,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } void GenericScheduler::dumpPolicy() { + // Cannot completely remove virtual function even in release mode. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dbgs() << "GenericScheduler RegionPolicy: " << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure << " OnlyTopDown=" << RegionPolicy.OnlyTopDown << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp << "\n"; +#endif } /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic @@ -2724,7 +2729,7 @@ void GenericScheduler::registerRoots() { errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; } - if (EnableCyclicPath) { + if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) { Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); checkAcyclicLatency(); } @@ -3106,7 +3111,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { } void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { - MachineBasicBlock::iterator InsertPos = SU->getInstr(); if (!isTop) ++InsertPos; @@ -3154,7 +3158,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// Create the standard converging machine scheduler. This will be used as the /// default scheduler if the target does not set a default. ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)); + ScheduleDAGMILive *DAG = + new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)); // Register DAG post-processors. // // FIXME: extend the mutation API to allow earlier mutations to instantiate @@ -3195,7 +3200,6 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { } } - void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); @@ -3302,7 +3306,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), + return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C), /*RemoveKillFlags=*/true); } @@ -3311,14 +3315,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { //===----------------------------------------------------------------------===// namespace { + /// \brief Order nodes by the ILP metric. struct ILPOrder { - const SchedDFSResult *DFSResult; - const BitVector *ScheduledTrees; + const SchedDFSResult *DFSResult = nullptr; + const BitVector *ScheduledTrees = nullptr; bool MaximizeILP; - ILPOrder(bool MaxILP) - : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {} + ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {} /// \brief Apply a less-than relation on node priority. /// @@ -3347,12 +3351,13 @@ struct ILPOrder { /// \brief Schedule based on the ILP metric. 
class ILPScheduler : public MachineSchedStrategy { - ScheduleDAGMILive *DAG; + ScheduleDAGMILive *DAG = nullptr; ILPOrder Cmp; std::vector<SUnit*> ReadyQ; + public: - ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {} + ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {} void initialize(ScheduleDAGMI *dag) override { assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); @@ -3405,14 +3410,16 @@ public: std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true)); } static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false)); } + static MachineSchedRegistry ILPMaxRegistry( "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); static MachineSchedRegistry ILPMinRegistry( @@ -3424,6 +3431,7 @@ static MachineSchedRegistry ILPMinRegistry( #ifndef NDEBUG namespace { + /// Apply a less-than relation on the node order, which corresponds to the /// instruction order prior to scheduling. IsReverse implements greater-than. template<bool IsReverse> @@ -3444,11 +3452,12 @@ class InstructionShuffler : public MachineSchedStrategy { // Using a less-than relation (SUnitOrder<false>) for the TopQ priority // gives nodes with a higher number higher priority causing the latest // instructions to be scheduled first. - PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>> TopQ; // When scheduling bottom-up, use greater-than as the queue priority. 
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>> BottomQ; + public: InstructionShuffler(bool alternate, bool topdown) : IsAlternating(alternate), IsTopDown(topdown) {} @@ -3492,15 +3501,18 @@ public: BottomQ.push(SU); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { bool Alternate = !ForceTopDown && !ForceBottomUp; bool TopDown = !ForceBottomUp; assert((TopDown || !ForceTopDown) && "-misched-topdown incompatible with -misched-bottomup"); - return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown)); + return new ScheduleDAGMILive( + C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown)); } + static MachineSchedRegistry ShufflerRegistry( "shuffle", "Shuffle machine instructions alternating directions", createInstructionShuffler); @@ -3518,8 +3530,7 @@ template<> struct GraphTraits< template<> struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const ScheduleDAG *G) { return G->MF.getName(); @@ -3576,7 +3587,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { return Str; } }; -} // namespace llvm + +} // end namespace llvm #endif // NDEBUG /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index ef7e525e8165d..998a9645e68bf 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -1,4 +1,4 @@ -//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===// +//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,21 +7,35 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <tuple> +#include <utility> using namespace llvm; @@ -37,9 +51,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineTraceMetrics, "machine-trace-metrics", "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() - : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr), - 
MRI(nullptr), Loops(nullptr) { +MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); } @@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const { return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds); } - //===----------------------------------------------------------------------===// // Ensemble utility functions //===----------------------------------------------------------------------===// @@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct) } // Virtual destructor serves as an anchor. -MachineTraceMetrics::Ensemble::~Ensemble() {} +MachineTraceMetrics::Ensemble::~Ensemble() = default; const MachineLoop* MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { @@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) { // MinInstrCountEnsemble - Pick the trace that executes the least number of // instructions. namespace { + class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble { const char *getName() const override { return "MinInstr"; } const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override; @@ -306,7 +318,8 @@ public: MinInstrCountEnsemble(MachineTraceMetrics *mtm) : MachineTraceMetrics::Ensemble(mtm) {} }; -} + +} // end anonymous namespace // Select the preferred predecessor for MBB. const MachineBasicBlock* @@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const { // revisit blocks. namespace { + struct LoopBounds { MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks; SmallPtrSet<const MachineBasicBlock*, 8> Visited; const MachineLoopInfo *Loops; - bool Downward; + bool Downward = false; + LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks, - const MachineLoopInfo *loops) - : Blocks(blocks), Loops(loops), Downward(false) {} + const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {} }; -} + +} // end anonymous namespace // Specialize po_iterator_storage in order to prune the post-order traversal so // it is limited to the current loop and doesn't traverse the loop back edges. namespace llvm { + template<> class po_iterator_storage<LoopBounds, true> { LoopBounds &LB; + public: po_iterator_storage(LoopBounds &lb) : LB(lb) {} + void finishPostorder(const MachineBasicBlock*) {} bool insertEdge(Optional<const MachineBasicBlock *> From, @@ -452,7 +470,8 @@ public: return LB.Visited.insert(To).second; } }; -} + +} // end namespace llvm /// Compute the trace through MBB. void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { @@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const { // A data dependency is represented as a defining MI and operand numbers on the // defining and using MI. namespace { + struct DataDep { const MachineInstr *DefMI; unsigned DefOp; @@ -622,7 +642,8 @@ struct DataDep { assert((++DefI).atEnd() && "Register has multiple defs"); } }; -} + +} // end anonymous namespace // Get the input data dependencies that must be ready before UseMI can issue. // Return true if UseMI has any physreg operands. @@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI, // direction instructions are scanned, it could be the operand that defined the // regunit, or the highest operand to read the regunit. 
namespace { + struct LiveRegUnit { unsigned RegUnit; - unsigned Cycle; - const MachineInstr *MI; - unsigned Op; + unsigned Cycle = 0; + const MachineInstr *MI = nullptr; + unsigned Op = 0; unsigned getSparseSetIndex() const { return RegUnit; } - LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {} + LiveRegUnit(unsigned RU) : RegUnit(RU) {} }; -} + +} // end anonymous namespace // Identify physreg dependencies for UseMI, and update the live regunit // tracking set when scanning instructions downwards. @@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, return Height; } - typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap; // Push the height of DefMI upwards if required to match UseMI. diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index a98139f9e5af3..d392c044bd716 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -260,8 +260,8 @@ namespace { static char ID; // Pass ID, replacement for typeid const std::string Banner; - MachineVerifierPass(const std::string &banner = nullptr) - : MachineFunctionPass(ID), Banner(banner) { + MachineVerifierPass(std::string banner = std::string()) + : MachineFunctionPass(ID), Banner(std::move(banner)) { initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); } @@ -528,7 +528,8 @@ void MachineVerifier::visitMachineFunctionBefore() { lastIndex = SlotIndex(); regsReserved = MRI->getReservedRegs(); - markReachable(&MF->front()); + if (!MF->empty()) + markReachable(&MF->front()); // Build a set of the basic blocks in the function. FunctionBlocks.clear(); @@ -548,7 +549,8 @@ void MachineVerifier::visitMachineFunctionBefore() { // Check that the register use lists are sane. MRI->verifyUseLists(); - verifyStackFrame(); + if (!MF->empty()) + verifyStackFrame(); } // Does iterator point to a and b as the first two elements? @@ -572,7 +574,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && MBB->getIterator() != MBB->getParent()->begin()) { - report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); + report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); } } } @@ -908,6 +910,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } } + // Generic loads and stores must have a single MachineMemOperand + // describing that access. + if ((MI->getOpcode() == TargetOpcode::G_LOAD || + MI->getOpcode() == TargetOpcode::G_STORE) && + !MI->hasOneMemOperand()) + report("Generic instruction accessing memory must have one mem operand", + MI); + StringRef ErrorInfo; if (!TII->verifyInstruction(*MI, ErrorInfo)) report(ErrorInfo.data(), MI); @@ -2047,23 +2057,14 @@ void MachineVerifier::verifyStackFrame() { // Update stack state by checking contents of MBB. for (const auto &I : *MBB) { if (I.getOpcode() == FrameSetupOpcode) { - // The first operand of a FrameOpcode should be i32. - int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - if (BBState.ExitIsSetup) report("FrameSetup is after another FrameSetup", &I); - BBState.ExitValue -= Size; + BBState.ExitValue -= TII->getFrameSize(I); BBState.ExitIsSetup = true; } if (I.getOpcode() == FrameDestroyOpcode) { - // The first operand of a FrameOpcode should be i32. 
- int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - + int Size = TII->getFrameSize(I); if (!BBState.ExitIsSetup) report("FrameDestroy is not after a FrameSetup", &I); int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue : diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp index ad9166f1ed23b..00e72971a01e8 100644 --- a/lib/CodeGen/PatchableFunction.cpp +++ b/lib/CodeGen/PatchableFunction.cpp @@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) { .addImm(FirstActualI->getOpcode()); for (auto &MO : FirstActualI->operands()) - MIB.addOperand(MO); + MIB.add(MO); FirstActualI->eraseFromParent(); MF.ensureAlignment(4); diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 6081916a6a829..61dccdde8f1dc 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dumpSchedule - dump the scheduled Sequence. -void SchedulePostRATDList::dumpSchedule() const { +LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const { for (unsigned i = 0, e = Sequence.size(); i != e; i++) { if (SUnit *SU = Sequence[i]) SU->dump(this); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 5fca7fa5536b5..1354009794cb4 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -265,11 +265,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) { std::vector<MachineBasicBlock::iterator> FrameSDOps; for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" - " instructions should have a single immediate argument!"); - unsigned Size = I->getOperand(0).getImm(); + if (TII.isFrameInstr(*I)) { + unsigned Size = TII.getFrameSize(*I); if (Size > MaxCallFrameSize) MaxCallFrameSize = Size; AdjustsStack = true; FrameSDOps.push_back(I); @@ -336,7 +333,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, return; const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); + const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs(); std::vector<CalleeSavedInfo> CSI; for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1049,8 +1046,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); - unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); - unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); if (RS && FrameIndexEliminationScavenging) RS->enterBasicBlock(*BB); @@ -1059,11 +1054,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + if (TII.isFrameInstr(*I)) { + InsideCallSequence = TII.isFrameSetup(*I); SPAdj += 
TII.getSPAdjust(*I); - I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); continue; } @@ -1237,4 +1230,6 @@ doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) { ++I; } } + + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); } diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp index 804a4c3dad669..b29e62bf1aa3c 100644 --- a/lib/CodeGen/PseudoSourceValue.cpp +++ b/lib/CodeGen/PseudoSourceValue.cpp @@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {} PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { - O << PSVNames[Kind]; + if (Kind < TargetCustom) + O << PSVNames[Kind]; + else + O << "TargetCustom" << Kind; } bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index a558e371ad4c6..a87fed3a687e1 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -176,8 +176,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); Q.collectInterferingVRegs(); - if (Q.seenUnspillableVReg()) - return false; for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; if (!Intf->isSpillable() || Intf->weight > VirtReg.weight) diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index c47cfb1b986fb..06500289c971a 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -29,8 +29,10 @@ #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" @@ -125,6 +127,7 @@ class RAGreedy : public MachineFunctionPass, MachineBlockFrequencyInfo *MBFI; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; + MachineOptimizationRemarkEmitter *ORE; EdgeBundles *Bundles; SpillPlacement *SpillPlacer; LiveDebugVariables *DebugVars; @@ -419,6 +422,20 @@ private: void collectHintInfo(unsigned, HintsInfo &); bool isUnusedCalleeSavedReg(unsigned PhysReg) const; + + /// Compute and report the number of spills and reloads for a loop. + void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, + unsigned &FoldedReloads, unsigned &Spills, + unsigned &FoldedSpills); + + /// Report the number of spills and reloads for each loop. 
+ void reportNumberOfSplillsReloads() { + for (MachineLoop *L : *Loops) { + unsigned Reloads, FoldedReloads, Spills, FoldedSpills; + reportNumberOfSplillsReloads(L, Reloads, FoldedReloads, Spills, + FoldedSpills); + } + } }; } // end anonymous namespace @@ -439,6 +456,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(SpillPlacement) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -490,6 +508,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<LiveRegMatrix>(); AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); + AU.addRequired<MachineOptimizationRemarkEmitterPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -679,7 +698,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) { MCRegUnitIterator Units(PhysReg, TRI); for (; Units.isValid(); ++Units) { // Instantiate a "subquery", not to be confused with the Queries array. - LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]); + LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]); if (subQ.checkInterference()) break; } @@ -830,7 +849,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg, SmallVector<LiveInterval*, 8> Intfs; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - assert(Q.seenAllInterferences() && "Didn't check all interfererences."); + // We usually have the interfering VRegs cached so collectInterferingVRegs() + // should be fast, we may need to recalculate if when different physregs + // overlap the same register unit so we had different SubRanges queried + // against it. + Q.collectInterferingVRegs(); ArrayRef<LiveInterval*> IVR = Q.interferingVRegs(); Intfs.append(IVR.begin(), IVR.end()); } @@ -2611,6 +2634,69 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return 0; } +void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, + unsigned &FoldedReloads, + unsigned &Spills, + unsigned &FoldedSpills) { + Reloads = 0; + FoldedReloads = 0; + Spills = 0; + FoldedSpills = 0; + + // Sum up the spill and reloads in subloops. + for (MachineLoop *SubLoop : *L) { + unsigned SubReloads; + unsigned SubFoldedReloads; + unsigned SubSpills; + unsigned SubFoldedSpills; + + reportNumberOfSplillsReloads(SubLoop, SubReloads, SubFoldedReloads, + SubSpills, SubFoldedSpills); + Reloads += SubReloads; + FoldedReloads += SubFoldedReloads; + Spills += SubSpills; + FoldedSpills += SubFoldedSpills; + } + + const MachineFrameInfo &MFI = MF->getFrameInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + int FI; + + for (MachineBasicBlock *MBB : L->getBlocks()) + // Handle blocks that were not included in subloops. 
+ if (Loops->getLoopFor(MBB) == L) + for (MachineInstr &MI : *MBB) { + const MachineMemOperand *MMO; + + if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) + ++Reloads; + else if (TII->hasLoadFromStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedReloads; + else if (TII->isStoreToStackSlot(MI, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++Spills; + else if (TII->hasStoreToStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedSpills; + } + + if (Reloads || FoldedReloads || Spills || FoldedSpills) { + using namespace ore; + MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload", + L->getStartLoc(), L->getHeader()); + if (Spills) + R << NV("NumSpills", Spills) << " spills "; + if (FoldedSpills) + R << NV("NumFoldedSpills", FoldedSpills) << " folded spills "; + if (Reloads) + R << NV("NumReloads", Reloads) << " reloads "; + if (FoldedReloads) + R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads "; + ORE->emit(R << "generated in loop"); + } +} + bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n" << "********** Function: " << mf.getName() << '\n'); @@ -2633,6 +2719,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Indexes = &getAnalysis<SlotIndexes>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); DomTree = &getAnalysis<MachineDominatorTree>(); + ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); Loops = &getAnalysis<MachineLoopInfo>(); Bundles = &getAnalysis<EdgeBundles>(); @@ -2658,6 +2745,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { allocatePhysRegs(); tryHintsRecoloring(); postOptimization(); + reportNumberOfSplillsReloads(); releaseMemory(); return true; diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 101b30bf3b657..3b5964eef55e4 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -1,4 +1,4 @@ -//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===// +//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===// // // The LLVM Compiler Infrastructure // @@ -29,34 +29,61 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegAllocPBQP.h" #include "RegisterCoalescer.h" #include "Spiller.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PBQP/Graph.h" +#include "llvm/CodeGen/PBQP/Solution.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" +#include "llvm/CodeGen/RegAllocPBQP.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" +#include 
"llvm/IR/Function.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef> #include <limits> +#include <map> #include <memory> #include <queue> #include <set> #include <sstream> +#include <string> +#include <system_error> +#include <tuple> #include <vector> +#include <utility> using namespace llvm; @@ -86,7 +113,6 @@ namespace { /// Programming problems. class RegAllocPBQP : public MachineFunctionPass { public: - static char ID; /// Construct a PBQP register allocator. @@ -113,7 +139,6 @@ public: } private: - typedef std::map<const LiveInterval*, unsigned> LI2NodeMap; typedef std::vector<const LiveInterval*> Node2LIMap; typedef std::vector<unsigned> AllowedSet; @@ -187,7 +212,6 @@ public: /// @brief Add interference edges between overlapping vregs. class Interference : public PBQPRAConstraint { private: - typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr; typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey; typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache; @@ -276,7 +300,6 @@ private: } public: - void apply(PBQPRAGraph &G) override { // The following is loosely based on the linear scan algorithm introduced in // "Linear Scan Register Allocation" by Poletto and Sarkar. This version @@ -363,7 +386,6 @@ public: } private: - // Create an Interference edge and add it to the graph, unless it is // a null matrix, meaning the nodes' allowed registers do not have any // interference. This case occurs frequently between integer and floating @@ -372,7 +394,6 @@ private: bool createInterferenceEdge(PBQPRAGraph &G, PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId, IMatrixCache &C) { - const TargetRegisterInfo &TRI = *G.getMetadata().MF.getSubtarget().getRegisterInfo(); const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs(); @@ -409,7 +430,6 @@ private: } }; - class Coalescing : public PBQPRAConstraint { public: void apply(PBQPRAGraph &G) override { @@ -421,7 +441,6 @@ public: // gives the Ok. for (const auto &MBB : MF) { for (const auto &MI : MBB) { - // Skip not-coalescable or already coalesced copies. if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg()) continue; @@ -479,7 +498,6 @@ public: } private: - void addVirtRegCoalesce( PBQPRAGraph::RawMatrix &CostMat, const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1, @@ -496,14 +514,15 @@ private: } } } - }; -} // End anonymous namespace. +} // end anonymous namespace // Out-of-line destructor/anchor for PBQPRAConstraint. 
-PBQPRAConstraint::~PBQPRAConstraint() {} +PBQPRAConstraint::~PBQPRAConstraint() = default; + void PBQPRAConstraint::anchor() {} + void PBQPRAConstraintList::anchor() {} void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { @@ -554,7 +573,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF, static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI, const MachineFunction &MF) { - const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); + const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs(); for (unsigned i = 0; CSR[i] != 0; ++i) if (TRI.regsOverlap(reg, CSR[i])) return true; @@ -777,7 +796,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // If there are non-empty intervals allocate them using pbqp. if (!VRegsToAlloc.empty()) { - const TargetSubtargetInfo &Subtarget = MF.getSubtarget(); std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot = llvm::make_unique<PBQPRAConstraintList>(); @@ -840,7 +858,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, }); } -void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { const Vector &Costs = getNodeCosts(NId); assert(Costs.getLength() != 0 && "Empty vector in graph."); @@ -861,7 +880,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { } } -LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); } +LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { + dump(dbgs()); +} +#endif void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const { OS << "graph {\n"; diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp index ece44c28e9ede..855aa37ff3c36 100644 --- a/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/lib/CodeGen/RegUsageInfoCollector.cpp @@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Clobbered Registers: "); - for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) - if (MRI->isPhysRegModified(PReg, true)) - RegMask[PReg / 32] &= ~(1u << PReg % 32); + const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask(); + auto SetRegAsDefined = [&RegMask] (unsigned Reg) { + RegMask[Reg / 32] &= ~(1u << Reg % 32); + }; + // Scan all the physical registers. When a register is defined in the current + // function set it and all the aliasing registers as defined in the regmask. + for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) { + // If a register is in the UsedPhysRegsMask set then mark it as defined. + // All it's aliases will also be in the set, so we can skip setting + // as defined all the aliases here. + if (UsedPhysRegsMask.test(PReg)) { + SetRegAsDefined(PReg); + continue; + } + // If a register is defined by an instruction mark it as defined together + // with all it's aliases. 
+ if (!MRI->def_empty(PReg)) { + for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) + SetRegAsDefined(*AI); + } + } if (!TargetFrameLowering::isSafeForNoCSROpt(F)) { const uint32_t *CallPreservedMask = diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 178fa18ac5a60..82a3bd9a0bd17 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -1,4 +1,4 @@ -//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===// +//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===// // // The LLVM Compiler Infrastructure // @@ -14,12 +14,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; @@ -29,8 +39,7 @@ static cl::opt<unsigned> StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"), cl::desc("Limit all regclasses to N registers")); -RegisterClassInfo::RegisterClassInfo() - : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {} +RegisterClassInfo::RegisterClassInfo() = default; void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { bool Update = false; @@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { // Does this MF have different CSRs? assert(TRI && "no register info set"); - const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); - if (Update || CSR != CalleeSaved) { - // Build a CSRNum map. Every CSR alias gets an entry pointing to the last + + // Get the callee saved registers. + const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs(); + if (Update || CSR != CalleeSavedRegs) { + // Build a CSRAlias map. Every CSR alias saves the last // overlapping CSR. - CSRNum.clear(); - CSRNum.resize(TRI->getNumRegs(), 0); - for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + CalleeSavedAliases.resize(TRI->getNumRegs(), 0); + for (const MCPhysReg *I = CSR; *I; ++I) + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) + CalleeSavedAliases[*AI] = *I; + Update = true; } - CalleeSaved = CSR; + CalleeSavedRegs = CSR; // Different reserved registers? const BitVector &RR = MF->getRegInfo().getReservedRegs(); @@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CSRNum[PhysReg]) + if (CalleeSavedAliases[PhysReg]) // PhysReg aliases a CSR, save it for later. 
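A side note on the CalleeSavedAliases table introduced above: where the old CSRNum array stored the 1-based index of the last overlapping callee-saved register, the new table stores the CSR register itself, so compute() can test for aliasing with a single lookup (0 meaning "no overlap"). The following is a minimal, self-contained sketch of that construction; the register numbers, the aliasesOf helper, and main() are invented for illustration and are not part of the patch, which walks the target's real CSR list with MCRegAliasIterator.

#include <vector>

int main() {
  const unsigned NumRegs = 8;
  const unsigned CSRs[] = {5, 6, 0};   // null-terminated, like an MCPhysReg CSR list
  // Pretend register 4 overlaps register 5 (think sub-register); invented for the example.
  auto aliasesOf = [](unsigned Reg) {
    return Reg == 5 ? std::vector<unsigned>{5, 4} : std::vector<unsigned>{Reg};
  };

  std::vector<unsigned> CalleeSavedAliases(NumRegs, 0); // 0 means "aliases no CSR"
  for (const unsigned *I = CSRs; *I; ++I)
    for (unsigned A : aliasesOf(*I))
      CalleeSavedAliases[A] = *I;      // every alias records the CSR it overlaps

  // The analogue of "if (CalleeSavedAliases[PhysReg])" in compute():
  bool Reg4IsCSRAlias = CalleeSavedAliases[4] != 0; // true, it overlaps CSR 5
  bool Reg3IsCSRAlias = CalleeSavedAliases[3] != 0; // false
  return (Reg4IsCSRAlias && !Reg3IsCSRAlias) ? 0 : 1;
}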
CSRAlias.push_back(PhysReg); else { @@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { } } RCI.NumRegs = N + CSRAlias.size(); - assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); + assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); // CSR aliases go after the volatile registers, preserve the target's order. for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { @@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { const TargetRegisterClass *RC = nullptr; unsigned NumRCUnits = 0; - for (TargetRegisterInfo::regclass_iterator - RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) { - const int *PSetID = TRI->getRegClassPressureSets(*RI); + for (const TargetRegisterClass *C : TRI->regclasses()) { + const int *PSetID = TRI->getRegClassPressureSets(C); for (; *PSetID != -1; ++PSetID) { if ((unsigned)*PSetID == Idx) break; @@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { // Found a register class that counts against this pressure set. // For efficiency, only compute the set order for the largest set. - unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit; + unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit; if (!RC || NUnits > NumRCUnits) { - RC = *RI; + RC = C; NumRCUnits = NUnits; } } diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 4bb3c229afc58..bf44ee8453b61 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -189,6 +190,9 @@ namespace { /// This returns true if an interval was modified. bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + /// We found a copy which can be moved to its less frequent predecessor. + bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI); + /// If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, @@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, VNInfo *ASubValNo = SA.getVNInfoAt(AIdx); assert(ASubValNo != nullptr); - LaneBitmask AMask = SA.LaneMask; - for (LiveInterval::SubRange &SB : IntB.subranges()) { - LaneBitmask BMask = SB.LaneMask; - LaneBitmask Common = BMask & AMask; - if (Common.none()) - continue; - - DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask) - << " into " << PrintLaneMask(Common) << '\n'); - LaneBitmask BRest = BMask & ~AMask; - LiveInterval::SubRange *CommonRange; - if (BRest.any()) { - SB.LaneMask = BRest; - DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) - << '\n'); - // Duplicate SubRange for newly merged common stuff. - CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); - } else { - // We van reuse the L SubRange. 
- SB.LaneMask = Common; - CommonRange = &SB; - } - LiveRange RangeCopy(SB, Allocator); - - VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx); - assert(BSubValNo->def == CopyIdx); - BSubValNo->def = ASubValNo->def; - addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo); - AMask &= ~BMask; - } - if (AMask.any()) { - DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n'); - LiveRange *NewRange = IntB.createSubRange(Allocator, AMask); - VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator); - addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo); - } + IntB.refineSubRanges(Allocator, SA.LaneMask, + [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) { + VNInfo *BSubValNo = SR.empty() + ? SR.getNextValue(CopyIdx, Allocator) + : SR.getVNInfoAt(CopyIdx); + assert(BSubValNo != nullptr); + addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo); + }); } } @@ -861,6 +837,184 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, return true; } +/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a +/// predecessor of BB2, and if B is not redefined on the way from A = B +/// in BB2 to B = A in BB2, B = A in BB2 is partially redundant if the +/// execution goes through the path from BB0 to BB2. We may move B = A +/// to the predecessor without such reversed copy. +/// So we will transform the program from: +/// BB0: +/// A = B; BB1: +/// ... ... +/// / \ / +/// BB2: +/// ... +/// B = A; +/// +/// to: +/// +/// BB0: BB1: +/// A = B; ... +/// ... B = A; +/// / \ / +/// BB2: +/// ... +/// +/// A special case is when BB0 and BB2 are the same BB which is the only +/// BB in a loop: +/// BB1: +/// ... +/// BB0/BB2: ---- +/// B = A; | +/// ... | +/// A = B; | +/// |------- +/// | +/// We may hoist B = A from BB0/BB2 to BB1. +/// +/// The major preconditions for correctness to remove such partial +/// redundancy include: +/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of +/// the PHI is defined by the reversed copy A = B in BB0. +/// 2. No B is referenced from the start of BB2 to B = A. +/// 3. No B is defined from A = B to the end of BB0. +/// 4. BB1 has only one successor. +/// +/// 2 and 4 implicitly ensure B is not live at the end of BB1. +/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a +/// colder place, which not only prevent endless loop, but also make sure +/// the movement of copy is beneficial. +bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, + MachineInstr &CopyMI) { + assert(!CP.isPhys()); + if (!CopyMI.isFullCopy()) + return false; + + MachineBasicBlock &MBB = *CopyMI.getParent(); + if (MBB.isEHPad()) + return false; + + if (MBB.pred_size() != 2) + return false; + + LiveInterval &IntA = + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + + // A is defined by PHI at the entry of MBB. + SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); + VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx); + assert(AValNo && !AValNo->isUnused() && "COPY source not live"); + if (!AValNo->isPHIDef()) + return false; + + // No B is referenced before CopyMI in MBB. + if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx)) + return false; + + // MBB has two predecessors: one contains A = B so no copy will be inserted + // for it. The other one will have a copy moved from MBB. 
+ bool FoundReverseCopy = false; + MachineBasicBlock *CopyLeftBB = nullptr; + for (MachineBasicBlock *Pred : MBB.predecessors()) { + VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred)); + MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def); + if (!DefMI || !DefMI->isFullCopy()) { + CopyLeftBB = Pred; + continue; + } + // Check DefMI is a reverse copy and it is in BB Pred. + if (DefMI->getOperand(0).getReg() != IntA.reg || + DefMI->getOperand(1).getReg() != IntB.reg || + DefMI->getParent() != Pred) { + CopyLeftBB = Pred; + continue; + } + // If there is any other def of B after DefMI and before the end of Pred, + // we need to keep the copy of B = A at the end of Pred if we remove + // B = A from MBB. + bool ValB_Changed = false; + for (auto VNI : IntB.valnos) { + if (VNI->isUnused()) + continue; + if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) { + ValB_Changed = true; + break; + } + } + if (ValB_Changed) { + CopyLeftBB = Pred; + continue; + } + FoundReverseCopy = true; + } + + // If no reverse copy is found in predecessors, nothing to do. + if (!FoundReverseCopy) + return false; + + // If CopyLeftBB is nullptr, it means every predecessor of MBB contains + // reverse copy, CopyMI can be removed trivially if only IntA/IntB is updated. + // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and + // update IntA/IntB. + // + // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so + // MBB is hotter than CopyLeftBB. + if (CopyLeftBB && CopyLeftBB->succ_size() > 1) + return false; + + // Now ok to move copy. + if (CopyLeftBB) { + DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#" + << CopyLeftBB->getNumber() << '\t' << CopyMI); + + // Insert new copy to CopyLeftBB. + auto InsPos = CopyLeftBB->getFirstTerminator(); + MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(), + TII->get(TargetOpcode::COPY), IntB.reg) + .addReg(IntA.reg); + SlotIndex NewCopyIdx = + LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot(); + IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + for (LiveInterval::SubRange &SR : IntB.subranges()) + SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + } else { + DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#" + << MBB.getNumber() << '\t' << CopyMI); + } + + // Remove CopyMI. + // Note: This is fine to remove the copy before updating the live-ranges. + // While updating the live-ranges, we only look at slot indices and + // never go back to the instruction. + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI.eraseFromParent(); + + // Update the liveness. + SmallVector<SlotIndex, 8> EndPoints; + VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead(); + LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(), + &EndPoints); + BValNo->markUnused(); + // Extend IntB to the EndPoints of its original live interval. + LIS->extendToIndices(IntB, EndPoints); + + // Now, do the same for its subranges. + for (LiveInterval::SubRange &SR : IntB.subranges()) { + EndPoints.clear(); + VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead(); + assert(BValNo && "All sublanes should be live"); + LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints); + BValNo->markUnused(); + LIS->extendToIndices(SR, EndPoints); + } + + // Finally, update the live-range of IntA. + shrinkToUses(&IntA); + return true; +} + /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just /// defining a subregister. 
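For intuition about the removePartialRedundancy() transformation added above, here is a rough source-level analogue. The variable names and functions are invented for illustration; the pass itself operates on machine COPY instructions and live ranges, not C statements.

// Before: the assignment after the join is only needed when control came
// through the branch that did not already establish b == a.
int beforeTransform(bool cond, int a, int b) {
  if (cond)
    a = b;     // the "reversed copy" A = B in the hotter predecessor
  b = a;       // partially redundant: already holds when cond is true
  return a + b;
}

// After: the copy is pushed into the colder predecessor and the one in the
// join block disappears. Both functions compute the same result.
int afterTransform(bool cond, int a, int b) {
  if (cond)
    a = b;
  else
    b = a;     // copy hoisted to the predecessor that lacked the reversed copy
  return a + b;
}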
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) { @@ -1290,7 +1444,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, // If SrcReg wasn't read, it may still be the case that DstReg is live-in // because SrcReg is a sub-register. - if (DstInt && !Reads && SubIdx) + if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); // Replace SrcReg with DstReg in all UseMI operands. @@ -1486,6 +1640,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { } } + // Try and see if we can partially eliminate the copy by moving the copy to + // its predecessor. + if (!CP.isPartial() && !CP.isPhys()) + if (removePartialRedundancy(CP, *CopyMI)) + return true; + // Otherwise, we are unable to join the intervals. DEBUG(dbgs() << "\tInterference!\n"); Again = true; // May be possible to coalesce later. @@ -1583,6 +1743,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { return false; } } + + // We must also check for overlaps with regmask clobbers. + BitVector RegMaskUsable; + if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) && + !RegMaskUsable.test(DstReg)) { + DEBUG(dbgs() << "\t\tRegMask interference\n"); + return false; + } } // Skip any value computations, we are not adding new values to the @@ -1636,14 +1804,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { DEBUG(dbgs() << "\t\tInterference (read): " << *MI); return false; } - - // We must also check for clobbers caused by regmasks. - for (const auto &MO : MI->operands()) { - if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) { - DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI); - return false; - } - } } } @@ -2738,39 +2898,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, LaneBitmask LaneMask, CoalescerPair &CP) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - for (LiveInterval::SubRange &R : LI.subranges()) { - LaneBitmask RMask = R.LaneMask; - // LaneMask of subregisters common to subrange R and ToMerge. - LaneBitmask Common = RMask & LaneMask; - // There is nothing to do without common subregs. - if (Common.none()) - continue; - - DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into " - << PrintLaneMask(Common) << '\n'); - // LaneMask of subregisters contained in the R range but not in ToMerge, - // they have to split into their own subrange. - LaneBitmask LRest = RMask & ~LaneMask; - LiveInterval::SubRange *CommonRange; - if (LRest.any()) { - R.LaneMask = LRest; - DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n'); - // Duplicate SubRange for newly merged common stuff. - CommonRange = LI.createSubRangeFrom(Allocator, Common, R); + LI.refineSubRanges(Allocator, LaneMask, + [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) { + if (SR.empty()) { + SR.assign(ToMerge, Allocator); } else { - // Reuse the existing range. - R.LaneMask = Common; - CommonRange = &R; + // joinSubRegRange() destroys the merged range, so we need a copy. 
+ LiveRange RangeCopy(ToMerge, Allocator); + joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP); } - LiveRange RangeCopy(ToMerge, Allocator); - joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); - LaneMask &= ~RMask; - } - - if (LaneMask.any()) { - DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n'); - LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); - } + }); } bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index fc84aebb14d7e..c726edc88b41c 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -1,4 +1,4 @@ -//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===// +//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +12,37 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> using namespace llvm; @@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure, } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure, const TargetRegisterInfo *TRI) { @@ -97,6 +122,7 @@ void RegPressureTracker::dump() const { P.dump(TRI); } +LLVM_DUMP_METHOD void PressureDiff::dump(const TargetRegisterInfo &TRI) const { const char *sep = ""; for (const PressureChange &Change : *this) { @@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { } dbgs() << '\n'; } +#endif void RegPressureTracker::increaseRegPressure(unsigned RegUnit, LaneBitmask PreviousMask, @@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const { MachineBasicBlock::const_iterator()); } - SlotIndex RegPressureTracker::getCurrSlot() const { MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward(CurrPos, MBB->end()); @@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits, unsigned RegUnit) { - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) @@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, RegisterMaskPair Pair) { 
unsigned RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { @@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits, unsigned RegUnit) { - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I == RegUnits.end()) { @@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits, RegisterMaskPair Pair) { unsigned RegUnit = Pair.RegUnit; assert(Pair.LaneMask.any()); - auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { return Other.RegUnit == RegUnit; }); if (I != RegUnits.end()) { @@ -423,6 +449,8 @@ namespace { /// /// FIXME: always ignore tied opers class RegisterOperandsCollector { + friend class llvm::RegisterOperands; + RegisterOperands &RegOpers; const TargetRegisterInfo &TRI; const MachineRegisterInfo &MRI; @@ -517,11 +545,9 @@ class RegisterOperandsCollector { addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll())); } } - - friend class llvm::RegisterOperands; }; -} // namespace +} // end anonymous namespace void RegisterOperands::collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, @@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair, assert(Pair.LaneMask.any()); unsigned RegUnit = Pair.RegUnit; - auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { + auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) { return Other.RegUnit == RegUnit; }); LaneBitmask PrevMask; @@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers, if (!TrackLaneMasks) { addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask)); } else { - auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { - return Other.RegUnit == Reg; - }); + auto I = + llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) { + return Other.RegUnit == Reg; + }); bool IsRedef = I != LiveUses->end(); if (IsRedef) { // ignore re-defs here... 
@@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) { int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc(); - if (CritInc > 0 && CritInc <= INT16_MAX) { + if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) { Delta.CriticalMax = PressureChange(PSetID); Delta.CriticalMax.setUnitInc(CritInc); } diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index fdf741fd58f72..6392136fa2909 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -1,4 +1,4 @@ -//===-- RegisterScavenging.cpp - Machine register scavenging --------------===// +//===- RegisterScavenging.cpp - Machine register scavenging ---------------===// // // The LLVM Compiler Infrastructure // @@ -15,28 +15,32 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <iterator> +#include <limits> +#include <string> + using namespace llvm; #define DEBUG_TYPE "reg-scavenging" void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { - for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { - LaneBitmask UnitMask = (*RUI).second; - if (UnitMask.none() || (LaneMask & UnitMask).any()) - RegUnitsAvailable.reset((*RUI).first); - } + LiveUnits.addRegMasked(Reg, LaneMask); } void RegScavenger::init(MachineBasicBlock &MBB) { @@ -44,6 +48,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); + LiveUnits.init(*TRI); assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); @@ -51,7 +56,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) { // Self-initialize. if (!this->MBB) { NumRegUnits = TRI->getNumRegUnits(); - RegUnitsAvailable.resize(NumRegUnits); KillRegUnits.resize(NumRegUnits); DefRegUnits.resize(NumRegUnits); TmpRegUnits.resize(NumRegUnits); @@ -64,32 +68,17 @@ void RegScavenger::init(MachineBasicBlock &MBB) { I->Restore = nullptr; } - // All register units start out unused. - RegUnitsAvailable.set(); - - // Pristine CSRs are not available. - BitVector PR = MF.getFrameInfo().getPristineRegs(MF); - for (int I = PR.find_first(); I>0; I = PR.find_next(I)) - setRegUsed(I); - Tracking = false; } -void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) { - for (const auto &LI : MBB.liveins()) - setRegUsed(LI.PhysReg, LI.LaneMask); -} - void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) { init(MBB); - setLiveInsUsed(MBB); + LiveUnits.addLiveIns(MBB); } void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) { init(MBB); - // Merge live-ins of successors to get live-outs. 
- for (const MachineBasicBlock *Succ : MBB.successors()) - setLiveInsUsed(*Succ); + LiveUnits.addLiveOuts(MBB); // Move internal iterator at the last instruction of the block. if (MBB.begin() != MBB.end()) { @@ -263,36 +252,7 @@ void RegScavenger::backward() { assert(Tracking && "Must be tracking to determine kills and defs"); const MachineInstr &MI = *MBBI; - // Defined or clobbered registers are available now. - for (const MachineOperand &MO : MI.operands()) { - if (MO.isRegMask()) { - for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; - ++RU) { - for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) { - if (MO.clobbersPhysReg(*RURI)) { - RegUnitsAvailable.set(RU); - break; - } - } - } - } else if (MO.isReg() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || - isReserved(Reg)) - continue; - addRegUnits(RegUnitsAvailable, Reg); - } - } - // Mark read registers as unavailable. - for (const MachineOperand &MO : MI.uses()) { - if (MO.isReg() && MO.readsReg()) { - unsigned Reg = MO.getReg(); - if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || - isReserved(Reg)) - continue; - removeRegUnits(RegUnitsAvailable, Reg); - } - } + LiveUnits.stepBackward(MI); if (MBBI == MBB->begin()) { MBBI = MachineBasicBlock::iterator(nullptr); @@ -302,12 +262,9 @@ void RegScavenger::backward() { } bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const { - if (includeReserved && isReserved(Reg)) - return true; - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - if (!RegUnitsAvailable.test(*RUI)) - return true; - return false; + if (isReserved(Reg)) + return includeReserved; + return !LiveUnits.available(Reg); } unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { @@ -441,7 +398,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, unsigned NeedSize = RC->getSize(); unsigned NeedAlign = RC->getAlignment(); - unsigned SI = Scavenged.size(), Diff = UINT_MAX; + unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max(); int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd(); for (unsigned I = 0; I < Scavenged.size(); ++I) { if (Scavenged[I].Reg != 0) diff --git a/lib/CodeGen/ResetMachineFunctionPass.cpp b/lib/CodeGen/ResetMachineFunctionPass.cpp index 451964199ba5a..3e259927ac5cb 100644 --- a/lib/CodeGen/ResetMachineFunctionPass.cpp +++ b/lib/CodeGen/ResetMachineFunctionPass.cpp @@ -30,17 +30,23 @@ namespace { /// Tells whether or not this pass should emit a fallback /// diagnostic when it resets a function. bool EmitFallbackDiag; + /// Whether we should abort immediately instead of resetting the function. 
+ bool AbortOnFailedISel; public: static char ID; // Pass identification, replacement for typeid - ResetMachineFunction(bool EmitFallbackDiag = false) - : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {} + ResetMachineFunction(bool EmitFallbackDiag = false, + bool AbortOnFailedISel = false) + : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag), + AbortOnFailedISel(AbortOnFailedISel) {} StringRef getPassName() const override { return "ResetMachineFunction"; } bool runOnMachineFunction(MachineFunction &MF) override { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) { + if (AbortOnFailedISel) + report_fatal_error("Instruction selection failed"); DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n'); ++NumFunctionsReset; MF.reset(); @@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE, "reset machine function if ISel failed", false, false) MachineFunctionPass * -llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) { - return new ResetMachineFunction(EmitFallbackDiag); +llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false, + bool AbortOnFailedISel = false) { + return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel); } diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 2b82df293c148..fa68411284e77 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -451,7 +451,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI, IRBuilder<> IRBFail(CheckTerm); // FIXME: respect -fsanitize-trap / -ftrap-function here? Constant *StackChkFail = F.getParent()->getOrInsertFunction( - "__stack_chk_fail", IRB.getVoidTy(), nullptr); + "__stack_chk_fail", IRB.getVoidTy()); IRBFail.CreateCall(StackChkFail, {}); } diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp index 7fbeaddb38e8e..09289f947dc96 100644 --- a/lib/CodeGen/SafeStackColoring.cpp +++ b/lib/CodeGen/SafeStackColoring.cpp @@ -236,6 +236,7 @@ void StackColoring::calculateLiveIntervals() { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void StackColoring::dumpAllocas() { dbgs() << "Allocas:\n"; for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) @@ -262,6 +263,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() { dbgs() << " " << AllocaNo << ": " << Range << "\n"; } } +#endif void StackColoring::run() { DEBUG(dumpAllocas()); diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index 427d95268c745..dc72ac0732588 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -1,4 +1,4 @@ -//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===// +//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,22 +7,32 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAG class, which is a base class used by -// scheduling implementation classes. +/// \file Implements the ScheduleDAG class, which is a base class used by +/// scheduling implementation classes. 
// //===----------------------------------------------------------------------===// +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "pre-RA-sched" @@ -33,58 +43,52 @@ static cl::opt<bool> StressSchedOpt( cl::desc("Stress test instruction scheduling")); #endif -void SchedulingPriorityQueue::anchor() { } +void SchedulingPriorityQueue::anchor() {} ScheduleDAG::ScheduleDAG(MachineFunction &mf) : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()), TRI(mf.getSubtarget().getRegisterInfo()), MF(mf), - MRI(mf.getRegInfo()), EntrySU(), ExitSU() { + MRI(mf.getRegInfo()) { #ifndef NDEBUG StressSched = StressSchedOpt; #endif } -ScheduleDAG::~ScheduleDAG() {} +ScheduleDAG::~ScheduleDAG() = default; -/// Clear the DAG state (e.g. between scheduling regions). void ScheduleDAG::clearDAG() { SUnits.clear(); EntrySU = SUnit(); ExitSU = SUnit(); } -/// getInstrDesc helper to handle SDNodes. const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { if (!Node || !Node->isMachineOpcode()) return nullptr; return &TII->get(Node->getMachineOpcode()); } -/// addPred - This adds the specified edge as a pred of the current node if -/// not already. It also adds the current node as a successor of the -/// specified node. bool SUnit::addPred(const SDep &D, bool Required) { // If this node already has this dependence, don't add a redundant one. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { + for (SDep &PredDep : Preds) { // Zero-latency weak edges may be added purely for heuristic ordering. Don't // add them if another kind of edge already exists. - if (!Required && I->getSUnit() == D.getSUnit()) + if (!Required && PredDep.getSUnit() == D.getSUnit()) return false; - if (I->overlaps(D)) { - // Extend the latency if needed. Equivalent to removePred(I) + addPred(D). - if (I->getLatency() < D.getLatency()) { - SUnit *PredSU = I->getSUnit(); + if (PredDep.overlaps(D)) { + // Extend the latency if needed. Equivalent to + // removePred(PredDep) + addPred(D). + if (PredDep.getLatency() < D.getLatency()) { + SUnit *PredSU = PredDep.getSUnit(); // Find the corresponding successor in N. - SDep ForwardD = *I; + SDep ForwardD = PredDep; ForwardD.setSUnit(this); - for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(), - EE = PredSU->Succs.end(); II != EE; ++II) { - if (*II == ForwardD) { - II->setLatency(D.getLatency()); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(D.getLatency()); break; } } - I->setLatency(D.getLatency()); + PredDep.setLatency(D.getLatency()); } return false; } @@ -95,8 +99,10 @@ bool SUnit::addPred(const SDep &D, bool Required) { SUnit *N = D.getSUnit(); // Update the bookkeeping. 
if (D.getKind() == SDep::Data) { - assert(NumPreds < UINT_MAX && "NumPreds will overflow!"); - assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!"); + assert(NumPreds < std::numeric_limits<unsigned>::max() && + "NumPreds will overflow!"); + assert(N->NumSuccs < std::numeric_limits<unsigned>::max() && + "NumSuccs will overflow!"); ++NumPreds; ++N->NumSuccs; } @@ -105,7 +111,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++WeakPredsLeft; } else { - assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!"); + assert(NumPredsLeft < std::numeric_limits<unsigned>::max() && + "NumPredsLeft will overflow!"); ++NumPredsLeft; } } @@ -114,7 +121,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++N->WeakSuccsLeft; } else { - assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!"); + assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() && + "NumSuccsLeft will overflow!"); ++N->NumSuccsLeft; } } @@ -127,51 +135,46 @@ bool SUnit::addPred(const SDep &D, bool Required) { return true; } -/// removePred - This removes the specified edge as a pred of the current -/// node if it exists. It also removes the current node as a successor of -/// the specified node. void SUnit::removePred(const SDep &D) { // Find the matching predecessor. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) - if (*I == D) { - // Find the corresponding successor in N. - SDep P = D; - P.setSUnit(this); - SUnit *N = D.getSUnit(); - SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P); - assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); - N->Succs.erase(Succ); - Preds.erase(I); - // Update the bookkeeping. - if (P.getKind() == SDep::Data) { - assert(NumPreds > 0 && "NumPreds will underflow!"); - assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); - --NumPreds; - --N->NumSuccs; - } - if (!N->isScheduled) { - if (D.isWeak()) - --WeakPredsLeft; - else { - assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); - --NumPredsLeft; - } - } - if (!isScheduled) { - if (D.isWeak()) - --N->WeakSuccsLeft; - else { - assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); - --N->NumSuccsLeft; - } - } - if (P.getLatency() != 0) { - this->setDepthDirty(); - N->setHeightDirty(); - } - return; + SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D); + if (I == Preds.end()) + return; + // Find the corresponding successor in N. + SDep P = D; + P.setSUnit(this); + SUnit *N = D.getSUnit(); + SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P); + assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); + N->Succs.erase(Succ); + Preds.erase(I); + // Update the bookkeeping. 
+ if (P.getKind() == SDep::Data) { + assert(NumPreds > 0 && "NumPreds will underflow!"); + assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); + --NumPreds; + --N->NumSuccs; + } + if (!N->isScheduled) { + if (D.isWeak()) + --WeakPredsLeft; + else { + assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); + --NumPredsLeft; } + } + if (!isScheduled) { + if (D.isWeak()) + --N->WeakSuccsLeft; + else { + assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); + --N->NumSuccsLeft; + } + } + if (P.getLatency() != 0) { + this->setDepthDirty(); + N->setHeightDirty(); + } } void SUnit::setDepthDirty() { @@ -181,9 +184,8 @@ void SUnit::setDepthDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isDepthCurrent = false; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), - E = SU->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (SDep &SuccDep : SU->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isDepthCurrent) WorkList.push_back(SuccSU); } @@ -197,18 +199,14 @@ void SUnit::setHeightDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isHeightCurrent = false; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), - E = SU->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (SDep &PredDep : SU->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isHeightCurrent) WorkList.push_back(PredSU); } } while (!WorkList.empty()); } -/// setDepthToAtLeast - Update this node's successors to reflect the -/// fact that this node's depth just increased. -/// void SUnit::setDepthToAtLeast(unsigned NewDepth) { if (NewDepth <= getDepth()) return; @@ -217,9 +215,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) { isDepthCurrent = true; } -/// setHeightToAtLeast - Update this node's predecessors to reflect the -/// fact that this node's height just increased. -/// void SUnit::setHeightToAtLeast(unsigned NewHeight) { if (NewHeight <= getHeight()) return; @@ -228,8 +223,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) { isHeightCurrent = true; } -/// ComputeDepth - Calculate the maximal path from the node to the exit. -/// +/// Calculates the maximal path from the node to the exit. void SUnit::ComputeDepth() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -238,12 +232,11 @@ void SUnit::ComputeDepth() { bool Done = true; unsigned MaxPredDepth = 0; - for (SUnit::const_pred_iterator I = Cur->Preds.begin(), - E = Cur->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (const SDep &PredDep : Cur->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isDepthCurrent) MaxPredDepth = std::max(MaxPredDepth, - PredSU->Depth + I->getLatency()); + PredSU->Depth + PredDep.getLatency()); else { Done = false; WorkList.push_back(PredSU); @@ -261,8 +254,7 @@ void SUnit::ComputeDepth() { } while (!WorkList.empty()); } -/// ComputeHeight - Calculate the maximal path from the node to the entry. -/// +/// Calculates the maximal path from the node to the entry. 
void SUnit::ComputeHeight() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -271,12 +263,11 @@ void SUnit::ComputeHeight() { bool Done = true; unsigned MaxSuccHeight = 0; - for (SUnit::const_succ_iterator I = Cur->Succs.begin(), - E = Cur->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (const SDep &SuccDep : Cur->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isHeightCurrent) MaxSuccHeight = std::max(MaxSuccHeight, - SuccSU->Height + I->getLatency()); + SuccSU->Height + SuccDep.getLatency()); else { Done = false; WorkList.push_back(SuccSU); @@ -310,6 +301,7 @@ void SUnit::biasCriticalPath() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { if (this == &DAG->ExitSU) OS << "ExitSU"; @@ -319,15 +311,13 @@ void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { OS << "SU(" << NodeNum << ")"; } -/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or -/// a group of nodes flagged together. -void SUnit::dump(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const { print(dbgs(), G); dbgs() << ": "; G->dumpNode(this); } -void SUnit::dumpAll(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const { dump(G); dbgs() << " # preds left : " << NumPredsLeft << "\n"; @@ -343,41 +333,39 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { if (Preds.size() != 0) { dbgs() << " Predecessors:\n"; - for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { + for (const SDep &SuccDep : Preds) { dbgs() << " "; - switch (I->getKind()) { + switch (SuccDep.getKind()) { case SDep::Data: dbgs() << "data "; break; case SDep::Anti: dbgs() << "anti "; break; case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) + SuccDep.getSUnit()->print(dbgs(), G); + if (SuccDep.isArtificial()) dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); + dbgs() << ": Latency=" << SuccDep.getLatency(); + if (SuccDep.isAssignedRegDep()) + dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI); dbgs() << "\n"; } } if (Succs.size() != 0) { dbgs() << " Successors:\n"; - for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end(); - I != E; ++I) { + for (const SDep &SuccDep : Succs) { dbgs() << " "; - switch (I->getKind()) { + switch (SuccDep.getKind()) { case SDep::Data: dbgs() << "data "; break; case SDep::Anti: dbgs() << "anti "; break; case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) + SuccDep.getSUnit()->print(dbgs(), G); + if (SuccDep.isArtificial()) dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); + dbgs() << ": Latency=" << SuccDep.getLatency(); + if (SuccDep.isAssignedRegDep()) + dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI); dbgs() << "\n"; } } @@ -385,47 +373,44 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { #endif #ifndef NDEBUG -/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that -/// their state is consistent. Return the number of scheduled nodes. 
-/// unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { bool AnyNotSched = false; unsigned DeadNodes = 0; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - if (!SUnits[i].isScheduled) { - if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) { + for (const SUnit &SUnit : SUnits) { + if (!SUnit.isScheduled) { + if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) { ++DeadNodes; continue; } if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has not been scheduled!\n"; AnyNotSched = true; } - if (SUnits[i].isScheduled && - (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) > - unsigned(INT_MAX)) { + if (SUnit.isScheduled && + (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) > + unsigned(std::numeric_limits<int>::max())) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has an unexpected " << (isBottomUp ? "Height" : "Depth") << " value!\n"; AnyNotSched = true; } if (isBottomUp) { - if (SUnits[i].NumSuccsLeft != 0) { + if (SUnit.NumSuccsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has successors left!\n"; AnyNotSched = true; } } else { - if (SUnits[i].NumPredsLeft != 0) { + if (SUnit.NumPredsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has predecessors left!\n"; AnyNotSched = true; } @@ -436,36 +421,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { } #endif -/// InitDAGTopologicalSorting - create the initial topological -/// ordering from the DAG to be scheduled. -/// -/// The idea of the algorithm is taken from -/// "Online algorithms for managing the topological order of -/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in -/// "Maintaining a topological order under edge insertions". -/// -/// Short description of the algorithm: -/// -/// Topological ordering, ord, of a DAG maps each node to a topological -/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). -/// -/// This means that if there is a path from the node X to the node Z, -/// then ord(X) < ord(Z). -/// -/// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would -/// create a cycle. -/// -/// The algorithm first computes a topological ordering for the DAG by -/// initializing the Index2Node and Node2Index arrays and then tries to keep -/// the ordering up-to-date after edge insertions by reordering the DAG. -/// -/// On insertion of the edge X->Y, the algorithm first marks by calling DFS -/// the nodes reachable from Y, and then shifts them using Shift to lie -/// immediately after X in Index2Node. void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { + // The idea of the algorithm is taken from + // "Online algorithms for managing the topological order of + // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly + // This is the MNR algorithm, which was first introduced by + // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in + // "Maintaining a topological order under edge insertions". 
+ // + // Short description of the algorithm: + // + // Topological ordering, ord, of a DAG maps each node to a topological + // index so that for all edges X->Y it is the case that ord(X) < ord(Y). + // + // This means that if there is a path from the node X to the node Z, + // then ord(X) < ord(Z). + // + // This property can be used to check for reachability of nodes: + // if Z is reachable from X, then an insertion of the edge Z->X would + // create a cycle. + // + // The algorithm first computes a topological ordering for the DAG by + // initializing the Index2Node and Node2Index arrays and then tries to keep + // the ordering up-to-date after edge insertions by reordering the DAG. + // + // On insertion of the edge X->Y, the algorithm first marks by calling DFS + // the nodes reachable from Y, and then shifts them using Shift to lie + // immediately after X in Index2Node. unsigned DAGSize = SUnits.size(); std::vector<SUnit*> WorkList; WorkList.reserve(DAGSize); @@ -476,18 +458,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { // Initialize the data structures. if (ExitSU) WorkList.push_back(ExitSU); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); + for (SUnit &SU : SUnits) { + int NodeNum = SU.NodeNum; + unsigned Degree = SU.Succs.size(); // Temporarily use the Node2Index array as scratch space for degree counts. Node2Index[NodeNum] = Degree; // Is it a node without dependencies? if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); + assert(SU.Succs.empty() && "SUnit should have no successors"); // Collect leaf nodes. - WorkList.push_back(SU); + WorkList.push_back(&SU); } } @@ -497,9 +478,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { WorkList.pop_back(); if (SU->NodeNum < DAGSize) Allocate(SU->NodeNum, --Id); - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit *SU = I->getSUnit(); + for (const SDep &PredDep : SU->Preds) { + SUnit *SU = PredDep.getSUnit(); if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum]) // If all dependencies of the node are processed already, // then the node can be computed now. @@ -511,19 +491,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { #ifndef NDEBUG // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && + for (SUnit &SU : SUnits) { + for (const SDep &PD : SU.Preds) { + assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] && "Wrong topological sorting"); } } #endif } -/// AddPred - Updates the topological ordering to accommodate an edge -/// to be added from SUnit X to SUnit Y. void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { int UpperBound, LowerBound; LowerBound = Node2Index[Y->NodeNum]; @@ -540,16 +516,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { } } -/// RemovePred - Updates the topological ordering to accommodate an -/// an edge to be removed from the specified node N from the predecessors -/// of the current node M. void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { // InitDAGTopologicalSorting(); } -/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark -/// all nodes affected by the edge insertion. 
These nodes will later get new -/// topological indexes by means of the Shift method. void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool &HasLoop) { std::vector<const SUnit*> WorkList; @@ -560,8 +530,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, SU = WorkList.back(); WorkList.pop_back(); Visited.set(SU->NodeNum); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - unsigned s = SU->Succs[I].getSUnit()->NodeNum; + for (const SDep &SuccDep + : make_range(SU->Succs.rbegin(), SU->Succs.rend())) { + unsigned s = SuccDep.getSUnit()->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). if (s >= Node2Index.size()) continue; @@ -571,14 +542,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, } // Visit successors if not already and in affected region. if (!Visited.test(s) && Node2Index[s] < UpperBound) { - WorkList.push_back(SU->Succs[I].getSUnit()); + WorkList.push_back(SuccDep.getSUnit()); } } } while (!WorkList.empty()); } -/// Shift - Renumber the nodes so that the topological ordering is -/// preserved. +std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, + const SUnit &TargetSU, + bool &Success) { + std::vector<const SUnit*> WorkList; + int LowerBound = Node2Index[StartSU.NodeNum]; + int UpperBound = Node2Index[TargetSU.NodeNum]; + bool Found = false; + BitVector VisitedBack; + std::vector<int> Nodes; + + if (LowerBound > UpperBound) { + Success = false; + return Nodes; + } + + WorkList.reserve(SUnits.size()); + Visited.reset(); + + // Starting from StartSU, visit all successors up + // to UpperBound. + WorkList.push_back(&StartSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Succs.size()-1; I >= 0; --I) { + const SUnit *Succ = SU->Succs[I].getSUnit(); + unsigned s = Succ->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). + if (Succ->isBoundaryNode()) + continue; + if (Node2Index[s] == UpperBound) { + Found = true; + continue; + } + // Visit successors if not already and in affected region. + if (!Visited.test(s) && Node2Index[s] < UpperBound) { + Visited.set(s); + WorkList.push_back(Succ); + } + } + } while (!WorkList.empty()); + + if (!Found) { + Success = false; + return Nodes; + } + + WorkList.clear(); + VisitedBack.resize(SUnits.size()); + Found = false; + + // Starting from TargetSU, visit all predecessors up + // to LowerBound. SUs that are visited by the two + // passes are added to Nodes. + WorkList.push_back(&TargetSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Preds.size()-1; I >= 0; --I) { + const SUnit *Pred = SU->Preds[I].getSUnit(); + unsigned s = Pred->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). 
+ if (Pred->isBoundaryNode()) + continue; + if (Node2Index[s] == LowerBound) { + Found = true; + continue; + } + if (!VisitedBack.test(s) && Visited.test(s)) { + VisitedBack.set(s); + WorkList.push_back(Pred); + Nodes.push_back(s); + } + } + } while (!WorkList.empty()); + + assert(Found && "Error in SUnit Graph!"); + Success = true; + return Nodes; +} + void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, int UpperBound) { std::vector<int> L; @@ -598,28 +648,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, } } - for (unsigned j = 0; j < L.size(); ++j) { - Allocate(L[j], i - shift); + for (unsigned LI : L) { + Allocate(LI, i - shift); i = i + 1; } } - -/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will -/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU). bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) { // Is SU reachable from TargetSU via successor edges? if (IsReachable(SU, TargetSU)) return true; - for (SUnit::pred_iterator - I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I) - if (I->isAssignedRegDep() && - IsReachable(SU, I->getSUnit())) + for (const SDep &PredDep : TargetSU->Preds) + if (PredDep.isAssignedRegDep() && + IsReachable(SU, PredDep.getSUnit())) return true; return false; } -/// IsReachable - Checks if SU is reachable from TargetSU. bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, const SUnit *TargetSU) { // If insertion of the edge SU->TargetSU would create a cycle @@ -637,7 +682,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, return HasLoop; } -/// Allocate - assign the topological index to the node n. void ScheduleDAGTopologicalSort::Allocate(int n, int index) { Node2Index[n] = index; Index2Node[index] = n; @@ -647,4 +691,4 @@ ScheduleDAGTopologicalSort:: ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu) : SUnits(sunits), ExitSU(exitsu) {} -ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} +ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default; diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 611c5a71bd5a2..18823b74c47fe 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAGInstrs class, which implements re-scheduling -// of MachineInstrs. +/// \file This implements the ScheduleDAGInstrs class, which implements +/// re-scheduling of MachineInstrs. // //===----------------------------------------------------------------------===// @@ -101,8 +101,8 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, SchedModel.init(ST.getSchedModel(), &ST, TII); } -/// getUnderlyingObjectFromInt - This is the function that does the work of -/// looking through basic ptrtoint+arithmetic+inttoptr sequences. +/// This is the function that does the work of looking through basic +/// ptrtoint+arithmetic+inttoptr sequences. static const Value *getUnderlyingObjectFromInt(const Value *V) { do { if (const Operator *U = dyn_cast<Operator>(V)) { @@ -129,8 +129,8 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) { } while (1); } -/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects -/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences. 
+/// This is a wrapper around GetUnderlyingObjects and adds support for basic +/// ptrtoint+arithmetic+inttoptr sequences. static void getUnderlyingObjects(const Value *V, SmallVectorImpl<Value *> &Objects, const DataLayout &DL) { @@ -158,9 +158,8 @@ static void getUnderlyingObjects(const Value *V, } while (!Working.empty()); } -/// getUnderlyingObjectsForInstr - If this machine instr has memory reference -/// information and it can be tracked to a normal reference to a known -/// object, return the Value for that object. +/// If this machine instr has memory reference information and it can be tracked +/// to a normal reference to a known object, return the Value for that object. static void getUnderlyingObjectsForInstr(const MachineInstr *MI, const MachineFrameInfo &MFI, UnderlyingObjectsVector &Objects, @@ -216,10 +215,6 @@ void ScheduleDAGInstrs::finishBlock() { BB = nullptr; } -/// Initialize the DAG and common scheduler state for the current scheduling -/// region. This does not actually create the DAG, only clears it. The -/// scheduling driver may call BuildSchedGraph multiple times per scheduling -/// region. void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, MachineBasicBlock::iterator begin, MachineBasicBlock::iterator end, @@ -230,20 +225,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, NumRegionInstrs = regioninstrs; } -/// Close the current scheduling region. Don't clear any state in case the -/// driver wants to refer to the previous scheduling region. void ScheduleDAGInstrs::exitRegion() { // Nothing to do. } -/// addSchedBarrierDeps - Add dependencies from instructions in the current -/// list of instructions being scheduled to scheduling barrier by adding -/// the exit SU to the register defs and use list. This is because we want to -/// make sure instructions which define registers that are either used by -/// the terminator or are live-out are properly scheduled. This is -/// especially important when the definition latency of the return value(s) -/// are too high to be hidden by the branch or when the liveout registers -/// used by instructions in the fallthrough block. void ScheduleDAGInstrs::addSchedBarrierDeps() { MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr; ExitSU.setInstr(ExitMI); @@ -271,7 +256,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { } } -/// MO is an operand of SU's instruction that defines a physical register. Add +/// MO is an operand of SU's instruction that defines a physical register. Adds /// data dependencies from SU to any uses of the physical register. void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx); @@ -313,9 +298,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { } } -/// addPhysRegDeps - Add register dependencies (data, anti, and output) from -/// this SUnit to following instructions in the same scheduling region that -/// depend the physical register referenced at OperIdx. +/// \brief Adds register dependencies (data, anti, and output) from this SUnit +/// to following instructions in the same scheduling region that depend the +/// physical register referenced at OperIdx. 
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); MachineOperand &MO = MI->getOperand(OperIdx); @@ -406,9 +391,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const return TRI->getSubRegIndexLaneMask(SubReg); } -/// addVRegDefDeps - Add register output and data dependencies from this SUnit -/// to instructions that occur later in the same scheduling region if they read -/// from or write to the virtual register defined at OperIdx. +/// Adds register output and data dependencies from this SUnit to instructions +/// that occur later in the same scheduling region if they read from or write to +/// the virtual register defined at OperIdx. /// /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. @@ -515,10 +500,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } -/// addVRegUseDeps - Add a register data dependency if the instruction that -/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a -/// register antidependency from this SUnit to instructions that occur later in -/// the same scheduling region if they write the virtual register. +/// \brief Adds a register data dependency if the instruction that defines the +/// virtual register used at OperIdx is mapped to an SUnit. Add a register +/// antidependency from this SUnit to instructions that occur later in the same +/// scheduling region if they write the virtual register. /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { @@ -545,87 +530,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } } -/// Return true if MI is an instruction we are unable to reason about +/// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { return MI->isCall() || MI->hasUnmodeledSideEffects() || (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA)); } -/// This returns true if the two MIs need a chain edge between them. -/// This is called on normal stores and loads. -static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, - const DataLayout &DL, MachineInstr *MIa, - MachineInstr *MIb) { - const MachineFunction *MF = MIa->getParent()->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - - assert ((MIa->mayStore() || MIb->mayStore()) && - "Dependency checked between two loads"); - - // Let the target decide if memory accesses cannot possibly overlap. - if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA)) - return false; - - // To this point analysis is generic. From here on we do need AA. - if (!AA) - return true; - - // FIXME: Need to handle multiple memory operands to support all targets. - if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) - return true; - - MachineMemOperand *MMOa = *MIa->memoperands_begin(); - MachineMemOperand *MMOb = *MIb->memoperands_begin(); - - if (!MMOa->getValue() || !MMOb->getValue()) - return true; - - // The following interface to AA is fashioned after DAGCombiner::isAlias - // and operates with MachineMemOperand offset with some important - // assumptions: - // - LLVM fundamentally assumes flat address spaces. 
- // - MachineOperand offset can *only* result from legalization and - // cannot affect queries other than the trivial case of overlap - // checking. - // - These offsets never wrap and never step outside - // of allocated objects. - // - There should never be any negative offsets here. - // - // FIXME: Modify API to hide this math from "user" - // FIXME: Even before we go to AA we can reason locally about some - // memory objects. It can save compile time, and possibly catch some - // corner cases not currently covered. - - assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); - assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); - - int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); - int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; - int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; - - AliasResult AAResult = - AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, - UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), - MemoryLocation(MMOb->getValue(), Overlapb, - UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); - - return (AAResult != NoAlias); -} - -/// Check whether two objects need a chain edge and add it if needed. void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb, unsigned Latency) { - if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(), - SUb->getInstr())) { + if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) { SDep Dep(SUa, SDep::MayAliasMem); Dep.setLatency(Latency); SUb->addPred(Dep); } } -/// Create an SUnit for each real instruction, numbered in top-down topological -/// order. The instruction order A < B, implies that no edge exists from B to A. +/// \brief Creates an SUnit for each real instruction, numbered in top-down +/// topological order. The instruction order A < B, implies that no edge exists +/// from B to A. /// /// Map each real instruction to its SUnit. /// @@ -682,14 +605,13 @@ void ScheduleDAGInstrs::initSUnits() { } class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> { - /// Current total number of SUs in map. unsigned NumNodes; /// 1 for loads, 0 for stores. (see comment in SUList) unsigned TrueMemOrderLatency; -public: +public: Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {} /// To keep NumNodes up to date, insert() is used instead of @@ -697,8 +619,8 @@ public: ValueType &operator[](const SUList &Key) { llvm_unreachable("Don't use. Use insert() instead."); }; - /// Add SU to the SUList of V. If Map grows huge, reduce its size - /// by calling reduce(). + /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling + /// reduce(). void inline insert(SUnit *SU, ValueType V) { MapVector::operator[](V).push_back(SU); NumNodes++; @@ -723,7 +645,7 @@ public: unsigned inline size() const { return NumNodes; } - /// Count the number of SUs in this map after a reduction. + /// Counts the number of SUs in this map after a reduction. void reComputeSize(void) { NumNodes = 0; for (auto &I : *this) @@ -797,9 +719,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { map.reComputeSize(); } -/// If RegPressure is non-null, compute register pressure as a side effect. The -/// DAG builder is an efficient place to do it because it already visits -/// operands. 
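For context on the deleted MIsNeedChainEdge helper: before handing the query to alias analysis it widened each MachineMemOperand location so that both started at the smaller of the two offsets, and once two accesses are known to share a base the question collapses to an interval-overlap test. Below is a small standalone sketch of that offset/size arithmetic with plain integers; the struct and function names are invented for illustration, and the real decision is now made by MachineInstr::mayAlias.

// Sketch of the offset/size bookkeeping the removed MIsNeedChainEdge helper
// performed before querying alias analysis. Plain integers stand in for
// MachineMemOperand offsets and sizes.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

struct MemAccess {
  int64_t Offset; // offset from the underlying value, assumed non-negative
  int64_t Size;   // access size in bytes
};

// Widths handed to the alias query: each location is widened so that it
// starts at the smaller of the two offsets and still covers its own access.
std::pair<int64_t, int64_t> overlapWidths(MemAccess A, MemAccess B) {
  int64_t MinOffset = std::min(A.Offset, B.Offset);
  int64_t OverlapA = A.Size + A.Offset - MinOffset;
  int64_t OverlapB = B.Size + B.Offset - MinOffset;
  return {OverlapA, OverlapB};
}

// With a common base the question reduces to interval overlap on
// [Offset, Offset + Size).
bool sameBaseAccessesOverlap(MemAccess A, MemAccess B) {
  return A.Offset < B.Offset + B.Size && B.Offset < A.Offset + A.Size;
}

int main() {
  MemAccess Store{0, 4}, Load{8, 4};
  std::pair<int64_t, int64_t> W = overlapWidths(Store, Load);
  assert(W.first == 4 && W.second == 12);
  assert(!sameBaseAccessesOverlap(Store, Load));    // disjoint: no chain edge needed
  assert(sameBaseAccessesOverlap({0, 16}, {8, 4})); // overlapping: keep the edge
}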
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, PressureDiffs *PDiffs, @@ -1088,10 +1007,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() { } } -/// Reduce maps in FIFO order, by N SUs. This is better than turning -/// every Nth memory SU into BarrierChain in buildSchedGraph(), since -/// it avoids unnecessary edges between seen SUs above the new -/// BarrierChain, and those below it. void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, Value2SUsMap &loads, unsigned N) { DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; @@ -1142,7 +1057,6 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, loads.dump()); } -/// \brief Initialize register live-range state for updating kills. void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { // Start with no live registers. LiveRegs.reset(); @@ -1178,32 +1092,35 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) return; } else - (--End)->clearRegisterKills(Reg, TRI); + (--End)->clearRegisterKills(Reg, TRI); } } -bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) { +void ScheduleDAGInstrs::toggleKillFlag(MachineInstr &MI, MachineOperand &MO) { + if (MO.isDebug()) + return; + // Setting kill flag... if (!MO.isKill()) { MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); - return false; + toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); + return; } // If MO itself is live, clear the kill flag... if (LiveRegs.test(MO.getReg())) { MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); - return false; + toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); + return; } // If any subreg of MO is live, then create an imp-def for that // subreg and keep MO marked as killed. MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); + toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); bool AllDead = true; const unsigned SuperReg = MO.getReg(); - MachineInstrBuilder MIB(MF, MI); + MachineInstrBuilder MIB(MF, &MI); for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { if (LiveRegs.test(*SubRegs)) { MIB.addReg(*SubRegs, RegState::ImplicitDefine); @@ -1213,13 +1130,12 @@ bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) { if(AllDead) { MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); + toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); } - return false; } -// FIXME: Reuse the LivePhysRegs utility for this. void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { + // FIXME: Reuse the LivePhysRegs utility for this. DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n'); LiveRegs.resize(TRI->getNumRegs()); @@ -1289,7 +1205,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { if (MO.isKill() != kill) { DEBUG(dbgs() << "Fixing " << MO << " in "); - toggleKillFlag(&MI, MO); + toggleKillFlag(MI, MO); DEBUG(MI.dump()); DEBUG({ if (MI.getOpcode() == TargetOpcode::BUNDLE) { @@ -1319,6 +1235,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { } void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. 
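fixupKills and toggleKillFlag above recompute kill flags by scanning a block bottom-up with a set of live registers: a use of a register that is not currently live is its last use and gets the kill flag. A simplified standalone sketch of that backward pass follows; it ignores sub-registers, bundles and implicit operands, and uses small integers for registers, so it is illustrative rather than the LLVM implementation.

// Minimal sketch of the bottom-up kill-flag recomputation done by fixupKills:
// walking instructions in reverse, a use of a register that is not currently
// live is its last use (a kill); definitions end liveness.
#include <cassert>
#include <cstdio>
#include <set>
#include <vector>

struct Instr {
  std::vector<int> Defs, Uses;
  std::vector<bool> KillFlags; // parallel to Uses, filled in below
};

void fixupKills(std::vector<Instr> &Block, std::set<int> LiveOut) {
  std::set<int> Live = LiveOut;
  for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
    // Definitions end the live range of the defined register.
    for (int D : I->Defs)
      Live.erase(D);
    // A use of a register not currently live is its last use: mark it killed
    // and make the register live above this point.
    I->KillFlags.assign(I->Uses.size(), false);
    for (size_t U = 0; U < I->Uses.size(); ++U) {
      if (!Live.count(I->Uses[U])) {
        I->KillFlags[U] = true;
        Live.insert(I->Uses[U]);
      }
    }
  }
}

int main() {
  // r1 = ...        (def r1)
  // r2 = use r1     (r1 still needed below, so no kill here)
  // r3 = use r1, r2 (last uses of both r1 and r2)
  std::vector<Instr> Block = {{{1}, {}, {}}, {{2}, {1}, {}}, {{3}, {1, 2}, {}}};
  fixupKills(Block, /*LiveOut=*/{3});
  assert(!Block[1].KillFlags[0]);                         // r1 not killed yet
  assert(Block[2].KillFlags[0] && Block[2].KillFlags[1]); // killed at last use
  std::puts("kill flags recomputed");
}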
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) SU->getInstr()->dump(); #endif @@ -1347,7 +1264,7 @@ std::string ScheduleDAGInstrs::getDAGName() const { //===----------------------------------------------------------------------===// namespace llvm { -/// \brief Internal state used to compute SchedDFSResult. +/// Internal state used to compute SchedDFSResult. class SchedDFSImpl { SchedDFSResult &R; @@ -1358,8 +1275,8 @@ class SchedDFSImpl { struct RootData { unsigned NodeID; - unsigned ParentNodeID; // Parent node (member of the parent subtree). - unsigned SubInstrCount; // Instr count in this tree only, not children. + unsigned ParentNodeID; ///< Parent node (member of the parent subtree). + unsigned SubInstrCount; ///< Instr count in this tree only, not children. RootData(unsigned id): NodeID(id), ParentNodeID(SchedDFSResult::InvalidSubtreeID), @@ -1375,7 +1292,7 @@ public: RootSet.setUniverse(R.DFSNodeData.size()); } - /// Return true if this node been visited by the DFS traversal. + /// Returns true if this node been visited by the DFS traversal. /// /// During visitPostorderNode the Node's SubtreeID is assigned to the Node /// ID. Later, SubtreeID is updated but remains valid. @@ -1384,7 +1301,7 @@ public: != SchedDFSResult::InvalidSubtreeID; } - /// Initialize this node's instruction count. We don't need to flag the node + /// Initializes this node's instruction count. We don't need to flag the node /// visited until visitPostorder because the DAG cannot have cycles. void visitPreorder(const SUnit *SU) { R.DFSNodeData[SU->NodeNum].InstrCount = @@ -1433,8 +1350,8 @@ public: RootSet[SU->NodeNum] = RData; } - /// Called once for each tree edge after calling visitPostOrderNode on the - /// predecessor. Increment the parent node's instruction count and + /// \brief Called once for each tree edge after calling visitPostOrderNode on + /// the predecessor. Increment the parent node's instruction count and /// preemptively join this subtree to its parent's if it is small enough. void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) { R.DFSNodeData[Succ->NodeNum].InstrCount @@ -1442,13 +1359,13 @@ public: joinPredSubtree(PredDep, Succ); } - /// Add a connection for cross edges. + /// Adds a connection for cross edges. void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) { ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ)); } - /// Set each node's subtree ID to the representative ID and record connections - /// between trees. + /// Sets each node's subtree ID to the representative ID and record + /// connections between trees. void finalize() { SubtreeClasses.compress(); R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); @@ -1484,8 +1401,8 @@ public: } protected: - /// Join the predecessor subtree with the successor that is its DFS - /// parent. Apply some heuristics before joining. + /// Joins the predecessor subtree with the successor that is its DFS parent. + /// Applies some heuristics before joining. bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ, bool CheckLimit = true) { assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); @@ -1531,10 +1448,10 @@ protected: } while (FromTree != SchedDFSResult::InvalidSubtreeID); } }; -} // namespace llvm +} // end namespace llvm namespace { -/// \brief Manage the stack used by a reverse depth-first search over the DAG. +/// Manage the stack used by a reverse depth-first search over the DAG. 
class SchedDAGReverseDFS { std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack; public: @@ -1569,7 +1486,7 @@ static bool hasDataSucc(const SUnit *SU) { return false; } -/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first +/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first /// search from this root. void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { if (!IsBottomUp) @@ -1626,8 +1543,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) { } } -LLVM_DUMP_METHOD -void ILPValue::print(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const { OS << InstrCount << " / " << Length << " = "; if (!Length) OS << "BADILP"; @@ -1635,8 +1552,7 @@ void ILPValue::print(raw_ostream &OS) const { OS << format("%g", ((double)InstrCount / Length)); } -LLVM_DUMP_METHOD -void ILPValue::dump() const { +LLVM_DUMP_METHOD void ILPValue::dump() const { dbgs() << *this << '\n'; } @@ -1648,4 +1564,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { return OS; } -} // namespace llvm +} // end namespace llvm +#endif diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 83bc1ba7beb94..b3d83d5313aff 100644 --- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -1,4 +1,4 @@ -//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===// +//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===// // // The LLVM Compiler Infrastructure // @@ -15,11 +15,13 @@ #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include <cassert> using namespace llvm; @@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer( const InstrItineraryData *II, const ScheduleDAG *SchedDAG, const char *ParentDebugType) : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II), - DAG(SchedDAG), IssueWidth(0), IssueCount(0) { - + DAG(SchedDAG) { // Determine the maximum depth of any itinerary. This determines the depth of // the scoreboard. We always make the scoreboard at least 1 cycle deep to // avoid dealing with the boundary condition. diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c7bffe76503f..4d468551ae24e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -53,10 +53,6 @@ STATISTIC(SlicedLoads, "Number of load sliced"); namespace { static cl::opt<bool> - CombinerAA("combiner-alias-analysis", cl::Hidden, - cl::desc("Enable DAG combiner alias-analysis heuristics")); - - static cl::opt<bool> CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); @@ -133,6 +129,9 @@ namespace { /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { + assert(N->getOpcode() != ISD::DELETED_NODE && + "Deleted Node added to Worklist"); + // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. 
if (N->getOpcode() == ISD::HANDLENODE) @@ -177,6 +176,7 @@ namespace { void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: + unsigned MaximumLegalStoreInBits; /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. @@ -232,9 +232,12 @@ namespace { SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); + SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); + SDValue visitUADDO(SDNode *N); SDValue visitSUBC(SDNode *N); + SDValue visitUSUBO(SDNode *N); SDValue visitADDE(SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitMUL(SDNode *N); @@ -259,6 +262,7 @@ namespace { SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); + SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); @@ -274,6 +278,7 @@ namespace { SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); + SDValue visitAssertZext(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); @@ -336,6 +341,7 @@ namespace { SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); + SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); @@ -344,6 +350,8 @@ namespace { bool NotExtCompare = false); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); + SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans = true); @@ -377,6 +385,7 @@ namespace { unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); @@ -384,9 +393,9 @@ namespace { SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); - SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask, - SDValue VecIn1, SDValue VecIn2, - unsigned LeftIdx); + SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, + ArrayRef<int> VectorMask, SDValue VecIn1, + SDValue VecIn2, unsigned LeftIdx); SDValue GetDemandedBits(SDValue V, const APInt &Mask); @@ -416,15 +425,12 @@ namespace { /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } + MemOpLink(LSBaseSDNode *N, int64_t Offset) + : MemNode(N), OffsetFromBase(Offset) {} // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; - // What is the sequence number of this mem node. - // Lowest mem operand in the DAG starts at zero. 
- unsigned SequenceNum; }; /// This is a helper function for visitMUL to check the profitability @@ -435,12 +441,6 @@ namespace { SDValue &AddNode, SDValue &ConstNode); - /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a - /// constant build_vector of the stored constant values in Stores. - SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL, - ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, - EVT Ty) const; /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns @@ -451,34 +451,35 @@ namespace { EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, bool &NarrowLoad); + /// Helper function for MergeConsecutiveStores which merges the + /// component store chains. + SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. - /// \return number of stores that were merged into a merged store (always - /// a prefix of \p StoreNode). - bool MergeStoresOfConstantsOrVecElts( - SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, - bool IsConstantSrc, bool UseVector); + /// \return True if a merged store was created. + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, + EVT MemVT, unsigned NumStores, + bool IsConstantSrc, bool UseVector); /// This is a helper function for MergeConsecutiveStores. /// Stores that may be merged are placed in StoreNodes. - /// Loads that may alias with those stores are placed in AliasLoadNodes. - void getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); + void getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl<MemOpLink> &StoreNodes); /// Helper function for MergeConsecutiveStores. Checks if /// Candidate stores have indirect dependency through their /// operands. \return True if safe to merge bool checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes); + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return number of stores that were merged into a merged store (the /// affected nodes are stored as a prefix in \p StoreNodes). 
- bool MergeConsecutiveStores(StoreSDNode *N, - SmallVectorImpl<MemOpLink> &StoreNodes); + bool MergeConsecutiveStores(StoreSDNode *N); /// \brief Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) @@ -493,6 +494,13 @@ namespace { : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); + + MaximumLegalStoreInBits = 0; + for (MVT VT : MVT::all_valuetypes()) + if (EVT(VT).isSimple() && VT != MVT::Other && + TLI.isTypeLegal(EVT(VT)) && + VT.getSizeInBits() >= MaximumLegalStoreInBits) + MaximumLegalStoreInBits = VT.getSizeInBits(); } /// Runs the dag combiner on all nodes in the work list @@ -607,10 +615,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, switch (Op.getOpcode()) { default: return false; - case ISD::ConstantFP: - // Don't invert constant FP values after legalize. The negated constant - // isn't necessarily legal. - return LegalOperations ? 0 : 1; + case ISD::ConstantFP: { + if (!LegalOperations) + return 1; + + // Don't invert constant FP values after legalization unless the target says + // the negated constant is legal. + EVT VT = Op.getValueType(); + return TLI.isOperationLegal(ISD::ConstantFP, VT) || + TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT); + } case ISD::FADD: // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; @@ -629,7 +643,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros()) + if (!Options->NoSignedZerosFPMath && + !Op.getNode()->getFlags()->hasNoSignedZeros()) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) @@ -1079,37 +1094,36 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); - if (!NN0.getNode()) - return SDValue(); bool Replace1 = false; SDValue N1 = Op.getOperand(1); - SDValue NN1; - if (N0 == N1) - NN1 = NN0; - else { - NN1 = PromoteOperand(N1, PVT, Replace1); - if (!NN1.getNode()) - return SDValue(); - } + SDValue NN1 = PromoteOperand(N1, PVT, Replace1); + SDLoc DL(Op); - AddToWorklist(NN0.getNode()); - if (NN1.getNode()) - AddToWorklist(NN1.getNode()); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); - if (Replace0) + // New replace instances of N0 and N1 + if (Replace0 && N0 && N0.getOpcode() != ISD::DELETED_NODE && NN0 && + NN0.getOpcode() != ISD::DELETED_NODE) { + AddToWorklist(NN0.getNode()); ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); - if (Replace1) + } + + if (Replace1 && N1 && N1.getOpcode() != ISD::DELETED_NODE && NN1 && + NN1.getOpcode() != ISD::DELETED_NODE) { + AddToWorklist(NN1.getNode()); ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); + } - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, NN0, NN1)); + // Deal with Op being deleted. 
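The isNegatibleForFree change above keeps a signed-zeros guard on the FSUB case because -(A - B) and B - A disagree when A == B == 0.0: one yields -0.0, the other +0.0. A tiny demonstration of why, using plain doubles rather than DAG nodes:

// Why -(A - B) => (B - A) is only legal under no-signed-zeros: for
// A == B == 0.0 the two expressions produce zeros of opposite sign.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double A = 0.0, B = 0.0;
  double Negated = -(A - B); // -(+0.0) == -0.0
  double Swapped = B - A;    // +0.0
  assert(std::signbit(Negated) && !std::signbit(Swapped));
  std::printf("-(A-B) = %g, B-A = %g\n", Negated, Swapped);
}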
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE) + return RV; } return SDValue(); } @@ -1137,26 +1151,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace = false; SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) - N0 = SExtPromoteOperand(Op.getOperand(0), PVT); + N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) - N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); + N0 = ZExtPromoteOperand(N0, PVT); else N0 = PromoteOperand(N0, PVT, Replace); + if (!N0.getNode()) return SDValue(); + SDLoc DL(Op); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); + AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1))); + // Deal with Op being deleted. + if (Op && Op.getOpcode() != ISD::DELETED_NODE) + return RV; } return SDValue(); } @@ -1361,8 +1381,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); - SDValue OpV = RV; - DAG.ReplaceAllUsesWith(N, &OpV); + DAG.ReplaceAllUsesWith(N, &RV); } // Push the new node and any users onto the worklist @@ -1389,7 +1408,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); + case ISD::UADDO: return visitUADDO(N); case ISD::SUBC: return visitSUBC(N); + case ISD::USUBO: return visitUSUBO(N); case ISD::ADDE: return visitADDE(N); case ISD::SUBE: return visitSUBE(N); case ISD::MUL: return visitMUL(N); @@ -1415,6 +1436,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); + case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); @@ -1430,6 +1452,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); + case ISD::AssertZext: return visitAssertZext(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); @@ -1574,7 +1597,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } SmallVector<SDNode *, 8> TFs; // List of token factors to visit. - SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. + SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. SmallPtrSet<SDNode*, 16> SeenOps; bool Changed = false; // If we should replace this token factor. @@ -1618,6 +1641,86 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } } + // Remove Nodes that are chained to another node in the list. Do so + // by walking up chains breath-first stopping when we've seen + // another operand. In general we must climb to the EntryNode, but we can exit + // early if we find all remaining work is associated with just one operand as + // no further pruning is possible. 
+ + // List of nodes to search through and original Ops from which they originate. + SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist; + SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op. + SmallPtrSet<SDNode *, 16> SeenChains; + bool DidPruneOps = false; + + unsigned NumLeftToConsider = 0; + for (const SDValue &Op : Ops) { + Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); + OpWorkCount.push_back(1); + } + + auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { + // If this is an Op, we can remove the op from the list. Remark any + // search associated with it as from the current OpNumber. + if (SeenOps.count(Op) != 0) { + Changed = true; + DidPruneOps = true; + unsigned OrigOpNumber = 0; + while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op) + OrigOpNumber++; + assert((OrigOpNumber != Ops.size()) && + "expected to find TokenFactor Operand"); + // Re-mark worklist from OrigOpNumber to OpNumber + for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) { + if (Worklist[i].second == OrigOpNumber) { + Worklist[i].second = OpNumber; + } + } + OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber]; + OpWorkCount[OrigOpNumber] = 0; + NumLeftToConsider--; + } + // Add if it's a new chain + if (SeenChains.insert(Op).second) { + OpWorkCount[OpNumber]++; + Worklist.push_back(std::make_pair(Op, OpNumber)); + } + }; + + for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) { + // We need at least be consider at least 2 Ops to prune. + if (NumLeftToConsider <= 1) + break; + auto CurNode = Worklist[i].first; + auto CurOpNumber = Worklist[i].second; + assert((OpWorkCount[CurOpNumber] > 0) && + "Node should not appear in worklist"); + switch (CurNode->getOpcode()) { + case ISD::EntryToken: + // Hitting EntryToken is the only way for the search to terminate without + // hitting + // another operand's search. Prevent us from marking this operand + // considered. + NumLeftToConsider++; + break; + case ISD::TokenFactor: + for (const SDValue &Op : CurNode->op_values()) + AddToWorklist(i, Op.getNode(), CurOpNumber); + break; + case ISD::CopyFromReg: + case ISD::CopyToReg: + AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); + break; + default: + if (auto *MemNode = dyn_cast<MemSDNode>(CurNode)) + AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); + break; + } + OpWorkCount[CurOpNumber]--; + if (OpWorkCount[CurOpNumber] == 0) + NumLeftToConsider--; + } + SDValue Result; // If we've changed things around then replace token factor. @@ -1626,15 +1729,22 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); } else { - // New and improved token factor. - Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); + if (DidPruneOps) { + SmallVector<SDValue, 8> PrunedOps; + // + for (const SDValue &Op : Ops) { + if (SeenChains.count(Op.getNode()) == 0) + PrunedOps.push_back(Op); + } + Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps); + } else { + Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); + } } - // Add users to worklist if AA is enabled, since it may introduce - // a lot of new chained token factors while removing memory deps. - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? 
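The pruning pass added to visitTokenFactor drops operands that some other operand already orders, discovered by walking up the chain from each operand. The sketch below shows the same idea in a simplified O(n*m) form on a plain chain-predecessor graph; it omits the 1024-node budget, the per-operand work counts and the early exit used in the patch, and all node numbers are invented.

// Sketch of the TokenFactor operand pruning idea: an operand is redundant if
// walking up the chain from some *other* operand reaches it, because that
// other operand already orders it.
#include <cassert>
#include <cstdio>
#include <set>
#include <vector>

using Chains = std::vector<std::vector<int>>; // node -> chain predecessors

std::vector<int> pruneTokenFactorOps(const Chains &Pred,
                                     const std::vector<int> &Ops) {
  std::set<int> OpSet(Ops.begin(), Ops.end());
  std::set<int> Redundant;
  for (int Op : Ops) {
    // Walk up the chain from Op; any other operand we meet is redundant.
    std::vector<int> Work(Pred[Op].begin(), Pred[Op].end());
    std::set<int> Seen;
    while (!Work.empty()) {
      int N = Work.back();
      Work.pop_back();
      if (!Seen.insert(N).second)
        continue;
      if (OpSet.count(N) && N != Op)
        Redundant.insert(N);
      Work.insert(Work.end(), Pred[N].begin(), Pred[N].end());
    }
  }
  std::vector<int> Pruned;
  for (int Op : Ops)
    if (!Redundant.count(Op))
      Pruned.push_back(Op);
  return Pruned;
}

int main() {
  // Chain graph: 0 is the entry token; 1 depends on 0; 2 depends on 1.
  Chains Pred = {{}, {0}, {1}};
  // TokenFactor(1, 2): node 1 is reachable from node 2's chain, so keep only 2.
  std::vector<int> Pruned = pruneTokenFactorOps(Pred, {1, 2});
  assert(Pruned.size() == 1 && Pruned[0] == 2);
  std::printf("kept %zu of 2 operands\n", Pruned.size());
}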
CombinerAA - : DAG.getSubtarget().useAA(); - return CombineTo(N, Result, UseAA /*add to worklist*/); + // Add users to worklist, since we may introduce a lot of new + // chained token factors while removing memory deps. + return CombineTo(N, Result, true /*add to worklist*/); } return Result; @@ -1664,6 +1774,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { return Const != nullptr && !Const->isOpaque() ? Const : nullptr; } +SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { + auto BinOpcode = BO->getOpcode(); + assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || + BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || + BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || + BinOpcode == ISD::UREM || BinOpcode == ISD::AND || + BinOpcode == ISD::OR || BinOpcode == ISD::XOR || + BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || + BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || + BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || + BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && + "Unexpected binary operator"); + + // Bail out if any constants are opaque because we can't constant fold those. + SDValue C1 = BO->getOperand(1); + if (!isConstantOrConstantVector(C1, true) && + !isConstantFPBuildVectorOrConstantFP(C1)) + return SDValue(); + + // Don't do this unless the old select is going away. We want to eliminate the + // binary operator, not replace a binop with a select. + // TODO: Handle ISD::SELECT_CC. + SDValue Sel = BO->getOperand(0); + if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) + return SDValue(); + + SDValue CT = Sel.getOperand(1); + if (!isConstantOrConstantVector(CT, true) && + !isConstantFPBuildVectorOrConstantFP(CT)) + return SDValue(); + + SDValue CF = Sel.getOperand(2); + if (!isConstantOrConstantVector(CF, true) && + !isConstantFPBuildVectorOrConstantFP(CF)) + return SDValue(); + + // We have a select-of-constants followed by a binary operator with a + // constant. Eliminate the binop by pulling the constant math into the select. 
+ // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1 + EVT VT = Sel.getValueType(); + SDLoc DL(Sel); + SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1); + assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) || + isConstantFPBuildVectorOrConstantFP(NewCT)) && + "Failed to constant fold a binop with constant operands"); + + SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1); + assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) || + isConstantFPBuildVectorOrConstantFP(NewCF)) && + "Failed to constant fold a binop with constant operands"); + + return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); +} + SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1712,6 +1876,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) { } } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate add if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1)) return RADD; @@ -1774,6 +1941,19 @@ SDValue DAGCombiner::visitADD(SDNode *N) { VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); + if (SDValue Combined = visitADDLike(N0, N1, N)) + return Combined; + + if (SDValue Combined = visitADDLike(N1, N0, N)) + return Combined; + + return SDValue(); +} + +SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { + EVT VT = N0.getValueType(); + SDLoc DL(LocReference); + // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) @@ -1781,12 +1961,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) { DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); - if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0))) - return DAG.getNode(ISD::SUB, DL, VT, N1, - DAG.getNode(ISD::SHL, DL, VT, - N0.getOperand(0).getOperand(1), - N0.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); @@ -1797,7 +1971,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && isOneConstantOrOneSplatConstant(N1->getOperand(1))) - return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0); + return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } // add (sext i1), X -> sub X, (zext i1) @@ -1825,39 +1999,61 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) - return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); if (N0C && !N1C) - return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0); + return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + DL, MVT::Glue)); - // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. 
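foldBinOpIntoSelect simply distributes the constant operand over both arms of the select, as the "Example:" comment above says. A trivial check of that identity with plain integers, for both values of the condition (a stand-in arithmetic check, not the combiner itself):

// binop(select(Cond, CT, CF), C1) -> select(Cond, CT op C1, CF op C1)
#include <cassert>

int selectOp(bool Cond, int T, int F) { return Cond ? T : F; }

int main() {
  const int CT = 3, CF = 7, C1 = 10;
  for (bool Cond : {false, true}) {
    int Before = selectOp(Cond, CT, CF) + C1;     // add applied after the select
    int After = selectOp(Cond, CT + C1, CF + C1); // constants folded into the arms
    assert(Before == After);
  }
}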
- APInt LHSZero, LHSOne; - APInt RHSZero, RHSOne; - DAG.computeKnownBits(N0, LHSZero, LHSOne); + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); - if (LHSZero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSZero, RHSOne); + return SDValue(); +} - // If all possibly-set bits on the LHS are clear on the RHS, return an OR. - // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) - return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); - } +SDValue DAGCombiner::visitUADDO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an ADD. + if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // canonicalize constant to RHS. + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); + + // fold (uaddo x, 0) -> x + no carry out + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getConstant(0, DL, CarryVT)); return SDValue(); } @@ -1920,6 +2116,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { N1.getNode()); } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (sub x, c) -> (add x, -c) @@ -2066,6 +2265,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitUSUBO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an SUB. + if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // fold (usubo x, x) -> 0 + no borrow + if (N0 == N1) + return CombineTo(N, DAG.getConstant(0, DL, VT), + DAG.getConstant(0, DL, CarryVT)); + + // fold (usubo x, 0) -> x + no borrow + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + + // Canonicalize (usubo -1, x) -> ~x, i.e. 
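The new visitUADDO models unsigned addition producing a sum plus a carry-out bit, and folds the node to a plain ADD whenever the carry is provably zero, for instance when one operand is 0. A small sketch of those semantics on uint32_t; the helper name is illustrative:

// UADDO semantics: a sum and a carry-out. (uaddo x, 0) can never carry,
// which is why it folds to x plus a constant-zero carry.
#include <cassert>
#include <cstdint>
#include <utility>

std::pair<uint32_t, bool> uaddo(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;   // wraps on overflow
  return {Sum, Sum < A};  // carry-out iff the sum wrapped
}

int main() {
  // (uaddo x, 0) -> x + no carry, for any x.
  for (uint32_t X : {0u, 1u, 0x7fffffffu, 0xffffffffu}) {
    std::pair<uint32_t, bool> R = uaddo(X, 0);
    assert(R.first == X && !R.second);
  }
  // A case where the carry really is needed.
  std::pair<uint32_t, bool> R = uaddo(0xffffffffu, 1);
  assert(R.first == 0 && R.second);
}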
(xor x, -1) + no borrow + if (isAllOnesConstant(N0)) + return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), + DAG.getConstant(0, DL, CarryVT)); + + return SDValue(); +} + SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2131,6 +2362,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // fold (mul x, 1) -> x if (N1IsConst && ConstValue1 == 1 && IsFullSplat) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) { SDLoc DL(N); @@ -2297,6 +2532,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { return combined; } +static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (DAG.isUndef(N->getOpcode(), {N0, N1})) + return DAG.getUNDEF(VT); + + // undef / X -> 0 + // undef % X -> 0 + if (N0.isUndef()) + return DAG.getConstant(0, DL, VT); + + return SDValue(); +} + SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2319,8 +2571,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), N0); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); + + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 @@ -2372,7 +2629,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. 
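visitUSUBO's folds rest on the corresponding unsigned-subtract identities: x - x and x - 0 never borrow, and all-ones minus x equals ~x with no borrow. A quick exhaustive check of the last identity over 8-bit values (an arithmetic sanity check, not DAG code):

// (usubo -1, x) -> (xor x, -1) + no borrow, verified for every 8-bit x.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t AllOnes = 0xff;
    uint8_t Diff = static_cast<uint8_t>(AllOnes - X);
    bool Borrow = AllOnes < X; // never true, since X <= 0xff
    assert(Diff == static_cast<uint8_t>(~X) && !Borrow);
  }
}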
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; @@ -2384,13 +2641,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2414,6 +2664,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (udiv x, (1 << c)) -> x >>u c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1)) { @@ -2444,7 +2700,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } // fold (udiv x, c) -> alternate - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; @@ -2456,13 +2712,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2482,32 +2731,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (isSigned) { // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { - // fold (urem x, pow2) -> (and x, pow2-1) + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, pow2) -> (and x, pow2-1) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } } - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. 
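The (urem x, pow2) -> (and x, pow2-1) fold kept in visitREM above relies on the remainder by a power of two being just the low bits of the dividend. An exhaustive 8-bit check of that identity:

// urem by a power of two is a mask of the low bits.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Pow2 = 1; Pow2 <= 128; Pow2 <<= 1)
    for (unsigned X = 0; X <= 0xff; ++X)
      assert(X % Pow2 == (X & (Pow2 - 1)));
}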
@@ -2536,13 +2788,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem.getValue(1); - // undef % X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X % undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2932,95 +3177,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { return SDValue(); } +/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. +SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL) { + SDValue LL, LR, RL, RR, N0CC, N1CC; + if (!isSetCCEquivalent(N0, LL, LR, N0CC) || + !isSetCCEquivalent(N1, RL, RR, N1CC)) + return SDValue(); + + assert(N0.getValueType() == N1.getValueType() && + "Unexpected operand types for bitwise logic op"); + assert(LL.getValueType() == LR.getValueType() && + RL.getValueType() == RR.getValueType() && + "Unexpected operand types for setcc"); + + // If we're here post-legalization or the logic op type is not i1, the logic + // op type must match a setcc result type. Also, all folds require new + // operations on the left and right operands, so those types must match. + EVT VT = N0.getValueType(); + EVT OpVT = LL.getValueType(); + if (LegalOperations || VT != MVT::i1) + if (VT != getSetCCResultType(OpVT)) + return SDValue(); + if (OpVT != RL.getValueType()) + return SDValue(); + + ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get(); + ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get(); + bool IsInteger = OpVT.isInteger(); + if (LR == RR && CC0 == CC1 && IsInteger) { + bool IsZero = isNullConstantOrNullSplatConstant(LR); + bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); + + // All bits clear? + bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; + // All sign bits clear? + bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; + // Any bits set? + bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; + // Any sign bits set? + bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; + + // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) + // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) + // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) + // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) + if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { + SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(Or.getNode()); + return DAG.getSetCC(DL, VT, Or, LR, CC1); + } + + // All bits set? + bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; + // All sign bits set? + bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; + // Any bits clear? + bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; + // Any sign bits clear? + bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; + + // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) + // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) + // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) + // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) + if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { + SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(And.getNode()); + return DAG.getSetCC(DL, VT, And, LR, CC1); + } + } + + // TODO: What is the 'or' equivalent of this fold? 
+ // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) + if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE && + ((isNullConstant(LR) && isAllOnesConstant(RR)) || + (isAllOnesConstant(LR) && isNullConstant(RR)))) { + SDValue One = DAG.getConstant(1, DL, OpVT); + SDValue Two = DAG.getConstant(2, DL, OpVT); + SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); + AddToWorklist(Add.getNode()); + return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); + } + + // Try more general transforms if the predicates match and the only user of + // the compares is the 'and' or 'or'. + if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && + N0.hasOneUse() && N1.hasOneUse()) { + // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 + // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 + if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { + SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); + SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); + SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); + SDValue Zero = DAG.getConstant(0, DL, OpVT); + return DAG.getSetCC(DL, VT, Or, Zero, CC1); + } + } + + // Canonicalize equivalent operands to LL == RL. + if (LL == RR && LR == RL) { + CC1 = ISD::getSetCCSwappedOperands(CC1); + std::swap(RL, RR); + } + + // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + if (LL == RL && LR == RR) { + ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) + : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); + if (NewCC != ISD::SETCC_INVALID && + (!LegalOperations || + (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && + TLI.isOperationLegal(ISD::SETCC, OpVT)))) + return DAG.getSetCC(DL, VT, LL, LR, NewCC); + } + + return SDValue(); +} + /// This contains all DAGCombine rules which reduce two values combined by /// an And operation to a single value. This makes them reusable in the context /// of visitSELECT(). Rules involving constants are not included as /// visitSELECT() already handles those cases. 
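Two of the identities now centralized in foldLogicOfSetCCs, checked exhaustively on 8-bit values; this is a stand-in arithmetic proof of the folds, not the combiner code itself:

//   (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
//   (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XI = 0; XI <= 0xff; ++XI) {
    uint8_t X = static_cast<uint8_t>(XI);
    for (unsigned YI = 0; YI <= 0xff; ++YI) {
      uint8_t Y = static_cast<uint8_t>(YI);
      bool Lhs = (X == 0) && (Y == 0);
      bool Rhs = static_cast<uint8_t>(X | Y) == 0;
      assert(Lhs == Rhs);
    }
    bool Lhs = (X != 0) && (X != 0xff);          // -1 is all-ones for i8
    bool Rhs = static_cast<uint8_t>(X + 1) >= 2; // unsigned compare after the add
    assert(Lhs == Rhs);
  }
}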
-SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, - SDNode *LocReference) { +SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); // fold (and x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) - return DAG.getConstant(0, SDLoc(LocReference), VT); - // fold (and (setcc x), (setcc y)) -> (setcc (and x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 && - LL.getValueType().isInteger()) { - // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0) - if (isNullConstant(LR) && Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - if (isAllOnesConstant(LR)) { - // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) - if (Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) - if (Op1 == ISD::SETGT) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - } - } - // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2) - if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) && - Op0 == Op1 && LL.getValueType().isInteger() && - Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || - (isAllOnesConstant(LR) && isNullConstant(RR)))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDLoc DL(N0); - SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(), - LL, DAG.getConstant(1, DL, - LL.getValueType())); - AddToWorklist(ADDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode, - DAG.getConstant(2, DL, LL.getValueType()), - ISD::SETUGE); - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + return DAG.getConstant(0, DL, VT); + + if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) + return V; if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && 
VT.getSizeInBits() <= 64) { @@ -3037,13 +3326,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { - SDLoc DL(N0); + SDLoc DL0(N0); SDValue NewAdd = - DAG.getNode(ISD::ADD, DL, VT, + DAG.getNode(ISD::ADD, DL0, VT, N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); CombineTo(N0.getNode(), NewAdd); // Return N so it doesn't get rechecked! - return SDValue(LocReference, 0); + return SDValue(N, 0); } } } @@ -3068,7 +3357,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, unsigned MaskBits = AndMask.countTrailingOnes(); EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); - if (APIntOps::isMask(AndMask) && + if (AndMask.isMask() && // Required bits must not span the two halves of the integer and // must fit in the half size type. (ShiftBits + MaskBits <= Size / 2) && @@ -3108,7 +3397,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, bool &NarrowLoad) { uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); - if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) + if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits)) return false; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -3191,6 +3480,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; @@ -3299,6 +3592,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); + + // Fold the AND away. NewLoad may get replaced immediately. + CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); + if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), @@ -3316,10 +3613,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - // Fold the AND away, taking care not to fold to the old load node if we - // replaced it. - CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); - return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -3723,65 +4016,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { /// This contains all DAGCombine rules which reduce two values combined by /// an Or operation to a single value \see visitANDLike(). -SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { +SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); + // fold (or x, undef) -> -1 - if (!LegalOperations && - (N0.isUndef() || N1.isUndef())) { - EVT EltVT = VT.isVector() ? 
VT.getVectorElementType() : VT; - return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), - SDLoc(LocReference), VT); - } - // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) { - // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0) - // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0) - if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) - // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1) - if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + if (!LegalOperations && (N0.isUndef() || N1.isUndef())) + return DAG.getAllOnesConstant(DL, VT); + + if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) + return V; // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. 
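A minimal sketch (illustrative only, not from the patch) of one of the setcc folds that the deleted code above performed by hand and that the new foldLogicOfSetCCs helper is now expected to cover, namely (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0):

    // Two equality tests joined by AND collapse to one OR plus one test.
    bool both_zero(unsigned x, unsigned y) {
      return x == 0 && y == 0;   // folds to: (x | y) == 0
    }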
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && @@ -3802,7 +4046,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); - SDLoc DL(LocReference); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(LHSMask | RHSMask, DL, VT)); } @@ -3818,7 +4061,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); - return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X); + return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); } return SDValue(); @@ -3847,14 +4090,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 - return DAG.getConstant( - APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N), - N0.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 - return DAG.getConstant( - APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N), - N1.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) // Do this only if the resulting shuffle is legal. @@ -3867,7 +4106,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); // Ensure both shuffles have a zero input. - if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) { + if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); @@ -3939,6 +4178,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1 if (isAllOnesConstant(N1)) return N1; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; @@ -3956,7 +4199,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) - // iff (c1 & c2) == 0. + // iff (c1 & c2) != 0. if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && isa<ConstantSDNode>(N0.getOperand(1))) { ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1)); @@ -3978,6 +4221,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); + if (SDValue Load = MatchLoadCombine(N)) + return Load; + // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) @@ -4190,8 +4436,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // If there is an AND of either shifted operand, apply it to the result. 
if (LHSMask.getNode() || RHSMask.getNode()) { - APInt AllBits = APInt::getAllOnesValue(EltSizeInBits); - SDValue Mask = DAG.getConstant(AllBits, DL, VT); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); if (LHSMask.getNode()) { APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); @@ -4349,6 +4594,299 @@ struct BaseIndexOffset { }; } // namespace +namespace { +/// Represents known origin of an individual byte in load combine pattern. The +/// value of the byte is either constant zero or comes from memory. +struct ByteProvider { + // For constant zero providers Load is set to nullptr. For memory providers + // Load represents the node which loads the byte from memory. + // ByteOffset is the offset of the byte in the value produced by the load. + LoadSDNode *Load; + unsigned ByteOffset; + + ByteProvider() : Load(nullptr), ByteOffset(0) {} + + static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { + return ByteProvider(Load, ByteOffset); + } + static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } + + bool isConstantZero() const { return !Load; } + bool isMemory() const { return Load; } + + bool operator==(const ByteProvider &Other) const { + return Other.Load == Load && Other.ByteOffset == ByteOffset; + } + +private: + ByteProvider(LoadSDNode *Load, unsigned ByteOffset) + : Load(Load), ByteOffset(ByteOffset) {} +}; + +/// Recursively traverses the expression calculating the origin of the requested +/// byte of the given value. Returns None if the provider can't be calculated. +/// +/// For all the values except the root of the expression verifies that the value +/// has exactly one use and if it's not true return None. This way if the origin +/// of the byte is returned it's guaranteed that the values which contribute to +/// the byte are not used outside of this expression. +/// +/// Because the parts of the expression are not allowed to have more than one +/// use this function iterates over trees, not DAGs. So it never visits the same +/// node more than once. +const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index, + unsigned Depth, + bool Root = false) { + // Typical i64 by i8 pattern requires recursion up to 8 calls depth + if (Depth == 10) + return None; + + if (!Root && !Op.hasOneUse()) + return None; + + assert(Op.getValueType().isScalarInteger() && "can't handle other types"); + unsigned BitWidth = Op.getValueSizeInBits(); + if (BitWidth % 8 != 0) + return None; + unsigned ByteWidth = BitWidth / 8; + assert(Index < ByteWidth && "invalid index requested"); + (void) ByteWidth; + + switch (Op.getOpcode()) { + case ISD::OR: { + auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); + if (!LHS) + return None; + auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); + if (!RHS) + return None; + + if (LHS->isConstantZero()) + return RHS; + if (RHS->isConstantZero()) + return LHS; + return None; + } + case ISD::SHL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return None; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) + return None; + uint64_t ByteShift = BitShift / 8; + + return Index < ByteShift + ? 
ByteProvider::getConstantZero() + : calculateByteProvider(Op->getOperand(0), Index - ByteShift, + Depth + 1); + } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + SDValue NarrowOp = Op->getOperand(0); + unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return None; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return Op.getOpcode() == ISD::ZERO_EXTEND + ? Optional<ByteProvider>(ByteProvider::getConstantZero()) + : None; + return calculateByteProvider(NarrowOp, Index, Depth + 1); + } + case ISD::BSWAP: + return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, + Depth + 1); + case ISD::LOAD: { + auto L = cast<LoadSDNode>(Op.getNode()); + if (L->isVolatile() || L->isIndexed()) + return None; + + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return None; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return L->getExtensionType() == ISD::ZEXTLOAD + ? Optional<ByteProvider>(ByteProvider::getConstantZero()) + : None; + return ByteProvider::getMemory(L, Index); + } + } + + return None; +} +} // namespace + +/// Match a pattern where a wide type scalar value is loaded by several narrow +/// loads and combined by shifts and ors. Fold it into a single load or a load +/// and a BSWAP if the targets supports it. +/// +/// Assuming little endian target: +/// i8 *a = ... +/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) +/// => +/// i32 val = *((i32)a) +/// +/// i8 *a = ... +/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] +/// => +/// i32 val = BSWAP(*((i32)a)) +/// +/// TODO: This rule matches complex patterns with OR node roots and doesn't +/// interact well with the worklist mechanism. When a part of the pattern is +/// updated (e.g. one of the loads) its direct users are put into the worklist, +/// but the root node of the pattern which triggers the load combine is not +/// necessarily a direct user of the changed node. For example, once the address +/// of t28 load is reassociated load combine won't be triggered: +/// t25: i32 = add t4, Constant:i32<2> +/// t26: i64 = sign_extend t25 +/// t27: i64 = add t2, t26 +/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64 +/// t29: i32 = zero_extend t28 +/// t32: i32 = shl t29, Constant:i8<8> +/// t33: i32 = or t23, t32 +/// As a possible fix visitLoad can check if the load can be a part of a load +/// combine pattern and add corresponding OR roots to the worklist. +SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { + assert(N->getOpcode() == ISD::OR && + "Can only match load combining against OR nodes"); + + // Handles simple types only + EVT VT = N->getValueType(0); + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + unsigned ByteWidth = VT.getSizeInBits() / 8; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. 
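A minimal sketch restating the example from the MatchLoadCombine doc comment above (the function name below is illustrative, not from the patch): on a little-endian target the whole expression folds into a single 32-bit load, and with the bytes assembled in the reverse order it folds into a load plus BSWAP when BSWAP is legal or we are still before legalization:

    #include <stdint.h>
    uint32_t load_le32(const uint8_t *p) {
      return (uint32_t)p[0]         |
             ((uint32_t)p[1] << 8)  |
             ((uint32_t)p[2] << 16) |
             ((uint32_t)p[3] << 24);   // -> one 32-bit load from p
    }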
+ if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = []( + unsigned BW, unsigned i) { return i; }; + std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = []( + unsigned BW, unsigned i) { return BW - i - 1; }; + + bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); + auto MemoryByteOffset = [&] (ByteProvider P) { + assert(P.isMemory() && "Must be a memory byte provider"); + unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); + assert(LoadBitWidth % 8 == 0 && + "can only analyze providers for individual bytes not bit"); + unsigned LoadByteWidth = LoadBitWidth / 8; + return IsBigEndianTarget + ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) + : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); + }; + + Optional<BaseIndexOffset> Base; + SDValue Chain; + + SmallSet<LoadSDNode *, 8> Loads; + Optional<ByteProvider> FirstByteProvider; + int64_t FirstOffset = INT64_MAX; + + // Check if all the bytes of the OR we are looking at are loaded from the same + // base address. Collect bytes offsets from Base address in ByteOffsets. + SmallVector<int64_t, 4> ByteOffsets(ByteWidth); + for (unsigned i = 0; i < ByteWidth; i++) { + auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); + if (!P || !P->isMemory()) // All the bytes must be loaded from memory + return SDValue(); + + LoadSDNode *L = P->Load; + assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && + "Must be enforced by calculateByteProvider"); + assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); + + // All loads must share the same chain + SDValue LChain = L->getChain(); + if (!Chain) + Chain = LChain; + else if (Chain != LChain) + return SDValue(); + + // Loads must share the same base address + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); + if (!Base) + Base = Ptr; + else if (!Base->equalBaseIndex(Ptr)) + return SDValue(); + + // Calculate the offset of the current byte from the base address + int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P); + ByteOffsets[i] = ByteOffsetFromBase; + + // Remember the first byte load + if (ByteOffsetFromBase < FirstOffset) { + FirstByteProvider = P; + FirstOffset = ByteOffsetFromBase; + } + + Loads.insert(L); + } + assert(Loads.size() > 0 && "All the bytes of the value must be loaded from " + "memory, so there must be at least one load which produces the value"); + assert(Base && "Base address of the accessed memory location must be set"); + assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + + // Check if the bytes of the OR we are looking at match with either big or + // little endian value load + bool BigEndian = true, LittleEndian = true; + for (unsigned i = 0; i < ByteWidth; i++) { + int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; + LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); + BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); + if (!BigEndian && !LittleEndian) + return SDValue(); + } + assert((BigEndian != LittleEndian) && "should be either or"); + assert(FirstByteProvider && "must be set"); + + // Ensure that the first byte is loaded from zero offset of the first load. + // So the combined value can be loaded from the first load address. 
+ if (MemoryByteOffset(*FirstByteProvider) != 0) + return SDValue(); + LoadSDNode *FirstLoad = FirstByteProvider->Load; + + // The node we are looking at matches with the pattern, check if we can + // replace it with a single load and bswap if needed. + + // If the load needs byte swap check if the target supports it + bool NeedsBswap = IsBigEndianTarget != BigEndian; + + // Before legalize we can introduce illegal bswaps which will be later + // converted to an explicit bswap sequence. This way we end up with a single + // load and byte shuffling instead of several loads and byte shuffling. + if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) + return SDValue(); + + // Check that a load of the wide type is both allowed and fast on the target + bool Fast = false; + bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + VT, FirstLoad->getAddressSpace(), + FirstLoad->getAlignment(), &Fast); + if (!Allowed || !Fast) + return SDValue(); + + SDValue NewLoad = + DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + + // Transfer chain users from old loads to the new load. + for (LoadSDNode *L : Loads) + DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); + + return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; +} + SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4386,6 +4924,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate xor if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1)) return RXOR; @@ -4403,9 +4945,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: - return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC); + return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: - return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2), + return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } @@ -4470,6 +5012,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { N01C->getAPIntValue(), DL, VT)); } } + + // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) && + TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { + if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) + if (C->getAPIntValue() == (OpSizeInBits - 1)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0)); + } + // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); @@ -4673,6 +5226,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl undef, x) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -4808,9 +5365,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && isConstantOrConstantVector(N1, /* No Opaques */ true)) { - unsigned BitSize = VT.getScalarSizeInBits(); SDLoc 
DL(N); - SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT); + SDValue AllBits = DAG.getAllOnesConstant(DL, VT); SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); } @@ -4877,6 +5433,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { @@ -5024,6 +5584,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -5074,9 +5638,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && isConstantOrConstantVector(N1, /* NoOpaques */ true)) { SDLoc DL(N); - APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); SDValue Mask = - DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1); + DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); AddToWorklist(Mask.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); } @@ -5202,6 +5765,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitABS(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (abs c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); + // fold (abs (abs x)) -> (abs x) + if (N0.getOpcode() == ISD::ABS) + return N0; + // fold (abs x) -> x iff not-negative + if (DAG.SignBitIsZero(N0)) + return N0; + return SDValue(); +} + SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -5217,7 +5796,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + // fold (bitreverse c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); // fold (bitreverse (bitreverse x)) -> x if (N0.getOpcode() == ISD::BITREVERSE) return N0.getOperand(0); @@ -5311,7 +5894,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, } } -// TODO: We should handle other cases of selecting between {-1,0,1} here. SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -5320,6 +5902,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { EVT CondVT = Cond.getValueType(); SDLoc DL(N); + if (!VT.isInteger()) + return SDValue(); + + auto *C1 = dyn_cast<ConstantSDNode>(N1); + auto *C2 = dyn_cast<ConstantSDNode>(N2); + if (!C1 || !C2) + return SDValue(); + + // Only do this before legalization to avoid conflicting with target-specific + // transforms in the other direction (create a select from a zext/sext). There + // is also a target-independent combine here in DAGCombiner in the other + // direction for (select Cond, -1, 0) when the condition is not i1. 
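A minimal sketch (illustrative names, not from the patch) of the i1-condition folds added in the hunk that follows, written as equivalent scalar C++; sign-extending an i1 true value gives -1, which is why the all-ones cases become a negated zero-extension:

    int sel_0_1(bool c)  { return c ? 0 : 1;  }   // -> (int)!c     zext(!c)
    int sel_0_m1(bool c) { return c ? 0 : -1; }   // -> -(int)!c    sext(!c)
    int sel_1_0(bool c)  { return c ? 1 : 0;  }   // -> (int)c      zext(c)
    int sel_m1_0(bool c) { return c ? -1 : 0; }   // -> -(int)c     sext(c)
    // When the target opts in via convertSelectOfConstantsToMath():
    int sel_k_km1(bool c, int k) { return c ? k : k - 1; }   // -> (int)c + (k - 1)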
+ if (CondVT == MVT::i1 && !LegalOperations) { + if (C1->isNullValue() && C2->isOne()) { + // select Cond, 0, 1 --> zext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isNullValue() && C2->isAllOnesValue()) { + // select Cond, 0, -1 --> sext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isOne() && C2->isNullValue()) { + // select Cond, 1, 0 --> zext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return Cond; + } + if (C1->isAllOnesValue() && C2->isNullValue()) { + // select Cond, -1, 0 --> sext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return Cond; + } + + // For any constants that differ by 1, we can transform the select into an + // extend and add. Use a target hook because some targets may prefer to + // transform in the other direction. + if (TLI.convertSelectOfConstantsToMath()) { + if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { + // select Cond, C1, C1-1 --> add (zext Cond), C1-1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { + // select Cond, C1, C1+1 --> add (sext Cond), C1+1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + } + + return SDValue(); + } + // fold (select Cond, 0, 1) -> (xor Cond, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we @@ -5329,15 +5972,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. - if (VT.isInteger() && - (CondVT == MVT::i1 || (CondVT.isInteger() && - TLI.getBooleanContents(false, true) == - TargetLowering::ZeroOrOneBooleanContent && - TLI.getBooleanContents(false, false) == - TargetLowering::ZeroOrOneBooleanContent)) && - isNullConstant(N1) && isOneConstant(N2)) { - SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, - DAG.getConstant(1, DL, CondVT)); + if (CondVT.isInteger() && + TLI.getBooleanContents(false, true) == + TargetLowering::ZeroOrOneBooleanContent && + TLI.getBooleanContents(false, false) == + TargetLowering::ZeroOrOneBooleanContent && + C1->isNullValue() && C2->isOne()) { + SDValue NotCond = + DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); if (VT.bitsEq(CondVT)) return NotCond; return DAG.getZExtOrTrunc(NotCond, DL, VT); @@ -5847,7 +6489,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { ISD::NON_EXTLOAD, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, - MLD->isExpandingLoad()); + MLD->isExpandingLoad()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -5921,34 +6563,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. - // If the VSELECT result requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. 
This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (N0.getOpcode() == ISD::SETCC) { - EVT VT = N->getValueType(0); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH; - std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG); - std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1); - std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2); - - Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL); - Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH); - - // Add the new VSELECT nodes to the work list in case they need to be split - // again. - AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } - // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; @@ -6258,6 +6872,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); + // Simplify TF. + AddToWorklist(NewChain.getNode()); + CombineTo(N, NewValue); // Replace uses of the original load (before extension) @@ -6273,6 +6890,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) @@ -6281,8 +6899,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, - N0.getOperand(0)); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) @@ -6314,12 +6931,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). 
@@ -6329,7 +6946,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op, + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(N0.getValueType())); } } @@ -6349,16 +6966,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::SIGN_EXTEND); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -6376,8 +6991,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); @@ -6411,7 +7025,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { LN0->getMemOperand()); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); - SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, @@ -6419,24 +7032,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { N0.getOperand(0).getValueType(), ExtLoad); CombineTo(N, And); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::SIGN_EXTEND); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } if (N0.getOpcode() == ISD::SETCC) { - EVT N0VT = N0.getOperand(0).getValueType(); + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + EVT N00VT = N0.getOperand(0).getValueType(); + // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && - TLI.getBooleanContents(N0VT) == + TLI.getBooleanContents(N00VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. - EVT SVT = getSetCCResultType(N0VT); + EVT SVT = getSetCCResultType(N00VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result @@ -6444,19 +7060,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. 
if (VT.getSizeInBits() == SVT.getSizeInBits()) - return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSetCC(DL, VT, N00, N01, CC); // If the desired elements are smaller or larger than the source - // elements we can use a matching integer vector type and then - // truncate/sign extend - EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); - if (SVT == MatchingVectorType) { - SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, - N0.getOperand(0), N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); + // elements, we can use a matching integer vector type and then + // truncate/sign extend. + EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); + if (SVT == MatchingVecType) { + SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); + return DAG.getSExtOrTrunc(VsetCC, DL, VT); } } @@ -6465,36 +7077,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // getBooleanContents(). unsigned SetCCWidth = N0.getScalarValueSizeInBits(); - SDLoc DL(N); // To determine the "true" side of the select, we need to know the high bit // of the value returned by the setcc if it evaluates to true. // If the type of the setcc is i1, then the true case of the select is just // sext(i1 1), that is, -1. // If the type of the setcc is larger (say, i8) then the value of the high - // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // bit depends on getBooleanContents(), so ask TLI for a real "true" value // of the appropriate width. - SDValue ExtTrueVal = - (SetCCWidth == 1) - ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), - DL, VT) - : TLI.getConstTrueVal(DAG, VT, DL); - - if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, - DAG.getConstant(0, DL, VT), - cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) + SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + SDValue Zero = DAG.getConstant(0, DL, VT); + if (SDValue SCC = + SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; if (!VT.isVector()) { - EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { - SDLoc DL(N); - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = - DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, - DAG.getConstant(0, DL, VT)); + EVT SetCCVT = getSetCCResultType(N00VT); + // Don't do this transform for i1 because there's a select transform + // that would reverse it. + // TODO: We should not do this transform at all without a target hook + // because a sext is likely cheaper than a select? + if (SetCCVT.getScalarSizeInBits() != 1 && + (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { + SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); } } } @@ -6502,7 +7108,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext x) -> (zext x) if the sign bit is known zero. 
if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); return SDValue(); } @@ -6677,13 +7283,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); + CombineTo(N, ExtLoad); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -6991,9 +7598,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitAssertZext(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT EVT = cast<VTSDNode>(N1)->getVT(); + + // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt) + if (N0.getOpcode() == ISD::AssertZext && + EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) + return N0; + + return SDValue(); +} + /// See if the specified operand can be simplified with the knowledge that only /// the bits specified by Mask are used. If so, return the simpler operand, /// otherwise return a null SDValue. +/// +/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can +/// simplify nodes with multiple uses more aggressively.) SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; @@ -7029,6 +7652,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } + break; + case ISD::AND: { + // X & -1 -> X (ignoring bits which aren't demanded). + ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1)); + if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask) + return V.getOperand(0); + break; + } } return SDValue(); } @@ -7244,6 +7875,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } + // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x) + if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && + N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { + if (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) + return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); + } + // fold (sext_in_reg (zext x)) -> (sext x) // iff we are extending the source sign bit. if (N0.getOpcode() == ISD::ZERO_EXTEND) { @@ -7254,7 +7895,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. 
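A minimal sketch (assumed example, not from the patch) of the new ISD::AND case added to GetDemandedBits above: when the AND mask keeps every bit that is actually demanded, the AND clears nothing the user cares about and its input can be used directly:

    #include <stdint.h>
    uint8_t low_byte(uint32_t x) {
      return (uint8_t)(x & 0xffu);   // the truncate demands only bits 0..7, and
    }                                // the mask keeps them all, so the AND folds away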
- if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits))) + if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); // fold operands of sext_in_reg based on knowledge that the top bits are not @@ -7496,6 +8137,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { VT.getSizeInBits()))) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } + // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { @@ -7517,6 +8159,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } } + // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { @@ -7582,6 +8225,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) + // When the adde's carry is not used. + if (N0.getOpcode() == ISD::ADDE && N0.hasOneUse() && + !N0.getNode()->hasAnyUseOfValue(1) && + (!LegalOperations || TLI.isOperationLegal(ISD::ADDE, VT))) { + SDLoc SL(N); + auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); + return DAG.getNode(ISD::ADDE, SL, DAG.getVTList(VT, MVT::Glue), + X, Y, N0.getOperand(2)); + } + return SDValue(); } @@ -7672,6 +8327,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (N0.isUndef()) + return DAG.getUNDEF(VT); + // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. @@ -8040,6 +8698,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { return DAG.getBuildVector(VT, DL, Ops); } +static bool isContractable(SDNode *N) { + SDNodeFlags F = cast<BinaryWithFlagsSDNode>(N)->Flags; + return F.hasAllowContract() || F.hasUnsafeAlgebra(); +} + /// Try to perform FMA combining on a given FADD node. SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); @@ -8048,24 +8711,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - ; - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. 
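A minimal source-level sketch (the compiler flag is a frontend-level illustration, not from the patch) of what the reworked gating above permits: fadd/fsub contraction into FMA now fires either when fusion is allowed globally (fast FP-op fusion, unsafe FP math, or an FMAD target) or when the individual node carries the contract or unsafe-algebra fast-math flag:

    // With -ffp-contract=fast, or with the 'contract' flag set on the nodes,
    // this becomes one fused multiply-add on targets where FMA is profitable.
    float mul_add(float x, float y, float z) {
      return x * y + z;
    }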
@@ -8073,35 +8739,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. + auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. - if (Aggressive && N0.getOpcode() == ISD::FMUL && - N1.getOpcode() == ISD::FMUL) { + if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { if (N0.getNode()->use_size() > N1.getNode()->use_size()) std::swap(N0, N1); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) { + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8113,7 +8783,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Note: Commutes FADD operands. 
if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), @@ -8154,7 +8824,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( @@ -8169,7 +8839,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1); @@ -8195,7 +8865,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1); @@ -8208,7 +8878,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); - if (N120.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N120)) return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0); @@ -8224,7 +8894,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); - if (N102.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N102)) return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0); @@ -8244,23 +8914,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); - // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the subtraction is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. @@ -8268,9 +8941,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. 
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1)); @@ -8278,16 +8958,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) - if (N0.getOpcode() == ISD::FNEG && - N0.getOperand(0).getOpcode() == ISD::FMUL && + if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); @@ -8297,12 +8975,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8316,7 +8994,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8336,7 +9014,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8358,7 +9036,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8378,10 +9056,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma x, y (fma u, v, (fneg z))) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. 
- if (Options.UnsafeFPMath && - N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL && - N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { + if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && + N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8395,9 +9072,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma (fneg y), z, (fma (fneg u), v, x)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && - N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FMUL) { + if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8410,14 +9086,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8440,7 +9116,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8461,7 +9137,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N1.getOperand(2).getOperand(0); - if (N120.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N120)) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8488,7 +9164,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N100 = N1.getOperand(0).getOperand(0); SDValue N101 = N1.getOperand(0).getOperand(1); SDValue N102 = N1.getOperand(0).getOperand(2); - if (N102.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N102)) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8624,6 +9300,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) @@ -8637,7 +9316,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { GetNegatedExpression(N0, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. 
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) { // fold (fadd A, 0) -> A if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) if (N1C->isZero()) @@ -8771,13 +9450,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. - if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) { // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) @@ -8850,6 +9532,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) @@ -9104,6 +9789,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { @@ -9207,6 +9895,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, &cast<BinaryWithFlagsSDNode>(N)->Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + return SDValue(); } @@ -10361,7 +11052,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); - + AddUsersToWorklist(Chain.getNode()); if (N->use_empty()) deleteAndRecombine(N); @@ -10414,7 +11105,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { StoreSDNode *PrevST = cast<StoreSDNode>(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) - return CombineTo(N, Chain.getOperand(1), Chain); + return CombineTo(N, PrevST->getOperand(1), Chain); } } @@ -10432,14 +11123,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } } - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); -#ifndef NDEBUG - if (CombinerAAOnlyFunc.getNumOccurrences() && - CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) - UseAA = false; -#endif - if (UseAA && LD->isUnindexed()) { + if (LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. 
SDValue BetterChain = FindBetterChain(N, Chain); @@ -11021,6 +11705,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); + AddToWorklist(Chain.getNode()); return true; } @@ -11414,18 +12099,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, return false; } -SDValue DAGCombiner::getMergedConstantVectorStore( - SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, EVT Ty) const { - SmallVector<SDValue, 8> BuildVector; +SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores) { + SmallVector<SDValue, 8> Chains; + SmallPtrSet<const SDNode *, 8> Visited; + SDLoc StoreDL(StoreNodes[0].MemNode); + + for (unsigned i = 0; i < NumStores; ++i) { + Visited.insert(StoreNodes[i].MemNode); + } - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { - StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); - Chains.push_back(St->getChain()); - BuildVector.push_back(St->getValue()); + // don't include nodes that are children + for (unsigned i = 0; i < NumStores; ++i) { + if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) + Chains.push_back(StoreNodes[i].MemNode->getChain()); } - return DAG.getBuildVector(Ty, SL, BuildVector); + assert(Chains.size() > 0 && "Chain should have generated a chain"); + return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( @@ -11436,22 +12127,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned LatestNodeUsed = 0; - - for (unsigned i=0; i < NumStores; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; - } - - SmallVector<SDValue, 8> Chains; // The latest Node in the DAG. - LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; @@ -11467,7 +12144,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); if (IsConstantSrc) { - StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); + SmallVector<SDValue, 8> BuildVector; + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); + SDValue Val = St->getValue(); + if (MemVT.getScalarType().isInteger()) + if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) + Val = DAG.getConstant( + (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), + SDLoc(CFP), MemVT); + BuildVector.push_back(Val); + } + StoredVal = DAG.getBuildVector(Ty, DL, BuildVector); } else { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumStores; ++i) { @@ -11477,7 +12165,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); - Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. 
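The getMergeStoreChains helper introduced above token-factors together the incoming chains of the stores being merged, but skips any chain operand that is itself one of the candidates, since that edge disappears once both stores collapse into the single wide store. A minimal standalone sketch of that filtering step, with a placeholder StoreNode type instead of LLVM's SDNode/TokenFactor machinery:

#include <iostream>
#include <unordered_set>
#include <vector>

struct StoreNode {
  StoreNode *Chain = nullptr; // the incoming chain operand
};

// Keep a candidate's incoming chain only if that chain is not itself one of
// the candidates; intermediate links vanish when the stores are merged.
std::vector<StoreNode *> collectMergedChains(const std::vector<StoreNode *> &Stores) {
  std::unordered_set<const StoreNode *> Candidates(Stores.begin(), Stores.end());
  std::vector<StoreNode *> Chains;
  for (StoreNode *St : Stores)
    if (St->Chain && !Candidates.count(St->Chain))
      Chains.push_back(St->Chain);
  return Chains;
}

int main() {
  StoreNode Entry;                   // stands in for some earlier chain node
  StoreNode A{&Entry}, B{&A}, C{&B}; // a chained run of three candidate stores
  std::cout << collectMergedChains({&A, &B, &C}).size() << '\n'; // 1: only Entry survives
}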
@@ -11497,7 +12184,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); - Chains.push_back(St->getChain()); SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; @@ -11515,54 +12201,27 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - assert(!Chains.empty()); - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA) { - // Replace all merged stores with the new store. - for (unsigned i = 0; i < NumStores; ++i) - CombineTo(StoreNodes[i].MemNode, NewStore); - } else { - // Replace the last store with the new store. - CombineTo(LatestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumStores; ++i) { - if (StoreNodes[i].MemNode == LatestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - // ReplaceAllUsesWith will replace all uses that existed when it was - // called, but graph optimizations may cause new ones to appear. For - // example, the case in pr14333 looks like - // - // St's chain -> St -> another store -> X - // - // And the only difference from St to the other store is the chain. - // When we change it's chain to be St's chain they become identical, - // get CSEed and the net result is that X is now a use of St. - // Since we know that St is redundant, just iterate. - while (!St->use_empty()) - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); - } - } + // Replace all merged stores with the new store. + for (unsigned i = 0; i < NumStores; ++i) + CombineTo(StoreNodes[i].MemNode, NewStore); - StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end()); + AddToWorklist(NewChain.getNode()); return true; } -void DAGCombiner::getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { +void DAGCombiner::getStoreMergeCandidates( + StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. if (!BasePtr.Base.getNode()) @@ -11572,104 +12231,71 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( if (BasePtr.Base.isUndef()) return; - // Walk up the chain and look for nodes with offsets from the same - // base pointer. Stop when reaching an instruction with a different kind - // or instruction which has a different base pointer. - EVT MemVT = St->getMemoryVT(); - unsigned Seq = 0; - StoreSDNode *Index = St; - - - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - - if (UseAA) { - // Look at other users of the same chain. Stores on the same chain do not - // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized - // to be on the same chain, so don't bother looking at adjacent chains. 
- - SDValue Chain = St->getChain(); - for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { - if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { - if (I.getOperandNo() != 0) - continue; - - if (OtherST->isVolatile() || OtherST->isIndexed()) - continue; - - if (OtherST->getMemoryVT() != MemVT) - continue; - - BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG); - - if (Ptr.equalBaseIndex(BasePtr)) - StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); - } - } - - return; - } - - while (Index) { - // If the chain has more than one use, then we can't reorder the mem ops. - if (Index != St && !SDValue(Index, 0)->hasOneUse()) - break; - - // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); - - // Check that the base pointer is the same as the original one. - if (!Ptr.equalBaseIndex(BasePtr)) - break; - - // The memory operands must not be volatile. - if (Index->isVolatile() || Index->isIndexed()) - break; - - // No truncation. - if (Index->isTruncatingStore()) - break; - - // The stored memory type must be the same. - if (Index->getMemoryVT() != MemVT) - break; - - // We do not allow under-aligned stores in order to prevent - // overriding stores. NOTE: this is a bad hack. Alignment SHOULD - // be irrelevant here; what MATTERS is that we not move memory - // operations that potentially overlap past each-other. - if (Index->getAlignment() < MemVT.getStoreSize()) - break; - - // We found a potential memory operand to merge. - StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); - - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. If the next operand is a load save it and use alias - // information to check if it interferes with anything. - SDNode *NextInChain = Index->getChain().getNode(); - while (1) { - if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { - // We found a store node. Use it for the next iteration. - Index = STn; - break; - } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) { - if (Ldn->isVolatile()) { - Index = nullptr; - break; + bool IsLoadSrc = isa<LoadSDNode>(St->getValue()); + bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) || + isa<ConstantFPSDNode>(St->getValue()); + bool IsExtractVecSrc = + (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); + auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool { + if (Other->isVolatile() || Other->isIndexed()) + return false; + // We can merge constant floats to equivalent integers + if (Other->getMemoryVT() != MemVT) + if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) && + isa<ConstantFPSDNode>(Other->getValue()))) + return false; + if (IsLoadSrc) + if (!isa<LoadSDNode>(Other->getValue())) + return false; + if (IsConstantSrc) + if (!(isa<ConstantSDNode>(Other->getValue()) || + isa<ConstantFPSDNode>(Other->getValue()))) + return false; + if (IsExtractVecSrc) + if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) + return false; + Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); + return (Ptr.equalBaseIndex(BasePtr)); + }; + // We looking for a root node which is an ancestor to all mergable + // stores. We search up through a load, to our root and then down + // through all children. 
For instance we will find Store{1,2,3} if + // St is Store1, Store2. or Store3 where the root is not a load + // which always true for nonvolatile ops. TODO: Expand + // the search to find all valid candidates through multiple layers of loads. + // + // Root + // |-------|-------| + // Load Load Store3 + // | | + // Store1 Store2 + // + // FIXME: We should be able to climb and + // descend TokenFactors to find candidates as well. + + SDNode *RootNode = (St->getChain()).getNode(); + + if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) { + RootNode = Ldn->getChain().getNode(); + for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) + if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain + for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) + if (I2.getOperandNo() == 0) + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) { + BaseIndexOffset Ptr; + if (CandidateMatch(OtherST, Ptr)) + StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset)); + } + } else + for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) + if (I.getOperandNo() == 0) + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { + BaseIndexOffset Ptr; + if (CandidateMatch(OtherST, Ptr)) + StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset)); } - - // Save the load node for later. Continue the scan. - AliasLoadNodes.push_back(Ldn); - NextInChain = Ldn->getChain().getNode(); - continue; - } else { - Index = nullptr; - break; - } - } - } } // We need to check that merging these stores does not cause a loop @@ -11678,31 +12304,34 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( // through the chain). Check in parallel by searching up from // non-chain operands of candidates. bool DAGCombiner::checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes) { + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { SmallPtrSet<const SDNode *, 16> Visited; SmallVector<const SDNode *, 8> Worklist; // search ops of store candidates - for (unsigned i = 0; i < StoreNodes.size(); ++i) { + for (unsigned i = 0; i < NumStores; ++i) { SDNode *n = StoreNodes[i].MemNode; // Potential loops may happen only through non-chain operands for (unsigned j = 1; j < n->getNumOperands(); ++j) Worklist.push_back(n->getOperand(j).getNode()); } // search through DAG. We can stop early if we find a storenode - for (unsigned i = 0; i < StoreNodes.size(); ++i) { + for (unsigned i = 0; i < NumStores; ++i) { if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist)) return false; } return true; } -bool DAGCombiner::MergeConsecutiveStores( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) { +bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + + if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) + return false; + bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( Attribute::NoImplicitFloat); @@ -11731,145 +12360,137 @@ bool DAGCombiner::MergeConsecutiveStores( if (MemVT.isVector() && IsLoadSrc) return false; - // Only look at ends of store sequences. - SDValue Chain = SDValue(St, 0); - if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) - return false; - - // Save the LoadSDNodes that we find in the chain. - // We need to make sure that these nodes do not interfere with - // any of the store nodes. 
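checkMergeStoreCandidatesForDependencies guards against creating a cycle: if one candidate store is reachable from another candidate's non-chain operands, merging them would make the wide store depend on itself. A minimal standalone sketch of that reachability test, using a placeholder Node graph rather than SDNode::hasPredecessorHelper:

#include <cstddef>
#include <iostream>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Operands; // Operands[0] plays the role of the chain
};

// Walk up through operands, returning true if any candidate store is reachable.
static bool reachesCandidate(Node *N, const std::unordered_set<Node *> &Candidates,
                             std::unordered_set<Node *> &Visited) {
  if (!Visited.insert(N).second)
    return false;
  if (Candidates.count(N))
    return true;
  for (Node *Op : N->Operands)
    if (reachesCandidate(Op, Candidates, Visited))
      return true;
  return false;
}

// Merging is unsafe if some candidate depends on another candidate through a
// non-chain (value or address) operand.
bool safeToMerge(const std::vector<Node *> &Stores) {
  std::unordered_set<Node *> Candidates(Stores.begin(), Stores.end());
  std::unordered_set<Node *> Visited;
  for (Node *St : Stores)
    for (std::size_t i = 1; i < St->Operands.size(); ++i) // skip the chain operand
      if (reachesCandidate(St->Operands[i], Candidates, Visited))
        return false;
  return true;
}

int main() {
  Node Root, Load{{&Root}}, StA{{&Root, &Load}}, StB{{&Root, &StA}};
  std::cout << safeToMerge({&StA, &StB}) << '\n'; // 0: StB's value depends on StA
  std::cout << safeToMerge({&StA}) << '\n';       // 1
}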
- SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - - getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); + SmallVector<MemOpLink, 8> StoreNodes; + // Find potential store merge candidates by searching through chain sub-DAG + getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; - // only do dependence check in AA case - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes)) - return false; - // Sort the memory operands according to their distance from the - // base pointer. As a secondary criteria: make sure stores coming - // later in the code come first in the list. This is important for - // the non-UseAA case, because we're merging stores into the FINAL - // store along a chain which potentially contains aliasing stores. - // Thus, if there are multiple stores to the same address, the last - // one can be considered for merging but not the others. + // base pointer. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { - return LHS.OffsetFromBase < RHS.OffsetFromBase || - (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum < RHS.SequenceNum); - }); + return LHS.OffsetFromBase < RHS.OffsetFromBase; + }); // Scan the memory operations on the chain and find the first non-consecutive // store memory address. - unsigned LastConsecutiveStore = 0; + unsigned NumConsecutiveStores = 0; int64_t StartAddress = StoreNodes[0].OffsetFromBase; - for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) { - // Check that the addresses are consecutive starting from the second - // element in the list of stores. - if (i > 0) { - int64_t CurrAddress = StoreNodes[i].OffsetFromBase; - if (CurrAddress - StartAddress != (ElementSizeBytes * i)) - break; - } - - // Check if this store interferes with any of the loads that we found. - // If we find a load that alias with this store. Stop the sequence. - if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) { - return isAlias(Ldn, StoreNodes[i].MemNode); - })) + // Check that the addresses are consecutive starting from the second + // element in the list of stores. + for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { + int64_t CurrAddress = StoreNodes[i].OffsetFromBase; + if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; - - // Mark this node as useful. - LastConsecutiveStore = i; + NumConsecutiveStores = i + 1; } + if (NumConsecutiveStores < 2) + return false; + + // Check that we can merge these candidates without causing a cycle + if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores)) + return false; + + // The node with the lowest store address. - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); LLVMContext &Context = *DAG.getContext(); const DataLayout &DL = DAG.getDataLayout(); // Store the constants into memory as one consecutive store. 
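After sorting the candidates by offset, the scan above keeps only the longest gap-free run starting at the lowest address, since a merged store must cover a contiguous range. A minimal standalone sketch of that check over plain byte offsets (the helper name and the example values are illustrative only):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Given byte offsets of the candidate stores (relative to a shared base) and
// the size of one stored element, return how many stores form a gap-free,
// in-order run starting at the lowest address.
unsigned countConsecutive(std::vector<int64_t> Offsets, int64_t ElementSizeBytes) {
  if (Offsets.empty())
    return 0;
  std::sort(Offsets.begin(), Offsets.end());
  unsigned NumConsecutive = 1;
  for (std::size_t i = 1; i < Offsets.size(); ++i) {
    if (Offsets[i] - Offsets[0] != ElementSizeBytes * (int64_t)i)
      break;
    NumConsecutive = i + 1;
  }
  return NumConsecutive;
}

int main() {
  // Four 4-byte stores at offsets 0, 4, 8 and 20: only the first three merge.
  std::cout << countConsecutive({0, 4, 8, 20}, 4) << '\n'; // 3
}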
if (IsConstantSrc) { - unsigned LastLegalType = 0; - unsigned LastLegalVectorType = 0; - bool NonZero = false; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); - - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { - NonZero |= !C->isNullValue(); - } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) { - NonZero |= !C->getConstantFPValue()->isNullValue(); - } else { - // Non-constant. - break; - } + bool RV = false; + while (NumConsecutiveStores > 1) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + unsigned LastLegalType = 0; + unsigned LastLegalVectorType = 0; + bool NonZero = false; + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); + SDValue StoredVal = ST->getValue(); + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { + NonZero |= !C->isNullValue(); + } else if (ConstantFPSDNode *C = + dyn_cast<ConstantFPSDNode>(StoredVal)) { + NonZero |= !C->getConstantFPValue()->isNullValue(); + } else { + // Non-constant. + break; + } - // Find a legal type for the constant store. - unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); - bool IsFast; - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) { - LastLegalType = i+1; - // Or check whether a truncstore is legal. - } else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { - EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFast) && + // Find a legal type for the constant store. + unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; + EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast = false; + if (TLI.isTypeLegal(StoreTy) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i + 1; + // Or check whether a truncstore is legal. + } else if (TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { + EVT LegalizedStoredValueTy = + TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); + if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { + LastLegalType = i + 1; + } } - } - // We only use vectors if the constant is known to be zero or the target - // allows it and the function is not marked with the noimplicitfloat - // attribute. - if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, - FirstStoreAS)) && - !NoVectors) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); - if (TLI.isTypeLegal(Ty) && - TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) - LastLegalVectorType = i + 1; + // We only use vectors if the constant is known to be zero or the target + // allows it and the function is not marked with the noimplicitfloat + // attribute. 
+ if ((!NonZero || + TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && + !NoVectors) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && + IsFast) + LastLegalVectorType = i + 1; + } } - } - // Check if we found a legal integer type to store. - if (LastLegalType == 0 && LastLegalVectorType == 0) - return false; + // Check if we found a legal integer type that creates a meaningful merge. + if (LastLegalType < 2 && LastLegalVectorType < 2) + break; - bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; - unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; + bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; + unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, - true, UseVector); + bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + true, UseVector); + if (!Merged) + break; + // Remove merged stores for next iteration. + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + RV = true; + NumConsecutiveStores -= NumElem; + } + return RV; } // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. if (IsExtractVecSrc) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned NumStoresToMerge = 0; bool IsVec = MemVT.isVector(); - for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); unsigned StoreValOpcode = St->getValue().getOpcode(); // This restriction could be loosened. @@ -11909,7 +12530,7 @@ bool DAGCombiner::MergeConsecutiveStores( // Find acceptable loads. Loads need to have the same chain (token factor), // must not be zext, volatile, indexed, and they must be consecutive. BaseIndexOffset LdBasePtr; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); if (!Ld) break; @@ -11942,7 +12563,7 @@ bool DAGCombiner::MergeConsecutiveStores( } // We found a potential memory operand to merge. - LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0)); + LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset)); } if (LoadNodes.size() < 2) @@ -11954,7 +12575,9 @@ bool DAGCombiner::MergeConsecutiveStores( if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && St->getAlignment() >= RequiredAlignment) return false; - + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); unsigned FirstLoadAS = FirstLoad->getAddressSpace(); unsigned FirstLoadAlign = FirstLoad->getAlignment(); @@ -12023,31 +12646,12 @@ bool DAGCombiner::MergeConsecutiveStores( // We add +1 here because the LastXXX variables refer to location while // the NumElem refers to array/index size. 
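Rather than giving up after one attempt, the rewritten constant-store path above keeps merging a legal-sized prefix of the consecutive run and retries on whatever remains. A rough standalone sketch of that loop shape; largestLegalPrefix is a made-up stand-in for the real legality queries (isTypeLegal, allowsMemoryAccess, canMergeStoresTo):

#include <initializer_list>
#include <iostream>

// Hypothetical helper: pretend only power-of-two element counts up to 4 are
// "legal" merge widths for the target.
static unsigned largestLegalPrefix(unsigned NumConsecutive) {
  for (unsigned W : {4u, 2u})
    if (NumConsecutive >= W)
      return W;
  return 0;
}

unsigned mergeRuns(unsigned NumConsecutiveStores) {
  unsigned MergedGroups = 0;
  while (NumConsecutiveStores > 1) {
    unsigned NumElem = largestLegalPrefix(NumConsecutiveStores);
    if (NumElem < 2)
      break;                         // nothing meaningful left to merge
    ++MergedGroups;                  // one wide store replaces NumElem stores
    NumConsecutiveStores -= NumElem; // retry on the remaining tail
  }
  return MergedGroups;
}

int main() { std::cout << mergeRuns(7) << '\n'; } // 7 -> 4 + 2, prints 2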
- unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1; + unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); NumElem = std::min(LastLegalType, NumElem); if (NumElem < 2) return false; - // Collect the chains from all merged stores. - SmallVector<SDValue, 8> MergeStoreChains; - MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); - - // The latest Node in the DAG. - unsigned LatestNodeUsed = 0; - for (unsigned i=1; i<NumElem; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; - - MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); - } - - LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; - // Find if it is better to use vectors or integers to load and store // to memory. EVT JointMemOpVT; @@ -12067,8 +12671,9 @@ bool DAGCombiner::MergeConsecutiveStores( FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoadAlign); - SDValue NewStoreChain = - DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); + + AddToWorklist(NewStoreChain.getNode()); SDValue NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), @@ -12081,25 +12686,9 @@ bool DAGCombiner::MergeConsecutiveStores( SDValue(NewLoad.getNode(), 1)); } - if (UseAA) { - // Replace the all stores with the new store. - for (unsigned i = 0; i < NumElem; ++i) - CombineTo(StoreNodes[i].MemNode, NewStore); - } else { - // Replace the last store with the new store. - CombineTo(LatestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumElem; ++i) { - // Remove all Store nodes. - if (StoreNodes[i].MemNode == LatestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); - } - } - - StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end()); + // Replace the all stores with the new store. + for (unsigned i = 0; i < NumElem; ++i) + CombineTo(StoreNodes[i].MemNode, NewStore); return true; } @@ -12256,19 +12845,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); -#ifndef NDEBUG - if (CombinerAAOnlyFunc.getNumOccurrences() && - CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) - UseAA = false; -#endif - if (UseAA && ST->isUnindexed()) { - // FIXME: We should do this even without AA enabled. AA will just allow - // FindBetterChain to work in more situations. The problem with this is that - // any combine that expects memory operations to be on consecutive chains - // first needs to be updated to look for users of the same chain. - + if (ST->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes, on this store and any // adjacent stores. 
if (findBetterNeighborChains(ST)) { @@ -12302,8 +12879,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (SimplifyDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), - ST->getMemoryVT().getScalarSizeInBits()))) + ST->getMemoryVT().getScalarSizeInBits()))) { + // Re-visit the store if anything changed and the store hasn't been merged + // with another node (N is deleted) SimplifyDemandedBits will add Value's + // node back to the worklist if necessary, but we also need to re-visit + // the Store node itself. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); return SDValue(N, 0); + } } // If this is a load followed by a store to the same location, then the store @@ -12347,15 +12931,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so // or until we merge the last store on the chain. - SmallVector<MemOpLink, 8> StoreNodes; - bool Changed = MergeConsecutiveStores(ST, StoreNodes); + bool Changed = MergeConsecutiveStores(ST); if (!Changed) break; - - if (any_of(StoreNodes, - [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) { - // ST has been merged and no longer exists. + // Return N as merge only uses CombineTo and no worklist clean + // up is necessary. + if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N)) return SDValue(N, 0); - } } } @@ -12364,7 +12945,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Make sure to do this only after attempting to merge stores in order to // avoid changing the types of some subset of stores due to visit order, // preventing their merging. - if (isa<ConstantFPSDNode>(Value)) { + if (isa<ConstantFPSDNode>(ST->getValue())) { if (SDValue NewSt = replaceStoreOfFPConstant(ST)) return NewSt; } @@ -12493,10 +13074,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { EVT VT = InVec.getValueType(); - // If we can't generate a legal BUILD_VECTOR, exit - if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) - return SDValue(); - // Check that we know which element is being inserted if (!isa<ConstantSDNode>(EltNo)) return SDValue(); @@ -12523,6 +13100,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { } } + // If we can't generate a legal BUILD_VECTOR, exit + if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. @@ -12544,11 +13125,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. EVT OpVT = Ops[0].getValueType(); - if (InVal.getValueType() != OpVT) - InVal = OpVT.bitsGT(InVal.getValueType()) ? - DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); - Ops[Elt] = InVal; + Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; } // Return the new vector @@ -12568,6 +13145,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); + ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? 
+ ISD::NON_EXTLOAD : ISD::EXTLOAD; + if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) + return SDValue(); + Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); @@ -12639,6 +13221,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); + if (InVec.isUndef()) + return DAG.getUNDEF(NVT); + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the @@ -13022,7 +13607,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { return DAG.getNode(Opcode, DL, VT, BV); } -SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N, +SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef<int> VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx) { @@ -13300,6 +13885,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); + // Check if we can express BUILD VECTOR via subvector extract. + if (!LegalTypes && (N->getNumOperands() > 1)) { + SDValue Op0 = N->getOperand(0); + auto checkElem = [&](SDValue Op) -> uint64_t { + if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && + (Op0.getOperand(0) == Op.getOperand(0))) + if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return CNode->getZExtValue(); + return -1; + }; + + int Offset = checkElem(Op0); + for (unsigned i = 0; i < N->getNumOperands(); ++i) { + if (Offset + i != checkElem(N->getOperand(i))) { + Offset = -1; + break; + } + } + + if ((Offset == 0) && + (Op0.getOperand(0).getValueType() == N->getValueType(0))) + return Op0.getOperand(0); + if ((Offset != -1) && + ((Offset % N->getValueType(0).getVectorNumElements()) == + 0)) // IDX must be multiple of output size. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), + Op0.getOperand(0), Op0.getOperand(1)); + } + if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) return V; @@ -13491,8 +14105,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, - VT.getSizeInBits() / SclTy.getSizeInBits()); + unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); + if (VNTNumElms < 2) + return SDValue(); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms); if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) return SDValue(); @@ -13611,15 +14228,19 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); - if (V->getOpcode() == ISD::CONCAT_VECTORS) { - // Combine: - // (extract_subvec (concat V1, V2, ...), i) - // Into: - // Vi if possible - // Only operand 0 is checked as 'concat' assumes all inputs of the same - // type. - if (V->getOperand(0).getValueType() != NVT) - return SDValue(); + // Extract from UNDEF is UNDEF. + if (V.isUndef()) + return DAG.getUNDEF(NVT); + + // Combine: + // (extract_subvec (concat V1, V2, ...), i) + // Into: + // Vi if possible + // Only operand 0 is checked as 'concat' assumes all inputs of the same + // type. 
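The new BUILD_VECTOR combine above recognizes the case where every operand is an extract_vector_elt reading consecutive lanes of one source: if the run starts at lane 0 and the types match, the source is returned directly, and if the start lane is a multiple of the result width the node becomes a single EXTRACT_SUBVECTOR. A minimal standalone sketch of the pattern test over plain (source, lane) pairs, with illustrative names only:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct Elem { int Source; int64_t Lane; }; // Lane < 0 means "not an extract"

// Returns the starting lane if all elements read consecutive lanes of the same
// source, or -1 otherwise.
int64_t matchContiguousExtract(const std::vector<Elem> &Ops) {
  if (Ops.empty() || Ops[0].Lane < 0)
    return -1;
  for (std::size_t i = 0; i < Ops.size(); ++i)
    if (Ops[i].Source != Ops[0].Source ||
        Ops[i].Lane != Ops[0].Lane + (int64_t)i)
      return -1;
  return Ops[0].Lane;
}

int main() {
  std::vector<Elem> V = {{0, 4}, {0, 5}, {0, 6}, {0, 7}};
  int64_t Offset = matchContiguousExtract(V);
  // Offset 4 is a multiple of the 4-wide result, so this corresponds to an
  // extract_subvector of lanes [4, 8) instead of four scalar extracts.
  std::cout << Offset << '\n';
}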
+ if (V->getOpcode() == ISD::CONCAT_VECTORS && + isa<ConstantSDNode>(N->getOperand(1)) && + V->getOperand(0).getValueType() == NVT) { unsigned Idx = N->getConstantOperandVal(1); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && @@ -13633,19 +14254,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { // Handle only simple case where vector being inserted and vector - // being extracted are of same type, and are half size of larger vectors. - EVT BigVT = V->getOperand(0).getValueType(); + // being extracted are of same size. EVT SmallVT = V->getOperand(1).getValueType(); - if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits()) + if (!NVT.bitsEq(SmallVT)) return SDValue(); - // Only handle cases where both indexes are constants with the same type. + // Only handle cases where both indexes are constants. ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2)); - if (InsIdx && ExtIdx && - InsIdx->getValueType(0).getSizeInBits() <= 64 && - ExtIdx->getValueType(0).getSizeInBits() <= 64) { + if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: @@ -13892,6 +14510,113 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } +// Match shuffles that can be converted to any_vector_extend_in_reg. +// This is often generated during legalization. +// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) +// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. +SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + EVT VT = SVN->getValueType(0); + bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + + // TODO Add support for big-endian when we have a test case. + if (!VT.isInteger() || IsBigEndian) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + ArrayRef<int> Mask = SVN->getMask(); + SDValue N0 = SVN->getOperand(0); + + // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) + auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) + continue; + return false; + } + return true; + }; + + // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for + // power-of-2 extensions as they are the most likely. + for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { + if (!isAnyExtend(Scale)) + continue; + + EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) + return DAG.getBitcast(VT, + DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT)); + } + + return SDValue(); +} + +// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of +// each source element of a large type into the lowest elements of a smaller +// destination type. This is often generated during legalization. +// If the source node itself was a '*_extend_vector_inreg' node then we should +// then be able to remove it. 
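combineShuffleToVectorExtend above matches masks in which lane i is either undef or reads element i/Scale of the source, i.e. the shuffle behaves like an in-register any-extend by Scale. A minimal standalone sketch of that mask predicate, with undef lanes encoded as -1 as in ShuffleVectorSDNode masks:

#include <iostream>
#include <vector>

bool isAnyExtendMask(const std::vector<int> &Mask, unsigned Scale) {
  for (unsigned i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue;                                     // undef lane, ignore
    if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
      continue;                                     // low part of a widened lane
    return false;
  }
  return true;
}

int main() {
  // v4i32 <0,-1,1,-1> behaves like a v2i64 any_extend_vector_inreg of v4i32.
  std::cout << isAnyExtendMask({0, -1, 1, -1}, 2) << '\n'; // 1
  std::cout << isAnyExtendMask({0, 1, 2, 3}, 2)   << '\n'; // 0
}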
+SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { + EVT VT = SVN->getValueType(0); + bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + + // TODO Add support for big-endian when we have a test case. + if (!VT.isInteger() || IsBigEndian) + return SDValue(); + + SDValue N0 = SVN->getOperand(0); + while (N0.getOpcode() == ISD::BITCAST) + N0 = N0.getOperand(0); + + unsigned Opcode = N0.getOpcode(); + if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && + Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && + Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + ArrayRef<int> Mask = SVN->getMask(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); + + // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> + // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> + // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> + auto isTruncate = [&Mask, &NumElts](unsigned Scale) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) + continue; + return false; + } + return true; + }; + + // At the moment we just handle the case where we've truncated back to the + // same size as before the extension. + // TODO: handle more extension/truncation cases as cases arise. + if (EltSizeInBits != ExtSrcSizeInBits) + return SDValue(); + + // Attempt to match a 'truncate_vector_inreg' shuffle, we just search for + // power-of-2 truncations as they are the most likely. + for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) + if (isTruncate(Scale)) + return DAG.getBitcast(VT, N00); + + return SDValue(); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -13996,6 +14721,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) return S; + // Match shuffles that can be converted to any_vector_extend_in_reg. + if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) + return V; + + // Combine "truncate_vector_in_reg" style shuffles. + if (SDValue V = combineTruncationShuffle(SVN, DAG)) + return V; + if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.isUndef() || @@ -14253,6 +14986,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); + // If inserting an UNDEF, just return the original vector. + if (N1.isUndef()) + return N0; + + // If this is an insert of an extracted vector into an undef vector, we can + // just use the input to the extract. + if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) + return N1.getOperand(0); + // Combine INSERT_SUBVECTORs where we are inserting to the same index. // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) @@ -14262,26 +15005,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); - if (N0.getValueType() != N1.getValueType()) + if (!isa<ConstantSDNode>(N2)) return SDValue(); + unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); + + // Canonicalize insert_subvector dag nodes. 
+ // Example: + // (insert_subvector (insert_subvector A, Idx0), Idx1) + // -> (insert_subvector (insert_subvector A, Idx1), Idx0) + if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && + N1.getValueType() == N0.getOperand(1).getValueType() && + isa<ConstantSDNode>(N0.getOperand(2))) { + unsigned OtherIdx = cast<ConstantSDNode>(N0.getOperand(2))->getZExtValue(); + if (InsIdx < OtherIdx) { + // Swap nodes. + SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, + N0.getOperand(0), N1, N2); + AddToWorklist(NewOp.getNode()); + return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), + VT, NewOp, N0.getOperand(1), N0.getOperand(2)); + } + } + // If the input vector is a concatenation, and the insert replaces - // one of the halves, we can optimize into a single concat_vectors. - if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 && - N2.getOpcode() == ISD::Constant) { - APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue(); + // one of the pieces, we can optimize into a single concat_vectors. + if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && + N0.getOperand(0).getValueType() == N1.getValueType()) { + unsigned Factor = N1.getValueType().getVectorNumElements(); - // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors Z, Y) - if (InsIdx == 0) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1, - N0.getOperand(1)); + SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); + Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; - // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors X, Z) - if (InsIdx == VT.getVectorNumElements() / 2) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0), - N1); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } return SDValue(); @@ -15257,7 +16013,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, if (Base.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) { Base = Base.getOperand(0); - Offset += C->getZExtValue(); + Offset += C->getSExtValue(); } } @@ -15454,6 +16210,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, ++Depth; break; + case ISD::CopyFromReg: + // Forward past CopyFromReg. + Chains.push_back(Chain.getOperand(0)); + ++Depth; + break; + default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); @@ -15482,6 +16244,18 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +// This function tries to collect a bunch of potentially interesting +// nodes to improve the chains of, all at once. This might seem +// redundant, as this function gets called when visiting every store +// node, so why not let the work be done on each store as it's visited? +// +// I believe this is mainly important because MergeConsecutiveStores +// is unable to deal with merging stores of different sizes, so unless +// we improve the chains of all the potential candidates up-front +// before running MergeConsecutiveStores, it might only see some of +// the nodes that will eventually be candidates, and then not be able +// to go from a partially-merged state to the desired final +// fully-merged state. bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. 
@@ -15517,10 +16291,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { if (!Ptr.equalBaseIndex(BasePtr)) break; - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. If the next operand is a load save it and use alias - // information to check if it interferes with anything. + // Walk up the chain to find the next store node, ignoring any + // intermediate loads. Any other kind of node will halt the loop. SDNode *NextInChain = Index->getChain().getNode(); while (true) { if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { @@ -15539,9 +16311,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { Index = nullptr; break; } - } + } // end while } + // At this point, ChainedStores lists all of the Store nodes + // reachable by iterating up through chain nodes matching the above + // conditions. For each such store identified, try to find an + // earlier chain to attach the store to which won't violate the + // required ordering. bool MadeChangeToSt = false; SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index e2f33bb433bac..0584ab9f60d1b 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1,4 +1,4 @@ -//===-- FastISel.cpp - Implementation of the FastISel class ---------------===// +//===- FastISel.cpp - Implementation of the FastISel class ----------------===// // // The LLVM Compiler Infrastructure // @@ -39,35 +39,76 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include 
"llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "isel" @@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " "target-specific selector"); STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); -void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); - IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); - Alignment = CS->getParamAlignment(AttrIdx); -} - /// Set the current block to which generated machine instructions will be /// appended, and clear the local CSE map. void FastISel::startNewBlock() { @@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) { // Try to emit the constant by using an integer constant with a cast. const APFloat &Flt = CF->getValueAPF(); EVT IntVT = TLI.getPointerTy(DL); - - uint64_t x[2]; uint32_t IntBitWidth = IntVT.getSizeInBits(); + APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false); bool isExact; - (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, - APFloat::rmTowardZero, &isExact); + (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact); if (isExact) { - APInt IntVal(IntBitWidth, x); - unsigned IntegerReg = - getRegForValue(ConstantInt::get(V->getContext(), IntVal)); + getRegForValue(ConstantInt::get(V->getContext(), SIntVal)); if (IntegerReg != 0) Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg, /*Kill=*/false); @@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::STACKMAP)); for (auto const &MO : Ops) - MIB.addOperand(MO); + MIB.add(MO); // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); @@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, Args.reserve(NumArgs); // Populate the argument list. - // Attributes for args start at offset 1, after the return attribute. 
ImmutableCallSite CS(CI); - for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; - ArgI != ArgE; ++ArgI) { + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { Value *V = CI->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); @@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, AttrI); + Entry.setAttributes(&CS, ArgIdx); Args.push_back(Entry); } @@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) { TII.get(TargetOpcode::PATCHPOINT)); for (auto &MO : Ops) - MIB.addOperand(MO); + MIB.add(MO); MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI); @@ -841,9 +861,9 @@ bool FastISel::selectPatchpoint(const CallInst *I) { return true; } -/// Returns an AttributeSet representing the attributes applied to the return +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. -static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -852,8 +872,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName, @@ -885,9 +905,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, ArgI + 1); + Entry.setAttributes(&CS, ArgI); Args.push_back(Entry); } + TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args); CallLoweringInfo CLI; CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs); @@ -1021,7 +1042,7 @@ bool FastISel::lowerCall(const CallInst *CI) { Entry.Ty = V->getType(); // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); Args.push_back(Entry); } @@ -1149,7 +1170,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { } else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(0) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); @@ -1362,7 +1383,7 @@ bool FastISel::selectInstruction(const Instruction *I) { if (const auto *Call = dyn_cast<CallInst>(I)) { const Function *F = Call->getCalledFunction(); - LibFunc::Func Func; + LibFunc Func; // As a special case, don't handle calls to builtin library functions that // may be translated directly to target instructions. 
@@ -1665,7 +1686,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo, TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo), SkipTargetIndependentISel(SkipTargetIndependentISel) {} -FastISel::~FastISel() {} +FastISel::~FastISel() = default; bool FastISel::fastLowerArguments() { return false; } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 4a9042cfb3f44..e85d1951e3aed 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -235,7 +235,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. - unsigned NumResults = CountResults(Node); VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg(); assert(TargetRegisterInfo::isPhysicalRegister(VRBase)); MIB.addReg(VRBase, RegState::Define); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b0028252836ab..fc7cd020fe2e3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1192,8 +1192,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain - // we will make the index dependent on the load). - if (SDNode::hasPredecessorHelper(ST, Visited, Worklist)) + // we will make the index dependent on the load). Also, the store might be + // dependent on the extractelement and introduce a cycle when creating + // the load. + if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || + ST->hasPredecessor(Op.getNode())) continue; StackPtr = ST->getBasePtr(); @@ -1909,8 +1912,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1935,9 +1938,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setTailCall(isTailCall) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1960,8 +1967,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, for (unsigned i = 0; i != NumOps; ++i) { Entry.Node = Ops[i]; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1970,9 +1977,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, 
Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1994,8 +2004,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2004,9 +2014,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2081,8 +2094,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } @@ -2090,8 +2103,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDValue FIPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = FIPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2099,9 +2112,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2185,24 +2201,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, // Pass the argument. Entry.Node = Node->getOperand(0); Entry.Ty = RetTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Pass the return address of sin. SDValue SinPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = SinPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Also pass the return address of the cos. 
SDValue CosPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = CosPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2210,9 +2226,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), - Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); + CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( + TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, + std::move(Args)); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2529,12 +2545,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0); APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0); for (unsigned J = 0; J != Sz; J += 8) { - MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J)); - MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J)); - MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J)); - MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J)); - MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J)); - MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J)); + MaskHi4 = MaskHi4 | (0xF0ull << J); + MaskLo4 = MaskLo4 | (0x0Full << J); + MaskHi2 = MaskHi2 | (0xCCull << J); + MaskLo2 = MaskLo2 | (0x33ull << J); + MaskHi1 = MaskHi1 | (0xAAull << J); + MaskLo1 = MaskLo1 | (0x55ull << J); } // BSWAP if the type is wider than a single byte. @@ -3091,7 +3107,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getVectorIdxTy(DAG.getDataLayout())))); } - Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + Tmp1 = DAG.getBuildVector(VT, dl, Ops); // We may have changed the BUILD_VECTOR type. Cast it back to the Node type. 
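Aside: the ExpandBITREVERSE hunk above now builds the 0xF0/0x0F, 0xCC/0x33 and 0xAA/0x55 masks with plain APInt operator|. Below is a minimal standalone C++ sketch of the same byte-parallel reversal strategy (swap nibbles, then bit pairs, then single bits, then byte-swap the word); it operates on uint32_t rather than APInt and is only meant to show the expansion's shape.

#include <cstdint>
#include <cstdio>

// Reverse the bits of a 32-bit value with the mask-and-shift pattern the
// expansion emits: within each byte swap nibbles, then 2-bit groups, then
// adjacent bits, and finally reverse the byte order (the BSWAP step).
static uint32_t reverseBits32(uint32_t V) {
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4); // nibbles
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2); // bit pairs
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1); // single bits
  // BSWAP, since the type is wider than a single byte.
  V = (V >> 24) | ((V >> 8) & 0x0000FF00u) |
      ((V << 8) & 0x00FF0000u) | (V << 24);
  return V;
}

int main() {
  for (uint32_t V : {0x00000001u, 0x12345678u})
    std::printf("%08x -> %08x\n", V, reverseBits32(V));
}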
Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1); Results.push_back(Tmp1); @@ -3790,8 +3806,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Scalars.push_back(DAG.getNode(Node->getOpcode(), dl, VT.getScalarType(), Ex, Sh)); } - SDValue Result = - DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars); + + SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -3830,10 +3846,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee( + CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -3870,10 +3887,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("abort", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol( + "abort", TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); Results.push_back(CallResult.second); @@ -4424,8 +4441,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { NewOps.push_back(Elt); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); - + SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps); Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); break; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 72b56d84d9452..6f2b1b94ce465 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -459,7 +459,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) - SoftenFloatResult(Op.getNode(), 0); + AddToWorklist(Op.getNode()); } if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { @@ -472,8 +472,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); - if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) - Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } @@ -1054,15 +1052,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - assert(NVT.getSizeInBits() == integerPartWidth && + assert(NVT.getSizeInBits() == 64 && "Do not know how to expand this float constant!"); APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt(); SDLoc dl(N); Lo = 
DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - APInt(integerPartWidth, C.getRawData()[1])), + APInt(64, C.getRawData()[1])), dl, NVT); Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - APInt(integerPartWidth, C.getRawData()[0])), + APInt(64, C.getRawData()[0])), dl, NVT); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index dc436ce045142..85068e890756b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -690,7 +690,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; - case TargetLowering::TypeSplitVector: + case TargetLowering::TypeSplitVector: { EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); @@ -709,6 +709,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } + case TargetLowering::TypeWidenVector: { + SDValue WideInOp = GetWidenedVector(InOp); + + // Truncate widened InOp. + unsigned NumElem = WideInOp.getValueType().getVectorNumElements(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), + N->getValueType(0).getScalarType(), NumElem); + SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp); + + // Zero extend so that the elements are of same type as those of NVT + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(), + NumElem); + SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc); + + // Extract the low NVT subvector. + MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx); + } + } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); @@ -1089,6 +1109,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); + if (N->getOpcode() == ISD::VSELECT) + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); @@ -2586,24 +2610,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
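Aside: the new TypeWidenVector case in PromoteIntRes_TRUNCATE above truncates the widened input, zero-extends the lanes back up to the promoted element type, and then extracts the low subvector. The loop below is a scalar, standalone sketch of that data movement under assumed small types (i32 input lanes, i8 truncation, i16 promoted lanes); it only illustrates the lane flow, not the DAG API.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Widened input: 8 x i32 (only the first 4 lanes are "real").
  std::array<uint32_t, 8> WideIn = {0x1FF, 0x2AB, 0x3CD, 0x4EF, 0, 0, 0, 0};

  // 1. TRUNCATE the widened vector to 8 x i8.
  std::array<uint8_t, 8> WideTrunc;
  for (int i = 0; i < 8; ++i)
    WideTrunc[i] = static_cast<uint8_t>(WideIn[i]);

  // 2. ZERO_EXTEND so the elements match the promoted type (8 x i16).
  std::array<uint16_t, 8> WideExt;
  for (int i = 0; i < 8; ++i)
    WideExt[i] = WideTrunc[i];

  // 3. EXTRACT_SUBVECTOR at index 0: keep the low 4 lanes as the result.
  std::array<uint16_t, 4> Result;
  for (int i = 0; i < 4; ++i)
    Result[i] = WideExt[i];

  for (uint16_t V : Result)
    std::printf("0x%02x ", V);   // 0xff 0xab 0xcd 0xef
  std::printf("\n");
}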
Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) - .setSExtResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) + .setSExtResult(); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cf19d75676cda..0a2b680e1c66e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -199,8 +199,7 @@ bool DAGTypeLegalizer::run() { // non-leaves. for (SDNode &Node : DAG.allnodes()) { if (Node.getNumOperands() == 0) { - Node.setNodeId(ReadyToProcess); - Worklist.push_back(&Node); + AddToWorklist(&Node); } else { Node.setNodeId(Unanalyzed); } @@ -331,6 +330,12 @@ ScanOperands: // to the worklist etc. if (NeedsReanalyzing) { assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?"); + + // Remove any result values from SoftenedFloats as N will be revisited + // again. + for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) + SoftenedFloats.erase(SDValue(N, i)); + N->setNodeId(NewNode); // Recompute the NodeId and correct processed operands, adding the node to // the worklist if ready. @@ -749,6 +754,8 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { // new uses of From due to CSE. If this happens, replace the new uses of // From with To. } while (!From.use_empty()); + + SoftenedFloats.erase(From); } void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { @@ -1077,8 +1084,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1087,9 +1094,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index ec55662d75c0b..80c939700518f 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -191,6 +191,11 @@ private: void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, SDValue &Lo, SDValue &Hi); + void AddToWorklist(SDNode *N) { + N->setNodeId(ReadyToProcess); + Worklist.push_back(N); + } + //===--------------------------------------------------------------------===// // Integer Promotion Support: LegalizeIntegerTypes.cpp 
//===--------------------------------------------------------------------===// @@ -597,6 +602,7 @@ private: SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); + SDValue ScalarizeVecRes_VecInregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); @@ -672,6 +678,7 @@ private: SDValue SplitVecOp_BITCAST(SDNode *N); SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); @@ -713,6 +720,7 @@ private: SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); + SDValue WidenVSELECTAndMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); SDValue WidenVecRes_UNDEF(SDNode *N); @@ -782,6 +790,13 @@ private: /// By default, the vector will be widened with undefined values. SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); + /// Return a mask of vector type MaskVT to replace InMask. Also adjust + /// MaskVT to ToMaskVT if needed with vector extension or truncation. + SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT); + + /// Get the target mask VT, and widen if needed. + EVT getSETCCWidenedResultTy(SDValue SetCC); + //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 3682c32460c66..c02b8960b36cb 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -512,8 +512,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, GetSplitOp(Op, Lo, Hi); } -void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, - SDValue &Hi) { +static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + // Split the inputs. + SDValue Lo, Hi, LL, LH, RL, RH; + std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); + std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); + + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + + return std::make_pair(Lo, Hi); +} + +void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH, CL, CH; SDLoc dl(N); GetSplitOp(N->getOperand(1), LL, LH); @@ -522,9 +538,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue Cond = N->getOperand(0); CL = CH = Cond; if (Cond.getValueType().isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. 
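Aside: the SplitVSETCC helper introduced above emits two narrow compares instead of one wide compare whose boolean result would then have to be split. A standalone scalar sketch of that shape, assuming an 8-lane unsigned less-than compare cut into two 4-lane halves:

#include <array>
#include <cstdint>
#include <cstdio>

// Emulate splitting a v8i32 SETCC(ult) into two v4i32 SETCCs, producing
// all-ones / all-zeros lanes per half, as SplitVSETCC does.
static std::array<uint32_t, 4> setccULT(const std::array<uint32_t, 4> &L,
                                        const std::array<uint32_t, 4> &R) {
  std::array<uint32_t, 4> Mask;
  for (int i = 0; i < 4; ++i)
    Mask[i] = L[i] < R[i] ? 0xFFFFFFFFu : 0u;
  return Mask;
}

int main() {
  std::array<uint32_t, 8> LHS = {1, 5, 3, 9, 2, 8, 7, 0};
  std::array<uint32_t, 8> RHS = {4, 4, 4, 4, 4, 4, 4, 4};

  // Split both operands and emit one narrow compare per half.
  std::array<uint32_t, 4> LL, LH, RL, RH;
  for (int i = 0; i < 4; ++i) {
    LL[i] = LHS[i];  LH[i] = LHS[i + 4];
    RL[i] = RHS[i];  RH[i] = RHS[i + 4];
  }
  std::array<uint32_t, 4> Lo = setccULT(LL, RL);
  std::array<uint32_t, 4> Hi = setccULT(LH, RH);

  for (uint32_t M : Lo) std::printf("%u ", M ? 1 : 0);
  for (uint32_t M : Hi) std::printf("%u ", M ? 1 : 0);
  std::printf("\n");   // 1 0 1 0 1 0 0 1
}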
+ else if (Cond.getOpcode() == ISD::SETCC) + std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. - if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) + else if (getTypeAction(Cond.getValueType()) == + TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d4fa20f35274c..5f167f8de1cfc 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); @@ -621,8 +622,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { } NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - Value = DAG.getNode(ISD::BUILD_VECTOR, dl, - Op.getNode()->getValueType(0), Vals); + Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals); } else { SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG); @@ -692,6 +692,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandUINT_TO_FLOAT(Op); case ISD::FNEG: return ExpandFNEG(Op); + case ISD::FSUB: + return ExpandFSUB(Op); case ISD::SETCC: return UnrollVSETCC(Op); case ISD::BITREVERSE: @@ -720,8 +722,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { assert(VT.isVector() && !Mask.getValueType().isVector() && Op1.getValueType() == Op2.getValueType() && "Invalid type"); - unsigned NumElem = VT.getVectorNumElements(); - // If we can't even use the basic vector operations of // AND,OR,XOR, we will have to scalarize the op. // Notice that the operation may be 'promoted' which means that it is @@ -745,8 +745,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { DAG.getConstant(0, DL, BitTy)); // Broadcast the mask so that the entire vector is all-one or all zero. - SmallVector<SDValue, 8> Ops(NumElem, Mask); - Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops); + Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask); // Bitcast the operands to be the same type as the mask. // This is needed when we select between FP types because @@ -1025,6 +1024,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) { return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandFSUB(SDValue Op) { + // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, + // we can defer this to operation legalization where it will be lowered as + // a+(-b). 
+ EVT VT = Op.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && + TLI.isOperationLegalOrCustom(ISD::FADD, VT)) + return Op; // Defer to LegalizeDAG + + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); @@ -1102,7 +1113,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { (EltVT.getSizeInBits()), dl, EltVT), DAG.getConstant(0, dl, EltVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6906f67ebacb6..78fddb5ce8f58 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + R = ScalarizeVecRes_VecInregOp(N); + break; case ISD::ANY_EXTEND: case ISD::BITREVERSE: case ISD::BSWAP: @@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: + case ISD::FCANONICALIZE: R = ScalarizeVecRes_UnaryOp(N); break; @@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) { LHS, DAG.getValueType(ExtVT)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) { + SDLoc DL(N); + SDValue Op = N->getOperand(0); + + EVT OpVT = Op.getValueType(); + EVT OpEltVT = OpVT.getVectorElementType(); + EVT EltVT = N->getValueType(0).getVectorElementType(); + + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + Op = GetScalarizedVector(Op); + } else { + Op = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } + + switch (N->getOpcode()) { + case ISD::ANY_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op); + } + + llvm_unreachable("Illegal extend_vector_inreg opcode"); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { // If the operand is wider than the vector element type then it is implicitly // truncated. Make that explicit here. 
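Aside: ExpandFSUB above leaves a vector FSUB alone whenever FNEG and FADD are legal or custom, because a - b can be lowered later as a + (-b). A quick standalone check of that identity on doubles (IEEE-754 defines subtraction as addition of the negated operand, so ordinary values, signed zeros and infinities behave the same either way):

#include <cstdio>

// a - b == a + (-b) for IEEE floating point, which is why the expansion can
// be deferred when FNEG/FADD are available for the vector type.
int main() {
  const double Pairs[][2] = {{1.5, 0.25}, {-0.0, 0.0}, {1e308, -1e308}};
  for (const auto &P : Pairs) {
    double Sub = P[0] - P[1];
    double AddNeg = P[0] + (-P[1]);
    std::printf("%g - %g : %g vs %g (%s)\n", P[0], P[1], Sub, AddNeg,
                Sub == AddNeg ? "equal" : "differ");
  }
}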
@@ -486,7 +520,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector<SDValue, 8> Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops); } /// If the input is a vector that needs to be scalarized, it must be <1 x ty>, @@ -637,6 +671,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -781,10 +816,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts); - Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); + Lo = DAG.getBuildVector(LoVT, dl, LoOps); SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end()); - Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); + Hi = DAG.getBuildVector(HiVT, dl, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, @@ -928,7 +963,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDLoc dl(N); SDValue InLo, InHi; - GetSplitVector(N0, InLo, InHi); + + if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(N0, InLo, InHi); + else + std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0); + EVT InLoVT = InLo.getValueType(); unsigned InNumElements = InLoVT.getVectorNumElements(); @@ -1372,7 +1412,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, } // Construct the Lo/Hi output using a BUILD_VECTOR. - Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); + Output = DAG.getBuildVector(NewVT, dl, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. Output = DAG.getUNDEF(NewVT); @@ -1466,8 +1506,15 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::FTRUNC: + case ISD::FCANONICALIZE: Res = SplitVecOp_UnaryOp(N); break; + + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + Res = SplitVecOp_ExtVecInRegOp(N); + break; } } @@ -1615,7 +1662,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { EltVT = MVT::i8; VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, VecVT.getVectorNumElements()); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps); + Vec = DAG.getBuildVector(VecVT, dl, ElementOps); } // Store the vector to the stack. @@ -1629,6 +1676,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { MachinePointerInfo(), EltVT); } +SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) { + SDValue Lo, Hi; + + // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so + // splitting the result has the same effect as splitting the input operand. 
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo) { EVT LoVT, HiVT; @@ -1881,7 +1938,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts); + return DAG.getBuildVector(N->getValueType(0), DL, Elts); } SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { @@ -2323,6 +2380,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(Opcode, DL, WidenVT, InOp); return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } + if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) { + // If both input and result vector types are of same width, extend + // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which + // accepts fewer elements in the result than in the input. + if (Opcode == ISD::SIGN_EXTEND) + return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT); + if (Opcode == ISD::ZERO_EXTEND) + return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT); + } } if (TLI.isTypeLegal(InWidenVT)) { @@ -2375,7 +2441,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -2430,7 +2496,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) { while (Ops.size() != WidenNumElts) Ops.push_back(DAG.getUNDEF(WidenSVT)); - return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, DL, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { @@ -2593,7 +2659,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps); + return DAG.getBuildVector(WidenVT, dl, NewOps); } SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { @@ -2663,7 +2729,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { @@ -2704,7 +2770,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -2814,6 +2880,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { WidenVT, N->getOperand(0)); } +// Return true if this is a node that could have two SETCCs as operands. +static inline bool isLogicalMaskOp(unsigned Opcode) { + switch (Opcode) { + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return true; + } + return false; +} + +// This is used just for the assert in convertMask(). Check that this either +// a SETCC or a previously handled SETCC by convertMask(). 
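Aside: several hunks above route widening and splitting through SIGN/ZERO_EXTEND_VECTOR_INREG, which read only the low lanes of the input and extend each of them into a wider element; that is also why splitting the result behaves like splitting the input operand. A scalar sketch of zero_extend_vector_inreg from v8i8 to v4i32, purely to show the lane mapping:

#include <array>
#include <cstdint>
#include <cstdio>

// zero_extend_vector_inreg v8i8 -> v4i32: the result has fewer, wider lanes,
// and only the low half of the input lanes is ever referenced.
int main() {
  std::array<uint8_t, 8> In = {0x80, 0x01, 0xFF, 0x7F, 0xAA, 0xBB, 0xCC, 0xDD};
  std::array<uint32_t, 4> Out;
  for (int i = 0; i < 4; ++i)
    Out[i] = In[i];               // lanes 4..7 are never read
  for (uint32_t V : Out)
    std::printf("0x%08x ", V);    // 0x00000080 0x00000001 0x000000ff 0x0000007f
  std::printf("\n");
}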
+#ifndef NDEBUG +static inline bool isSETCCorConvertedSETCC(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + N = N.getOperand(0); + else if (N.getOpcode() == ISD::CONCAT_VECTORS) { + for (unsigned i = 1; i < N->getNumOperands(); ++i) + if (!N->getOperand(i)->isUndef()) + return false; + N = N.getOperand(0); + } + + if (N.getOpcode() == ISD::TRUNCATE) + N = N.getOperand(0); + else if (N.getOpcode() == ISD::SIGN_EXTEND) + N = N.getOperand(0); + + return (N.getOpcode() == ISD::SETCC); +} +#endif + +// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT +// to ToMaskVT if needed with vector extension or truncation. +SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT, + EVT ToMaskVT) { + LLVMContext &Ctx = *DAG.getContext(); + + // Currently a SETCC or a AND/OR/XOR with two SETCCs are handled. + unsigned InMaskOpc = InMask->getOpcode(); + assert((InMaskOpc == ISD::SETCC || + (isLogicalMaskOp(InMaskOpc) && + isSETCCorConvertedSETCC(InMask->getOperand(0)) && + isSETCCorConvertedSETCC(InMask->getOperand(1)))) && + "Unexpected mask argument."); + + // Make a new Mask node, with a legal result VT. + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0; i < InMask->getNumOperands(); ++i) + Ops.push_back(InMask->getOperand(i)); + SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops); + + // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign + // extend or truncate is needed. + unsigned MaskScalarBits = MaskVT.getScalarSizeInBits(); + unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits(); + if (MaskScalarBits < ToMaskScalBits) { + EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), + MaskVT.getVectorNumElements()); + Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask); + } else if (MaskScalarBits > ToMaskScalBits) { + EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(), + MaskVT.getVectorNumElements()); + Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask); + } + + assert(Mask->getValueType(0).getScalarSizeInBits() == + ToMaskVT.getScalarSizeInBits() && + "Mask should have the right element size by now."); + + // Adjust Mask to the right number of elements. + unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements(); + if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) { + MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy); + Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask, + ZeroIdx); + } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) { + unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls); + EVT SubVT = Mask->getValueType(0); + SmallVector<SDValue, 16> SubConcatOps(NumSubVecs); + SubConcatOps[0] = Mask; + for (unsigned i = 1; i < NumSubVecs; ++i) + SubConcatOps[i] = DAG.getUNDEF(SubVT); + Mask = + DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps); + } + + assert((Mask->getValueType(0) == ToMaskVT) && + "A mask of ToMaskVT should have been produced by now."); + + return Mask; +} + +// Get the target mask VT, and widen if needed. 
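Aside: convertMask above reconciles a SETCC mask with the VSELECT's expected mask type in two steps, first matching the element width with a vector sign-extend or truncate, then matching the element count with extract_subvector or concat-with-undef. Below is a standalone sketch of the element-width step only, sign-extending an i8 lane mask to i32 lanes so that 0x00/0xFF stays all-zeros/all-ones:

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // A v4i8 compare result: each lane is 0x00 (false) or 0xFF (true).
  std::array<uint8_t, 4> NarrowMask = {0xFF, 0x00, 0xFF, 0x00};

  // SIGN_EXTEND each lane to i32 so the mask matches a v4i32 VSELECT:
  // 0xFF -> 0xFFFFFFFF and 0x00 -> 0x00000000, preserving the all-ones /
  // all-zeros lane convention the select relies on.
  std::array<uint32_t, 4> WideMask;
  for (int i = 0; i < 4; ++i)
    WideMask[i] = static_cast<uint32_t>(
        static_cast<int32_t>(static_cast<int8_t>(NarrowMask[i])));

  for (uint32_t M : WideMask)
    std::printf("0x%08x ", M);   // 0xffffffff 0x00000000 0xffffffff 0x00000000
  std::printf("\n");
}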
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) { + assert(SetCC->getOpcode() == ISD::SETCC); + LLVMContext &Ctx = *DAG.getContext(); + EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType()); + if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) + MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT); + return MaskVT; +} + +// This method tries to handle VSELECT and its mask by legalizing operands +// (which may require widening) and if needed adjusting the mask vector type +// to match that of the VSELECT. Without it, many cases end up with +// scalarization of the SETCC, with many unnecessary instructions. +SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) { + LLVMContext &Ctx = *DAG.getContext(); + SDValue Cond = N->getOperand(0); + + if (N->getOpcode() != ISD::VSELECT) + return SDValue(); + + if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode())) + return SDValue(); + + // If this is a splitted VSELECT that was previously already handled, do + // nothing. + if (Cond->getValueType(0).getScalarSizeInBits() != 1) + return SDValue(); + + EVT VSelVT = N->getValueType(0); + // Only handle vector types which are a power of 2. + if (!isPowerOf2_64(VSelVT.getSizeInBits())) + return SDValue(); + + // Don't touch if this will be scalarized. + EVT FinalVT = VSelVT; + while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector) + FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(), + FinalVT.getVectorNumElements() / 2); + if (FinalVT.getVectorNumElements() == 1) + return SDValue(); + + // If there is support for an i1 vector mask, don't touch. + if (Cond.getOpcode() == ISD::SETCC) { + EVT SetCCOpVT = Cond->getOperand(0).getValueType(); + while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal) + SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT); + EVT SetCCResVT = getSetCCResultType(SetCCOpVT); + if (SetCCResVT.getScalarSizeInBits() == 1) + return SDValue(); + } + + // Get the VT and operands for VSELECT, and widen if needed. + SDValue VSelOp1 = N->getOperand(1); + SDValue VSelOp2 = N->getOperand(2); + if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) { + VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT); + VSelOp1 = GetWidenedVector(VSelOp1); + VSelOp2 = GetWidenedVector(VSelOp2); + } + + // The mask of the VSELECT should have integer elements. + EVT ToMaskVT = VSelVT; + if (!ToMaskVT.getScalarType().isInteger()) + ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger(); + + SDValue Mask; + if (Cond->getOpcode() == ISD::SETCC) { + EVT MaskVT = getSETCCWidenedResultTy(Cond); + Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else if (isLogicalMaskOp(Cond->getOpcode()) && + Cond->getOperand(0).getOpcode() == ISD::SETCC && + Cond->getOperand(1).getOpcode() == ISD::SETCC) { + // Cond is (AND/OR/XOR (SETCC, SETCC)) + SDValue SETCC0 = Cond->getOperand(0); + SDValue SETCC1 = Cond->getOperand(1); + EVT VT0 = getSETCCWidenedResultTy(SETCC0); + EVT VT1 = getSETCCWidenedResultTy(SETCC1); + unsigned ScalarBits0 = VT0.getScalarSizeInBits(); + unsigned ScalarBits1 = VT1.getScalarSizeInBits(); + unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits(); + EVT MaskVT; + // If the two SETCCs have different VTs, either extend/truncate one of + // them to the other "towards" ToMaskVT, or truncate one and extend the + // other to ToMaskVT. + if (ScalarBits0 != ScalarBits1) { + EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1); + EVT WideVT = ((NarrowVT == VT0) ? 
VT1 : VT0); + if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits()) + MaskVT = WideVT; + else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits()) + MaskVT = NarrowVT; + else + MaskVT = ToMaskVT; + } else + // If the two SETCCs have the same VT, don't change it. + MaskVT = VT0; + + // Make new SETCCs and logical nodes. + SETCC0 = convertMask(SETCC0, VT0, MaskVT); + SETCC1 = convertMask(SETCC1, VT1, MaskVT); + Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1); + + // Convert the logical op for VSELECT if needed. + Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else + return SDValue(); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2); +} + SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); @@ -2821,6 +3093,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); if (CondVT.isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); @@ -3093,7 +3368,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { @@ -3144,7 +3419,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { @@ -3565,10 +3840,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } - void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST) { // The strategy assumes that we can efficiently store power-of-two widths. 
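Aside: the VSELECT paths in these hunks (ExpandSELECT and the new WidenVSELECTAndMask) keep masks whose lanes are all-ones or all-zeros precisely so the select can stay bitwise, (Op1 & Mask) | (Op2 & ~Mask), instead of being scalarized. A scalar sketch of that final step, assuming i32 lanes:

#include <array>
#include <cstdint>
#include <cstdio>

// With an all-ones/all-zeros integer mask, vselect(Mask, A, B) is just
// (A & Mask) | (B & ~Mask) lane by lane -- no branches, no scalarization.
int main() {
  std::array<uint32_t, 4> Mask = {0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u};
  std::array<uint32_t, 4> A = {10, 20, 30, 40};
  std::array<uint32_t, 4> B = {100, 200, 300, 400};

  std::array<uint32_t, 4> R;
  for (int i = 0; i < 4; ++i)
    R[i] = (A[i] & Mask[i]) | (B[i] & ~Mask[i]);

  for (uint32_t V : R)
    std::printf("%u ", V);   // 10 200 30 400
  std::printf("\n");
}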
@@ -3737,5 +4011,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) Ops[Idx] = FillVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + return DAG.getBuildVector(NVT, dl, Ops); } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ded8e68fcbce5..a1d70ab6f036f 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); - I != E; ++I) - RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF); ParallelLiveRanges = 0; HorizontalVerticalBalance = 0; @@ -364,16 +362,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) { return RegBalance; if (RawPressure) { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) RegBalance += rawRegPressureDelta(SU, RC->getID()); - } } else { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) { if ((RegPressure[RC->getID()] + rawRegPressureDelta(SU, RC->getID()) > 0) && (RegPressure[RC->getID()] + diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 3549ccd9e345b..e923e30e50377 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -422,11 +422,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. if (N->isMachineOpcode()) { - if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameDestroyOpcode()) { + if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { ++NestLevel; - } else if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameSetupOpcode()) { + } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { if (NestLevel == 0) return false; --NestLevel; @@ -480,12 +478,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. 
if (N->isMachineOpcode()) { - if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameDestroyOpcode()) { + if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { ++NestLevel; MaxNest = std::max(MaxNest, NestLevel); - } else if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameSetupOpcode()) { + } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { assert(NestLevel != 0); --NestLevel; if (NestLevel == 0) @@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) { if (!LiveRegDefs[CallResource]) for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) if (Node->isMachineOpcode() && - Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { unsigned NestLevel = 0; unsigned MaxNest = 0; SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII); @@ -755,7 +751,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -826,7 +822,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { ++NumLiveRegs; LiveRegDefs[CallResource] = SU; LiveRegGens[CallResource] = CallSeqEndForStart[SU]; @@ -839,7 +835,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -1305,7 +1301,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { // If we're in the middle of scheduling a call, don't begin scheduling // another call. Also, don't allow any physical registers to be live across // the call. - if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) || + (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) { // Check the special calling-sequence resource. unsigned CallResource = TRI->getNumRegs(); if (LiveRegDefs[CallResource]) { @@ -1659,9 +1656,8 @@ public: RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) - RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF); } } @@ -1788,7 +1784,7 @@ public: } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(ScheduleDAG *DAG) const override { + LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override { // Emulate pop() without clobbering NodeQueueIds. 
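Aside: the scheduler hunks above (IsChainDependent, FindCallSeqStart, DelayForLiveRegsBottomUp) walk the glued chain bottom-up, counting a lowered CALLSEQ_END as entering a call sequence and a CALLSEQ_BEGIN as leaving one; the begin that brings the nesting level back to zero is the start being searched for. The snippet below is a loose standalone model of that counter using a hypothetical Op enum in place of the real call-frame pseudo opcodes.

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the call-frame setup/destroy pseudo opcodes.
enum Op { CallSetup, CallDestroy, Other };

// Walking in reverse order, a CALLSEQ_END raises the nesting level and a
// CALLSEQ_BEGIN lowers it; the begin that returns the level to zero is the
// start of the outermost call sequence we are currently inside of.
static int findCallSeqStart(const std::vector<Op> &Rev) {
  unsigned NestLevel = 0;
  for (int i = 0, e = static_cast<int>(Rev.size()); i != e; ++i) {
    if (Rev[i] == CallDestroy)
      ++NestLevel;
    else if (Rev[i] == CallSetup) {
      if (NestLevel == 0)
        return -1;          // not inside a call sequence at all
      if (--NestLevel == 0)
        return i;           // matching CALLSEQ_BEGIN found
    }
  }
  return -1;
}

int main() {
  // Reverse order: END, (nested END ... BEGIN), BEGIN, ...
  std::vector<Op> Rev = {CallDestroy, Other, CallDestroy, CallSetup,
                         Other, CallSetup, Other};
  std::printf("matching begin at reverse index %d\n", findCallSeqStart(Rev));
  // prints 5
}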
std::vector<SUnit*> DumpQueue = Queue; SF DumpPicker = Picker; @@ -1924,19 +1920,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const { // Register Pressure Tracking //===----------------------------------------------------------------------===// -void RegReductionPQBase::dumpRegPressure() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; +LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const { + for (const TargetRegisterClass *RC : TRI->regclasses()) { unsigned Id = RC->getID(); unsigned RP = RegPressure[Id]; if (!RP) continue; DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / " << RegLimit[Id] << '\n'); } -#endif } +#endif bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { if (!TLI) @@ -2092,7 +2086,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { RegPressure[RCId] -= Cost; } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } void RegReductionPQBase::unscheduledNode(SUnit *SU) { @@ -2172,7 +2166,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3be622f8c179a..3c8526ebb7029 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, } void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) if (!SU->getNode()) { dbgs() << "PHYS REG COPY\n"; @@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (!N->getHasDebugValue()) return; - // Opportunistically insert immediate dbg_value uses, i.e. those with source - // order number right after the N. + // Opportunistically insert immediate dbg_value uses, i.e. those with the same + // source order number as N. MachineBasicBlock *BB = Emitter.getBlock(); MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N); @@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (DVs[i]->isInvalidated()) continue; unsigned DVOrder = DVs[i]->getOrder(); - if (!Order || DVOrder == ++Order) { + if (!Order || DVOrder == Order) { MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap); if (DbgMI) { Orders.push_back(std::make_pair(DVOrder, DbgMI)); @@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); - Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned, - VRBaseMap); + Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap); // Remember the source order of the inserted instruction. 
if (HasDbg) ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e225ba8703b77..003ea5030bfce 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -289,28 +289,28 @@ static int isSignedOp(ISD::CondCode Opcode) { } ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. - // If the N and U bits get set then the resultant comparison DOES suddenly - // care about orderedness, and is true when ordered. + // If the N and U bits get set, then the resultant comparison DOES suddenly + // care about orderedness, and it is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. - if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT + if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; @@ -318,7 +318,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. - if (isInteger) { + if (IsInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT @@ -871,11 +871,13 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) DbgInfo = new SDDbgInfo(); } -void SelectionDAG::init(MachineFunction &mf) { - MF = &mf; +void SelectionDAG::init(MachineFunction &NewMF, + OptimizationRemarkEmitter &NewORE) { + MF = &NewMF; + ORE = &NewORE; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); - Context = &mf.getFunction()->getContext(); + Context = &MF->getFunction()->getContext(); } SelectionDAG::~SelectionDAG() { @@ -1994,8 +1996,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, /// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows /// us to only collect the known bits that are shared by the requested vector /// elements. -/// TODO: We only support DemandedElts on a few opcodes so far, the remainder -/// should be added when they become necessary. 
void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, const APInt &DemandedElts, unsigned Depth) const { @@ -2251,10 +2251,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownZero2.countLeadingOnes(), BitWidth) - BitWidth; - TrailZ = std::min(TrailZ, BitWidth); - LeadZ = std::min(LeadZ, BitWidth); - KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | - APInt::getHighBitsSet(BitWidth, LeadZ); + KnownZero.clearAllBits(); + KnownZero.setLowBits(std::min(TrailZ, BitWidth)); + KnownZero.setHighBits(std::min(LeadZ, BitWidth)); break; } case ISD::UDIV: { @@ -2272,7 +2271,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); - KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); + KnownZero.setHighBits(LeadZ); break; } case ISD::SELECT: @@ -2297,10 +2296,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) @@ -2312,14 +2307,14 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + KnownZero.setBitsFrom(1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + KnownZero.setBitsFrom(1); break; case ISD::SHL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { @@ -2328,7 +2323,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownZero = KnownZero << *ShAmt; KnownOne = KnownOne << *ShAmt; // Low bits are known zero. - KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue()); + KnownZero.setLowBits(ShAmt->getZExtValue()); } break; case ISD::SRL: @@ -2338,8 +2333,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownZero = KnownZero.lshr(*ShAmt); KnownOne = KnownOne.lshr(*ShAmt); // High bits are known zero. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); - KnownZero |= HighBits; + KnownZero.setHighBits(ShAmt->getZExtValue()); } break; case ISD::SRA: @@ -2350,13 +2344,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownOne = KnownOne.lshr(*ShAmt); // If we know the value of the sign bit, then we know it is copied across // the high bits by the shift amount. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); APInt SignBit = APInt::getSignBit(BitWidth); SignBit = SignBit.lshr(*ShAmt); // Adjust to where it is now in the mask. if (KnownZero.intersects(SignBit)) { - KnownZero |= HighBits; // New bits are known zero. + KnownZero.setHighBits(ShAmt->getZExtValue());// New bits are known zero. } else if (KnownOne.intersects(SignBit)) { - KnownOne |= HighBits; // New bits are known one. + KnownOne.setHighBits(ShAmt->getZExtValue()); // New bits are known one. 
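Aside: the computeKnownBits hunks above replace "KnownZero |= APInt::getLowBitsSet/getHighBitsSet(...)" with the in-place setLowBits/setHighBits/setBitsFrom helpers. The standalone sketch below models the shift cases with a pair of uint32_t masks under the same invariant (no bit set in both masks): SHL makes the low ShAmt bits known zero, SRL makes the high ShAmt bits known zero.

#include <cstdint>
#include <cstdio>

// Minimal known-bits model: Zero has a 1 where the bit is known to be 0,
// One has a 1 where the bit is known to be 1.
struct KnownBits32 {
  uint32_t Zero = 0, One = 0;
};

static KnownBits32 knownForShl(KnownBits32 K, unsigned ShAmt) {
  K.Zero <<= ShAmt;
  K.One <<= ShAmt;
  K.Zero |= (1u << ShAmt) - 1;    // low ShAmt bits are now known zero
  return K;
}

static KnownBits32 knownForSrl(KnownBits32 K, unsigned ShAmt) {
  K.Zero >>= ShAmt;
  K.One >>= ShAmt;
  K.Zero |= ~(~0u >> ShAmt);      // high ShAmt bits are now known zero
  return K;
}

int main() {
  KnownBits32 K;                  // nothing known about the operand
  KnownBits32 Shl = knownForShl(K, 4);
  KnownBits32 Srl = knownForSrl(K, 4);
  std::printf("shl 4: zero=%08x  srl 4: zero=%08x\n", Shl.Zero, Srl.Zero);
  // shl 4: zero=0000000f  srl 4: zero=f0000000
}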
} } break; @@ -2401,9 +2394,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: { - unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); - KnownOne.clearAllBits(); + KnownZero.setBitsFrom(Log2_32(BitWidth)+1); break; } case ISD::LOAD: { @@ -2412,26 +2403,39 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + KnownZero.setBitsFrom(MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { if (LD->getExtensionType() == ISD::NON_EXTLOAD) computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); } break; } + case ISD::ZERO_EXTEND_VECTOR_INREG: { + EVT InVT = Op.getOperand(0).getValueType(); + unsigned InBits = InVT.getScalarSizeInBits(); + KnownZero = KnownZero.trunc(InBits); + KnownOne = KnownOne.trunc(InBits); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, + DemandedElts.zext(InVT.getVectorNumElements()), + Depth + 1); + KnownZero = KnownZero.zext(BitWidth); + KnownOne = KnownOne.zext(BitWidth); + KnownZero.setBitsFrom(InBits); + break; + } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, Depth + 1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); - KnownZero |= NewBits; + KnownZero.setBitsFrom(InBits); break; } + // TODO ISD::SIGN_EXTEND_VECTOR_INREG case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); @@ -2478,10 +2482,21 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, } case ISD::FGETSIGN: // All bits are zero except the low bit. - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); + KnownZero.setBitsFrom(1); break; - - case ISD::SUB: { + case ISD::USUBO: + case ISD::SSUBO: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + KnownZero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; + case ISD::SUB: + case ISD::SUBC: { if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is @@ -2499,13 +2514,40 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. - KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); + KnownZero.setHighBits(NLZ2); } } } - LLVM_FALLTHROUGH; + + // If low bits are know to be zero in both operands, then we know they are + // going to be 0 in the result. Both addition and complement operations + // preserve the low zero bits. 
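The SUB/SUBC case above relies on the rule that the top countLeadingZeros(C) bits of C - X are zero whenever X is known to use strictly fewer significant bits than C, so no wrap-around can occur. A small standalone check of that claim for the 20 - X example mentioned in the comment (the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Count leading zeros of a 32-bit value (portable, loop-based helper).
static unsigned clz32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(V & Bit); Bit >>= 1)
    ++N;
  return N;
}

int main() {
  const uint32_t C = 20;                      // NLZ(C) == 27
  const unsigned NLZ = clz32(C);
  const uint32_t TopMask = ~0u << (32 - NLZ); // the bits claimed to be zero

  // Any X with its top NLZ + 1 = 28 bits known zero (i.e. X < 16) is small
  // enough that 20 - X cannot wrap, so the top 27 bits of the result are 0.
  for (uint32_t X = 0; X < 16; ++X)
    assert(((C - X) & TopMask) == 0);
  return 0;
}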
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + unsigned KnownZeroLow = KnownZero2.countTrailingOnes(); + if (KnownZeroLow == 0) + break; + + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + KnownZeroLow = std::min(KnownZeroLow, + KnownZero2.countTrailingOnes()); + KnownZero.setBits(0, KnownZeroLow); + break; } + case ISD::UADDO: + case ISD::SADDO: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + KnownZero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; case ISD::ADD: + case ISD::ADDC: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the @@ -2526,19 +2568,19 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownZeroLow = std::min(KnownZeroLow, KnownZero2.countTrailingOnes()); - if (Opcode == ISD::ADD) { - KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow); - if (KnownZeroHigh > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1); + if (Opcode == ISD::ADDE) { + // With ADDE, a carry bit may be added in, so we can only use this + // information if we know (at least) that the low two bits are clear. + // We then return to the caller that the low bit is unknown but that + // other bits are known zero. + if (KnownZeroLow >= 2) + KnownZero.setBits(1, KnownZeroLow); break; } - // With ADDE, a carry bit may be added in, so we can only use this - // information if we know (at least) that the low two bits are clear. We - // then return to the caller that the low bit is unknown but that other bits - // are known zero. - if (KnownZeroLow >= 2) // ADDE - KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow); + KnownZero.setLowBits(KnownZeroLow); + if (KnownZeroHigh > 1) + KnownZero.setHighBits(KnownZeroHigh - 1); break; } case ISD::SREM: @@ -2591,7 +2633,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); + KnownZero.clearAllBits(); + KnownZero.setHighBits(Leaders); break; } case ISD::EXTRACT_ELEMENT: { @@ -2673,6 +2716,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, } break; } + case ISD::BITREVERSE: { + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + KnownZero = KnownZero2.reverseBits(); + KnownOne = KnownOne2.reverseBits(); + break; + } case ISD::BSWAP: { computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, Depth + 1); @@ -2680,12 +2730,62 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, KnownOne = KnownOne2.byteSwap(); break; } - case ISD::SMIN: - case ISD::SMAX: - case ISD::UMIN: + case ISD::ABS: { + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + + // If the source's MSB is zero then we know the rest of the bits already. + if (KnownZero2[BitWidth - 1]) { + KnownZero = KnownZero2; + KnownOne = KnownOne2; + break; + } + + // We only know that the absolute values's MSB will be zero iff there is + // a set bit that isn't the sign bit (otherwise it could be INT_MIN). 
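The ADD/ADDE handling above keeps the low zero bits that are common to both operands, and for ADDE drops bit 0 because an incoming carry may set it. A standalone illustration of both points, using the 8 + (X << 3) example from the comment (helper name is illustrative):

#include <cassert>
#include <cstdint>

// Number of trailing zero bits of V (32 if V == 0).
static unsigned lowZeroBits(uint32_t V) {
  unsigned N = 0;
  while (N < 32 && !(V & (1u << N)))
    ++N;
  return N;
}

int main() {
  // ADD: if the low K bits of both operands are zero, the low K bits of the
  // sum are zero too, e.g. 8 + (X << 3) always has its low 3 bits clear.
  const uint32_t A = 8;                   // low 3 bits zero
  for (uint32_t X = 0; X < 1000; ++X) {
    uint32_t B = X << 3;                  // low 3 bits zero
    assert(lowZeroBits(A) >= 3 && lowZeroBits(B) >= 3);
    assert(((A + B) & 0b111) == 0);
  }

  // ADDE: an unknown incoming carry can flip bit 0, so only bits [1, K) stay
  // known zero, and only when K >= 2.
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) {
    uint32_t Sum = 8 + 16 + Carry;        // both operands have low 3 bits zero
    assert((Sum & 0b110) == 0);           // bits 1..2 are still known zero
  }
  return 0;
}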
+ KnownOne2.clearBit(BitWidth - 1); + if (KnownOne2.getBoolValue()) { + KnownZero = APInt::getSignBit(BitWidth); + break; + } + break; + } + case ISD::UMIN: { + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Depth + 1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + + // UMIN - we know that the result will have the maximum of the + // known zero leading bits of the inputs. + unsigned LeadZero = KnownZero.countLeadingOnes(); + LeadZero = std::max(LeadZero, KnownZero2.countLeadingOnes()); + + KnownZero &= KnownZero2; + KnownOne &= KnownOne2; + KnownZero.setHighBits(LeadZero); + break; + } case ISD::UMAX: { computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + Depth + 1); + + // UMAX - we know that the result will have the maximum of the + // known one leading bits of the inputs. + unsigned LeadOne = KnownOne.countLeadingOnes(); + LeadOne = std::max(LeadOne, KnownOne2.countLeadingOnes()); + + KnownZero &= KnownZero2; + KnownOne &= KnownOne2; + KnownOne.setHighBits(LeadOne); + break; + } + case ISD::SMIN: + case ISD::SMAX: { + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Depth + 1); // If we don't know any bits, early out. if (!KnownOne && !KnownZero) break; @@ -2699,7 +2799,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. - KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); + KnownZero.setLowBits(Log2_32(Align)); break; } break; @@ -2712,13 +2812,48 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. - TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); + TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, DemandedElts, + *this, Depth); break; } assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } +SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, + SDValue N1) const { + // X + 0 never overflow + if (isNullConstant(N1)) + return OFK_Never; + + APInt N1Zero, N1One; + computeKnownBits(N1, N1Zero, N1One); + if (N1Zero.getBoolValue()) { + APInt N0Zero, N0One; + computeKnownBits(N0, N0Zero, N0One); + + bool overflow; + (~N0Zero).uadd_ov(~N1Zero, overflow); + if (!overflow) + return OFK_Never; + } + + // mulhi + 1 never overflow + if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && + (~N1Zero & 0x01) == ~N1Zero) + return OFK_Never; + + if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { + APInt N0Zero, N0One; + computeKnownBits(N0, N0Zero, N0One); + + if ((~N0Zero & 0x01) == ~N0Zero) + return OFK_Never; + } + + return OFK_Sometime; +} + bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { EVT OpVT = Val.getValueType(); unsigned BitWidth = OpVT.getScalarSizeInBits(); @@ -2745,7 +2880,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Are all operands of a build vector constant powers of two? 
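computeOverflowKind above proves "never overflows" by adding upper bounds: ~KnownZero is the largest value an operand could possibly take, so if the two upper bounds add without wrapping, the real unsigned addition cannot wrap either (the source uses APInt::uadd_ov for that test). A plain 32-bit model of the check, with illustrative names:

#include <cassert>
#include <cstdint>

static bool neverOverflowsUAdd(uint32_t KnownZeroA, uint32_t KnownZeroB) {
  uint32_t MaxA = ~KnownZeroA; // upper bound on operand A
  uint32_t MaxB = ~KnownZeroB; // upper bound on operand B
  uint32_t Sum = MaxA + MaxB;
  return Sum >= MaxA;          // the upper bounds did not wrap around
}

int main() {
  // A < 2^16 and B < 2^16: their sum fits in 17 bits, so it cannot wrap.
  uint32_t KZ_A = 0xFFFF0000u, KZ_B = 0xFFFF0000u;
  assert(neverOverflowsUAdd(KZ_A, KZ_B));

  // If nothing is known about either operand, overflow cannot be ruled out.
  assert(!neverOverflowsUAdd(0, 0));
  return 0;
}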
if (Val.getOpcode() == ISD::BUILD_VECTOR) - if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) { + if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E)) return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2(); return false; @@ -2764,6 +2899,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { EVT VT = Op.getValueType(); + APInt DemandedElts = VT.isVector() + ? APInt::getAllOnesValue(VT.getVectorNumElements()) + : APInt(1, 1); + return ComputeNumSignBits(Op, DemandedElts, Depth); +} + +unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, + unsigned Depth) const { + EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); unsigned Tmp, Tmp2; @@ -2772,6 +2916,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { if (Depth == 6) return 1; // Limit search depth. + if (!DemandedElts) + return 1; // No demanded elts, better to assume we don't know anything. + switch (Op.getOpcode()) { default: break; case ISD::AssertSext: @@ -2786,7 +2933,28 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return Val.getNumSignBits(); } + case ISD::BUILD_VECTOR: + Tmp = VTBits; + for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) { + if (!DemandedElts[i]) + continue; + + SDValue SrcOp = Op.getOperand(i); + Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1); + + // BUILD_VECTOR can implicitly truncate sources, we must handle this. + if (SrcOp.getValueSizeInBits() != VTBits) { + assert(SrcOp.getValueSizeInBits() > VTBits && + "Expected BUILD_VECTOR implicit truncation"); + unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; + Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); + } + Tmp = std::min(Tmp, Tmp2); + } + return Tmp; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; @@ -2799,7 +2967,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return std::max(Tmp, Tmp2); case ISD::SRA: - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) { APInt ShiftVal = C->getAPIntValue(); @@ -2887,6 +3055,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { } break; case ISD::ADD: + case ISD::ADDC: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); @@ -2961,19 +3130,63 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + + ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { + // If we know the element index, split the demand between the + // source vector and the inserted element. 
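ComputeNumSignBits treats SRA X, C as adding C redundant sign bits, clamped to the bit width, because the arithmetic shift copies the sign bit into every vacated position. A standalone check of that bound on a few values; the helper counts how many top bits equal the sign bit, and right-shifting a negative int is assumed to be arithmetic, as on mainstream compilers:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Number of top bits of V that are copies of the sign bit (always >= 1).
static unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == ((V >> 31) & 1))
    ++N;
  return N;
}

int main() {
  int32_t Vals[] = {123456, -98765, -1, 7};
  for (int32_t V : Vals) {
    unsigned Before = numSignBits(V);
    unsigned After = numSignBits(V >> 5); // SRA by 5
    // The SRA rule: the shifted value has at least Before + 5 sign bits,
    // clamped to the 32-bit width.
    assert(After >= std::min(Before + 5, 32u));
  }
  return 0;
}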
+ unsigned EltIdx = CEltNo->getZExtValue(); + + // If we demand the inserted element then get its sign bits. + Tmp = UINT_MAX; + if (DemandedElts[EltIdx]) + Tmp = ComputeNumSignBits(InVal, Depth + 1); + + // If we demand the source vector then get its sign bits, and determine + // the minimum. + APInt VectorElts = DemandedElts; + VectorElts.clearBit(EltIdx); + if (!!VectorElts) { + Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + } else { + // Unknown element index, so ignore DemandedElts and demand them all. + Tmp = ComputeNumSignBits(InVec, Depth + 1); + Tmp2 = ComputeNumSignBits(InVal, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } case ISD::EXTRACT_VECTOR_ELT: { - // At the moment we keep this simple and skip tracking the specific - // element. This way we get the lowest common denominator for all elements - // of the vector. - // TODO: get information for given vector element + SDValue InVec = Op.getOperand(0); + SDValue EltNo = Op.getOperand(1); + EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); + const unsigned NumSrcElts = VecVT.getVectorNumElements(); + // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know // anything about sign bits. But if the sizes match we can derive knowledge // about sign bits from the vector operand. - if (BitWidth == EltBitWidth) - return ComputeNumSignBits(Op.getOperand(0), Depth+1); - break; + if (BitWidth != EltBitWidth) + break; + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); + ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); } case ISD::EXTRACT_SUBVECTOR: return ComputeNumSignBits(Op.getOperand(0), Depth + 1); @@ -3008,14 +3221,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) { - unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); - if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); + unsigned NumBits = + TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth); + if (NumBits > 1) + FirstAnswer = std::max(FirstAnswer, NumBits); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. APInt KnownZero, KnownOne; - computeKnownBits(Op, KnownZero, KnownOne, Depth); + computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth); APInt Mask; if (KnownZero.isNegative()) { // sign bit is 0 @@ -3054,6 +3269,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { if (getTarget().Options.NoNaNsFPMath) return true; + if (const BinaryWithFlagsSDNode *BF = dyn_cast<BinaryWithFlagsSDNode>(Op)) + return BF->Flags.hasNoNaNs(); + // If the value is a constant, we can obviously see if it is a NaN or not. 
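The INSERT/EXTRACT_VECTOR_ELT cases above use DemandedElts to reason about a single lane when the element index is a known constant, instead of taking the conservative minimum over every lane. A toy standalone model of why that helps, with plain int32_t lanes and illustrative names:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Same sign-bit counter as in the SRA sketch above.
static unsigned numSignBits32(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == ((V >> 31) & 1))
    ++N;
  return N;
}

static unsigned signBitsOfExtract(const std::vector<int32_t> &Vec,
                                  int KnownIdx /* -1 if unknown */) {
  if (KnownIdx >= 0)
    return numSignBits32(Vec[KnownIdx]);  // only one lane is demanded
  unsigned Min = 32;                      // unknown index: demand every lane
  for (int32_t E : Vec)
    Min = std::min(Min, numSignBits32(E));
  return Min;
}

int main() {
  std::vector<int32_t> V = {3, -4, 100000, -1};
  // A constant index lets us report far more sign bits than the
  // conservative all-lanes answer, which is limited by lane 2.
  assert(signBitsOfExtract(V, 3) == 32);
  assert(signBitsOfExtract(V, -1) == 15);
  return 0;
}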
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) return !C->getValueAPF().isNaN(); @@ -3206,6 +3424,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT); break; + case ISD::ABS: + return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); + case ISD::BITREVERSE: + return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); @@ -3220,6 +3444,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); + case ISD::FP16_TO_FP: { + bool Ignored; + APFloat FPV(APFloat::IEEEhalf(), + (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); + + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. + (void)FPV.convert(EVTToAPFloatSemantics(VT), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstantFP(FPV, DL, VT); + } } } @@ -3261,17 +3496,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { - integerPart x[2]; bool ignored; - static_assert(integerPartWidth >= 64, "APFloat parts too small!"); + APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT); // FIXME need to be more flexible about rounding mode. - APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), - Opcode==ISD::FP_TO_SINT, - APFloat::rmTowardZero, &ignored); - if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual + APFloat::opStatus s = + V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored); + if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual break; - APInt api(VT.getSizeInBits(), x); - return getConstant(api, DL, VT); + return getConstant(IntVal, DL, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) @@ -3281,6 +3513,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; + case ISD::FP_TO_FP16: { + bool Ignored; + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. 
+ (void)V.convert(APFloat::IEEEhalf(), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstant(V.bitcastToAPInt(), DL, VT); + } } } @@ -3303,6 +3543,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: + case ISD::ABS: + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: @@ -3420,6 +3662,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; + case ISD::ABS: + assert(VT.isInteger() && VT == Operand.getValueType() && + "Invalid ABS!"); + if (OpOpcode == ISD::UNDEF) + return getUNDEF(VT); + break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BSWAP!"); @@ -3569,6 +3817,30 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, GA->getOffset() + uint64_t(Offset)); } +bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { + switch (Opcode) { + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: { + // If a divisor is zero/undef or any element of a divisor vector is + // zero/undef, the whole op is undef. + assert(Ops.size() == 2 && "Div/rem should have 2 operands"); + SDValue Divisor = Ops[1]; + if (Divisor.isUndef() || isNullConstant(Divisor)) + return true; + + return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && + any_of(Divisor->op_values(), + [](SDValue V) { return V.isUndef() || isNullConstant(V); }); + // TODO: Handle signed overflow. + } + // TODO: Handle oversized shifts. + default: + return false; + } +} + SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, SDNode *Cst1, SDNode *Cst2) { @@ -3578,6 +3850,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)})) + return getUNDEF(VT); + // Handle the case of two scalars. if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) { if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) { @@ -3645,6 +3920,9 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, Ops)) + return getUNDEF(VT); + // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? if (!VT.isVector()) return SDValue(); @@ -3676,7 +3954,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. 
EVT LegalSVT = VT.getScalarType(); - if (LegalSVT.isInteger()) { + if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(VT.getScalarType())) return SDValue(); @@ -3910,35 +4188,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending - auto SignExtendInReg = [&](APInt Val) { + auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) { unsigned FromBits = EVT.getScalarSizeInBits(); Val <<= Val.getBitWidth() - FromBits; Val = Val.ashr(Val.getBitWidth() - FromBits); - return getConstant(Val, DL, VT.getScalarType()); + return getConstant(Val, DL, ConstantVT); }; if (N1C) { const APInt &Val = N1C->getAPIntValue(); - return SignExtendInReg(Val); + return SignExtendInReg(Val, VT); } if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { SmallVector<SDValue, 8> Ops; + llvm::EVT OpVT = N1.getOperand(0).getValueType(); for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); if (Op.isUndef()) { - Ops.push_back(getUNDEF(VT.getScalarType())); + Ops.push_back(getUNDEF(OpVT)); continue; } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - APInt Val = C->getAPIntValue(); - Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); - Ops.push_back(SignExtendInReg(Val)); - continue; - } - break; + ConstantSDNode *C = cast<ConstantSDNode>(Op); + APInt Val = C->getAPIntValue(); + Ops.push_back(SignExtendInReg(Val, OpVT)); } - if (Ops.size() == VT.getVectorNumElements()) - return getBuildVector(VT, DL, Ops); + return getBuildVector(VT, DL, Ops); } break; } @@ -4040,6 +4314,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; + // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF. + if (N1.isUndef()) + return getUNDEF(VT); + + // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of + // the concat have the same type as the extract. + if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && + N1.getNumOperands() > 0 && + VT == N1.getOperand(0).getValueType()) { + unsigned Factor = VT.getVectorNumElements(); + return N1.getOperand(N2C->getZExtValue() / Factor); + } + // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created // during shuffle legalization. 
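The SignExtendInReg lambda above folds SIGN_EXTEND_INREG on a constant by shifting the low FromBits field up to the top of the word and arithmetically shifting it back down. The same trick on a plain 32-bit integer; two's complement representation and an arithmetic >> on signed values are assumed:

#include <cassert>
#include <cstdint>

static int32_t signExtendInReg(int32_t Val, unsigned FromBits) {
  unsigned Shift = 32 - FromBits;
  // Shift the field to the top, then arithmetic-shift it back down so the
  // old bit (FromBits - 1) is replicated into all higher positions.
  return (int32_t)((uint32_t)Val << Shift) >> Shift;
}

int main() {
  assert(signExtendInReg(0x00FF, 8) == -1);       // 0xFF as an i8 is -1
  assert(signExtendInReg(0x007F, 8) == 127);      // sign bit clear: unchanged
  assert(signExtendInReg(0x8000, 16) == -32768);  // i16 sign bit propagates
  assert(signExtendInReg(0x51234, 16) == 0x1234); // only the low 16 bits matter
  return 0;
}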
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) && @@ -4943,11 +5230,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5004,11 +5291,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5066,11 +5353,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -7049,6 +7336,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const { return Seen; } +/// Return true if the only users of N are contained in Nodes. +bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) { + bool Seen = false; + for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { + SDNode *User = *I; + if (llvm::any_of(Nodes, + [&User](const SDNode *Node) { return User == Node; })) + Seen = true; + else + return false; + } + + return Seen; +} + /// isOperand - Return true if this node is an operand of N. /// bool SDValue::isOperandOf(const SDNode *N) const { @@ -7070,21 +7372,39 @@ bool SDNode::isOperandOf(const SDNode *N) const { /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. +/// +/// Note that we only need to examine chains when we're searching for +/// side-effects; SelectionDAG requires that all side-effects are represented +/// by chains, even if another operand would force a specific ordering. This +/// constraint is necessary to allow transformations like splitting loads. 
bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, - unsigned Depth) const { + unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; - // If this is a token factor, all inputs to the TF happen in parallel. If any - // of the operands of the TF does not reach dest, then we cannot do the xform. + // If this is a token factor, all inputs to the TF happen in parallel. if (getOpcode() == ISD::TokenFactor) { - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) - return false; - return true; + // First, try a shallow search. + if (is_contained((*this)->ops(), Dest)) { + // We found the chain we want as an operand of this TokenFactor. + // Essentially, we reach the chain without side-effects if we could + // serialize the TokenFactor into a simple chain of operations with + // Dest as the last operation. This is automatically true if the + // chain has one use: there are no other ordering constraints. + // If the chain has more than one use, we give up: some other + // use of Dest might force a side-effect between Dest and the current + // node. + if (Dest.hasOneUse()) + return true; + } + // Next, try a deep search: check whether every operand of the TokenFactor + // reaches Dest. + return all_of((*this)->ops(), [=](SDValue Op) { + return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); + }); } // Loads don't have side effects, look through them. @@ -7102,11 +7422,6 @@ bool SDNode::hasPredecessor(const SDNode *N) const { return hasPredecessorHelper(N, Visited, Worklist); } -uint64_t SDNode::getConstantOperandVal(unsigned Num) const { - assert(Num < NumOperands && "Invalid child # of SDNode!"); - return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); -} - const SDNodeFlags *SDNode::getFlags() const { if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) return &FlagsNode->Flags; @@ -7377,13 +7692,13 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, unsigned BitPos = j * EltBitSize; if (OpVal.isUndef()) - SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); + SplatUndef.setBits(BitPos, BitPos + EltBitSize); else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) - SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). - zextOrTrunc(sz) << BitPos; + SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltBitSize), + BitPos); else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) - SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos; - else + SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); + else return false; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 996c95bd5f07b..8708f58f1e632 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -84,10 +84,6 @@ LimitFPPrecision("limit-float-precision", cl::location(LimitFloatPrecision), cl::init(0)); -static cl::opt<bool> -EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden, - cl::desc("Enable fast-math-flags for DAG nodes")); - /// Minimum jump table density for normal functions. 
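The isConstantSplat change above builds the candidate splat value by inserting each element's bits at its bit position (APInt::insertBits) rather than shifting and OR-ing by hand. A plain-integer model of that assembly for a <4 x i16>-style vector; the widths and names are illustrative:

#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t assembleSplat(const std::vector<uint16_t> &Elts) {
  uint64_t Splat = 0;
  for (size_t I = 0; I < Elts.size(); ++I)
    Splat |= (uint64_t)Elts[I] << (I * 16); // insertBits(Elt, BitPos)
  return Splat;
}

int main() {
  // Four identical i16 elements produce the element repeated across the wide
  // value, which is exactly what makes the splat test succeed.
  uint64_t S = assembleSplat({0xABCD, 0xABCD, 0xABCD, 0xABCD});
  assert(S == 0xABCDABCDABCDABCDull);
  return 0;
}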
static cl::opt<unsigned> JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden, @@ -634,10 +630,6 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, } } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, @@ -739,10 +731,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V, @@ -796,9 +784,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list. This adds the code marker and includes the number of -/// values added into it. void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, @@ -850,12 +835,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, LPadToCallSiteMap.clear(); } -/// clear - Clear out the current SelectionDAG and the associated -/// state and prepare this SelectionDAGBuilder object to be used -/// for a new block. This doesn't clear out information about -/// additional blocks that are needed to complete switch lowering -/// or PHI node updating; that information is cleared out as it is -/// consumed. void SelectionDAGBuilder::clear() { NodeMap.clear(); UnusedArgNodeMap.clear(); @@ -867,21 +846,10 @@ void SelectionDAGBuilder::clear() { StatepointLowering.clear(); } -/// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is separated from the clear so that debug -/// information that is dangling in a basic block can be properly -/// resolved in a different basic block. This allows the -/// SelectionDAG to resolve dangling debug information attached -/// to PHI nodes. void SelectionDAGBuilder::clearDanglingDebugInfo() { DanglingDebugInfoMap.clear(); } -/// getRoot - Return the current virtual root of the Selection DAG, -/// flushing any PendingLoad items. This must be done before emitting -/// a store or any other node that may need to be ordered after any -/// prior load instructions. -/// SDValue SelectionDAGBuilder::getRoot() { if (PendingLoads.empty()) return DAG.getRoot(); @@ -901,10 +869,6 @@ SDValue SelectionDAGBuilder::getRoot() { return Root; } -/// getControlRoot - Similar to getRoot, but instead of flushing all the -/// PendingLoad items, flush all the PendingExports items. It is necessary -/// to do this before emitting a terminator instruction. 
-/// SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); @@ -937,7 +901,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) { HandlePHINodesInSuccessorBlocks(I.getParent()); } - ++SDNodeOrder; + // Increase the SDNodeOrder if dealing with a non-debug instruction. + if (!isa<DbgInfoIntrinsic>(I)) + ++SDNodeOrder; CurInst = &I; @@ -1403,16 +1369,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const Function *F = I.getParent()->getParent(); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); - bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::InReg); + bool RetInReg = F->getAttributes().hasAttribute( + AttributeList::ReturnIndex, Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; @@ -1582,7 +1548,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, BranchProbability TProb, - BranchProbability FProb) { + BranchProbability FProb, + bool InvertCond) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into @@ -1596,10 +1563,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, isExportableFromCurrentBlock(BOp->getOperand(1), BB))) { ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { - Condition = getICmpCondCode(IC->getPredicate()); + ICmpInst::Predicate Pred = + InvertCond ? IC->getInversePredicate() : IC->getPredicate(); + Condition = getICmpCondCode(Pred); } else { const FCmpInst *FC = cast<FCmpInst>(Cond); - Condition = getFCmpCondCode(FC->getPredicate()); + FCmpInst::Predicate Pred = + InvertCond ? FC->getInversePredicate() : FC->getPredicate(); + Condition = getFCmpCondCode(Pred); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } @@ -1612,7 +1583,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, } // Create a CaseBlock record representing this branch. - CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), + ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ; + CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()), nullptr, TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); } @@ -1625,16 +1597,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TProb, - BranchProbability FProb) { - // If this node is not part of the or/and tree, emit it as a branch. + BranchProbability FProb, + bool InvertCond) { + // Skip over not part of the tree and remember to invert op and operands at + // next level. 
+ if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) { + const Value *CondOp = BinaryOperator::getNotArgument(Cond); + if (InBlock(CondOp, CurBB->getBasicBlock())) { + FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, + !InvertCond); + return; + } + } + const Instruction *BOp = dyn_cast<Instruction>(Cond); + // Compute the effective opcode for Cond, taking into account whether it needs + // to be inverted, e.g. + // and (not (or A, B)), C + // gets lowered as + // and (and (not A, not B), C) + unsigned BOpc = 0; + if (BOp) { + BOpc = BOp->getOpcode(); + if (InvertCond) { + if (BOpc == Instruction::And) + BOpc = Instruction::Or; + else if (BOpc == Instruction::Or) + BOpc = Instruction::And; + } + } + + // If this node is not part of the or/and tree, emit it as a branch. if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || - (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() || + BOpc != Opc || !BOp->hasOneUse() || BOp->getParent() != CurBB->getBasicBlock() || !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, - TProb, FProb); + TProb, FProb, InvertCond); return; } @@ -1669,14 +1669,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: @@ -1702,14 +1702,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } } @@ -1793,7 +1793,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode, getEdgeProbability(BrMBB, Succ0MBB), - getEdgeProbability(BrMBB, Succ1MBB)); + getEdgeProbability(BrMBB, Succ1MBB), + /*InvertCond=*/false); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. 
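The InvertCond threading above lets a 'not' over an and/or tree be folded into the leaves by De Morgan's laws: the comment's example "and (not (or A, B)), C" becomes "and (and (not A, not B), C)", so the merge opcode is swapped and each leaf predicate inverted while the tree still lowers to one chain of branches. A short exhaustive check of that equivalence:

#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B)
      for (int C = 0; C <= 1; ++C)
        assert(((!(A || B)) && C) == (((!A) && (!B)) && C));
  return 0;
}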
@@ -2027,7 +2028,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, Entry.Node = StackSlot; Entry.Ty = FnTy->getParamType(0); if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) - Entry.isInReg = true; + Entry.IsInReg = true; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); @@ -2581,13 +2582,13 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) { Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); Flags.setVectorReduction(vec_redux); - if (EnableFMFInDAG) { - Flags.setAllowReciprocal(FMF.allowReciprocal()); - Flags.setNoInfs(FMF.noInfs()); - Flags.setNoNaNs(FMF.noNaNs()); - Flags.setNoSignedZeros(FMF.noSignedZeros()); - Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); - } + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); + SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, &Flags); setValue(&I, BinNodeValue); @@ -2914,7 +2915,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) { DestVT, N)); // convert types. // Check if the original LLVM IR Operand was a ConstantInt, because getValue() // might fold any kind of constant expression to an integer constant and that - // is not what we are looking for. Only regcognize a bitcast of a genuine + // is not what we are looking for. Only recognize a bitcast of a genuine // constant integer as an opaque constant. else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0))) setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false, @@ -3067,14 +3068,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { if (SrcNumElts > MaskNumElts) { // Analyze the access pattern of the vector to see if we can extract - // two subvectors and do the shuffle. The analysis is done by calculating - // the range of elements the mask access on both vectors. - int MinRange[2] = { static_cast<int>(SrcNumElts), - static_cast<int>(SrcNumElts)}; - int MaxRange[2] = {-1, -1}; - - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + // two subvectors and do the shuffle. + int StartIdx[2] = { -1, -1 }; // StartIdx to extract from + bool CanExtract = true; + for (int Idx : Mask) { unsigned Input = 0; if (Idx < 0) continue; @@ -3083,41 +3080,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Input = 1; Idx -= SrcNumElts; } - if (Idx > MaxRange[Input]) - MaxRange[Input] = Idx; - if (Idx < MinRange[Input]) - MinRange[Input] = Idx; - } - - // Check if the access is smaller than the vector size and can we find - // a reasonable extract index. - int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not - // Extract. - int StartIdx[2]; // StartIdx to extract from - for (unsigned Input = 0; Input < 2; ++Input) { - if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { - RangeUse[Input] = 0; // Unused - StartIdx[Input] = 0; - continue; - } - // Find a good start index that is a multiple of the mask length. Then - // see if the rest of the elements are in range. - StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; - if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && - StartIdx[Input] + MaskNumElts <= SrcNumElts) - RangeUse[Input] = 1; // Extract from a multiple of the mask length. 
+ // If all the indices come from the same MaskNumElts sized portion of + // the sources we can use extract. Also make sure the extract wouldn't + // extract past the end of the source. + int NewStartIdx = alignDown(Idx, MaskNumElts); + if (NewStartIdx + MaskNumElts > SrcNumElts || + (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx)) + CanExtract = false; + // Make sure we always update StartIdx as we use it to track if all + // elements are undef. + StartIdx[Input] = NewStartIdx; } - if (RangeUse[0] == 0 && RangeUse[1] == 0) { + if (StartIdx[0] < 0 && StartIdx[1] < 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } - if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { + if (CanExtract) { // Extract appropriate subvector and generate a vector shuffle for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? Src1 : Src2; - if (RangeUse[Input] == 0) + if (StartIdx[Input] < 0) Src = DAG.getUNDEF(VT); else { Src = DAG.getNode( @@ -3128,16 +3112,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { } // Calculate new mask. - SmallVector<int, 8> MappedOps; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; - if (Idx >= 0) { - if (Idx < (int)SrcNumElts) - Idx -= StartIdx[0]; - else - Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; - } - MappedOps.push_back(Idx); + SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end()); + for (int &Idx : MappedOps) { + if (Idx >= (int)SrcNumElts) + Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; + else if (Idx >= 0) + Idx -= StartIdx[0]; } setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps)); @@ -3151,8 +3131,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { EVT EltVT = VT.getVectorElementType(); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector<SDValue,8> Ops; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + for (int Idx : Mask) { SDValue Res; if (Idx < 0) { @@ -3281,7 +3260,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); - // In an inbouds GEP with an offset that is nonnegative even when + // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. SDNodeFlags Flags; if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) @@ -4752,7 +4731,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( else FuncInfo.ArgDbgValues.push_back( BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(Offset) .addMetadata(Variable) .addMetadata(Expr)); @@ -4764,7 +4743,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, DILocalVariable *Variable, DIExpression *Expr, int64_t Offset, - DebugLoc dl, + const DebugLoc &dl, unsigned DbgSDNodeOrder) { SDDbgValue *SDV; auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode()); @@ -4794,9 +4773,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, # define setjmp_undefined_for_msvc #endif -/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If -/// we want to emit this as a call to a named external function, return the name -/// otherwise lower it and return null. +/// Lower the call to the specified intrinsic function. If we want to emit this +/// as a call to a named external function, return the name. Otherwise, lower it +/// and return null. 
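The rewritten shuffle analysis above accepts an extract only if, for each source, every used mask index lands in a single window of MaskNumElts elements that is aligned to a multiple of MaskNumElts and does not run past the end of the source. A simplified standalone model of that test; it returns early on failure instead of tracking CanExtract, and the names are illustrative:

#include <cassert>
#include <vector>

static bool canExtractWindows(const std::vector<int> &Mask,
                              unsigned SrcNumElts, unsigned MaskNumElts,
                              int StartIdx[2]) {
  StartIdx[0] = StartIdx[1] = -1;
  for (int Idx : Mask) {
    if (Idx < 0)
      continue;                          // undef lane, ignore
    unsigned Input = 0;
    if ((unsigned)Idx >= SrcNumElts) {   // index into the second source
      Input = 1;
      Idx -= SrcNumElts;
    }
    int NewStart = (Idx / (int)MaskNumElts) * (int)MaskNumElts; // alignDown
    if (NewStart + (int)MaskNumElts > (int)SrcNumElts ||
        (StartIdx[Input] >= 0 && StartIdx[Input] != NewStart))
      return false;                      // out of range or conflicting window
    StartIdx[Input] = NewStart;
  }
  return true;
}

int main() {
  int Start[2];
  // 8-element sources, 4-element mask: lanes 4..7 of source 0 and lanes
  // 8..11 (i.e. 0..3 of source 1) each fit one aligned window.
  assert(canExtractWindows({4, 5, 10, 11}, 8, 4, Start));
  assert(Start[0] == 4 && Start[1] == 0);

  // Lanes 3 and 4 of source 0 straddle two windows, so no extract is formed.
  assert(!canExtractWindows({3, 4, -1, -1}, 8, 4, Start));
  return 0;
}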
const char * SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -4929,14 +4908,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl) - .setChain(getRoot()) - .setCallee(TLI.getLibcallCallingConv(LibraryCall), - Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol( - TLI.getLibcallName(LibraryCall), - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( + TLI.getLibcallCallingConv(LibraryCall), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); DAG.setRoot(CallResult.second); @@ -5301,6 +5278,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return nullptr; + case Intrinsic::experimental_constrained_fadd: + case Intrinsic::experimental_constrained_fsub: + case Intrinsic::experimental_constrained_fmul: + case Intrinsic::experimental_constrained_fdiv: + case Intrinsic::experimental_constrained_frem: + visitConstrainedFPIntrinsic(I, Intrinsic); + return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -5537,7 +5521,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::trap: { StringRef TrapFuncName = I.getAttributes() - .getAttribute(AttributeSet::FunctionIndex, "trap-func-name") + .getAttribute(AttributeList::FunctionIndex, "trap-func-name") .getValueAsString(); if (TrapFuncName.empty()) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? @@ -5548,7 +5532,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { TargetLowering::ArgListTy Args; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee( + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( CallingConv::C, I.getType(), DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy(DAG.getDataLayout())), @@ -5749,6 +5733,46 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } } +void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, + unsigned Intrinsic) { + SDLoc sdl = getCurSDLoc(); + unsigned Opcode; + switch (Intrinsic) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::experimental_constrained_fadd: + Opcode = ISD::STRICT_FADD; + break; + case Intrinsic::experimental_constrained_fsub: + Opcode = ISD::STRICT_FSUB; + break; + case Intrinsic::experimental_constrained_fmul: + Opcode = ISD::STRICT_FMUL; + break; + case Intrinsic::experimental_constrained_fdiv: + Opcode = ISD::STRICT_FDIV; + break; + case Intrinsic::experimental_constrained_frem: + Opcode = ISD::STRICT_FREM; + break; + } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Chain = getRoot(); + SDValue Ops[3] = { Chain, getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)) }; + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); + ValueVTs.push_back(MVT::Other); // Out chain + + SDVTList VTs = DAG.getVTList(ValueVTs); + SDValue Result = DAG.getNode(Opcode, sdl, VTs, Ops); + + assert(Result.getNode()->getNumValues() == 2); + SDValue OutChain = Result.getValue(1); + DAG.setRoot(OutChain); + SDValue FPResult = Result.getValue(0); + setValue(&I, FPResult); +} + std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, const BasicBlock *EHPadBB) { @@ -5827,7 +5851,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Type *RetTy = CS.getType(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; Args.reserve(CS.arg_size()); const Value *SwiftErrorVal = nullptr; @@ -5843,6 +5866,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { + TargetLowering::ArgListEntry Entry; const Value *V = *i; // Skip empty types @@ -5852,11 +5876,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue ArgNode = getValue(V); Entry.Node = ArgNode; Entry.Ty = V->getType(); - // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); // Use swifterror virtual register as input to the call. - if (Entry.isSwiftError && TLI.supportSwiftError()) { + if (Entry.IsSwiftError && TLI.supportSwiftError()) { SwiftErrorVal = V; // We find the virtual register for the actual swifterror argument. // Instead of using the Value, we use the virtual register instead. @@ -5869,7 +5892,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. - if (Entry.isSRet && isa<Instruction>(V)) + if (Entry.IsSRet && isa<Instruction>(V)) isTailCall = false; } @@ -5912,8 +5935,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, } } -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. +/// Return true if it only matters that the value is equal or not-equal to zero. static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { for (const User *U : V->users()) { if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -5928,13 +5950,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { } static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, - Type *LoadTy, SelectionDAGBuilder &Builder) { // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. 
if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) { // Cast pointer to the type we really want to load. + Type *LoadTy = + Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); + if (LoadVT.isVector()) + LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements()); + LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput), PointerType::getUnqual(LoadTy)); @@ -5967,8 +5993,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, return LoadVal; } -/// processIntegerCallValue - Record the value for an instruction that -/// produces an integer result, converting the type where necessary. +/// Record the value for an instruction that produces an integer result, +/// converting the type where necessary. void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { @@ -5981,20 +6007,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, setValue(&I, Value); } -/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a memcmp call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); - if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || - !I.getArgOperand(2)->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; - const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast<ConstantInt>(Size); if (CSize && CSize->getZExtValue() == 0) { @@ -6005,11 +6024,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { } const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); - std::pair<SDValue, SDValue> Res = - TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), - getValue(LHS), getValue(RHS), getValue(Size), - MachinePointerInfo(LHS), - MachinePointerInfo(RHS)); + std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( + DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), + getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); @@ -6018,88 +6035,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 - if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) { - bool ActuallyDoIt = true; - MVT LoadVT; - Type *LoadTy; - switch (CSize->getZExtValue()) { - default: - LoadVT = MVT::Other; - LoadTy = nullptr; - ActuallyDoIt = false; - break; - case 2: - LoadVT = MVT::i16; - LoadTy = Type::getInt16Ty(CSize->getContext()); - break; - case 4: - LoadVT = MVT::i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - break; - case 8: - LoadVT = MVT::i64; - LoadTy = Type::getInt64Ty(CSize->getContext()); - break; - /* - case 16: - LoadVT = MVT::v4i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - LoadTy = VectorType::get(LoadTy, 4); - break; - */ - } - - // 
This turns into unaligned loads. We only do this if the target natively - // supports the MVT we'll be loading or if it is small enough (<= 4) that - // we'll only produce a small number of byte loads. + if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I)) + return false; - // Require that we can find a legal MVT, and only do this if the target - // supports unaligned loads of that type. Expanding into byte loads would - // bloat the code. + // If the target has a fast compare for the given size, it will return a + // preferred load type for that size. Require that the load VT is legal and + // that the target supports unaligned loads of that type. Otherwise, return + // INVALID. + auto hasFastLoadsAndCompare = [&](unsigned NumBits) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (ActuallyDoIt && CSize->getZExtValue() > 4) { - unsigned DstAS = LHS->getType()->getPointerAddressSpace(); - unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + MVT LVT = TLI.hasFastEqualityCompare(NumBits); + if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) { // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. // TODO: Check alignment of src and dest ptrs. - if (!TLI.isTypeLegal(LoadVT) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) - ActuallyDoIt = false; + unsigned DstAS = LHS->getType()->getPointerAddressSpace(); + unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + if (!TLI.isTypeLegal(LVT) || + !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) || + !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS)) + LVT = MVT::INVALID_SIMPLE_VALUE_TYPE; } - if (ActuallyDoIt) { - SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); - SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); + return LVT; + }; - SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, - ISD::SETNE); - processIntegerCallValue(I, Res, false); - return true; - } + // This turns into unaligned loads. We only do this if the target natively + // supports the MVT we'll be loading or if it is small enough (<= 4) that + // we'll only produce a small number of byte loads. + MVT LoadVT; + unsigned NumBitsToCompare = CSize->getZExtValue() * 8; + switch (NumBitsToCompare) { + default: + return false; + case 16: + LoadVT = MVT::i16; + break; + case 32: + LoadVT = MVT::i32; + break; + case 64: + case 128: + case 256: + LoadVT = hasFastLoadsAndCompare(NumBitsToCompare); + break; } + if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE) + return false; - return false; + SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this); + SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this); + + // Bitcast to a wide integer type if the loads are vectors. + if (LoadVT.isVector()) { + EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits()); + LoadL = DAG.getBitcast(CmpVT, LoadL); + LoadR = DAG.getBitcast(CmpVT, LoadR); + } + + SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE); + processIntegerCallValue(I, Cmp, false); + return true; } -/// visitMemChrCall -- See if we can lower a memchr call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a memchr call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. 
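The 64-, 128- and 256-bit cases are now driven by the hasFastEqualityCompare target hook queried in the lambda above rather than by a fixed table. A minimal sketch of how a target might opt in; MyTargetLowering and the SSE2 query stand in for a real target and are only assumptions for the example:

    MVT MyTargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
      if (NumBits == 64)
        return MVT::i64;                      // a plain GPR load and compare is cheap
      if (NumBits == 128 && Subtarget.hasSSE2())
        return MVT::v16i8;                    // vector loads; the builder bitcasts to i128 for the setcc
      return MVT::INVALID_SIMPLE_VALUE_TYPE;  // everything else keeps the memcmp libcall
    }
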
+/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { - // Verify that the prototype makes sense. void *memchr(void *, int, size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *Src = I.getArgOperand(0); const Value *Char = I.getArgOperand(1); const Value *Length = I.getArgOperand(2); - if (!Src->getType()->isPointerTy() || - !Char->getType()->isIntegerTy() || - !Length->getType()->isIntegerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6115,15 +6123,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { return false; } -/// -/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to -/// to adjust the dst pointer by the size of the copied memory. +/// See if we can lower a mempcpy call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { - - // Verify argument count: void *mempcpy(void *, const void *, size_t) - if (I.getNumArgOperands() != 3) - return false; - SDValue Dst = getValue(I.getArgOperand(0)); SDValue Src = getValue(I.getArgOperand(1)); SDValue Size = getValue(I.getArgOperand(2)); @@ -6158,19 +6163,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { return true; } -/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an -/// optimized form. If so, return true and lower it, otherwise return false -/// and it will be lowered like a normal call. +/// See if we can lower a strcpy call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { - // Verify that the prototype makes sense. char *strcpy(char *, char *) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6187,19 +6186,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { return false; } -/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a strcmp call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. 
int strcmp(void*,void*) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6216,17 +6209,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { return false; } -/// visitStrLenCall -- See if we can lower a strlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { - // Verify that the prototype makes sense. size_t strlen(char *) - if (I.getNumArgOperands() != 1) - return false; - const Value *Arg0 = I.getArgOperand(0); - if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6241,19 +6230,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { return false; } -/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strnlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { - // Verify that the prototype makes sense. size_t strnlen(char *, size_t) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6269,16 +6252,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { return false; } -/// visitUnaryFloatCall - If a call instruction is a unary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a unary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it, otherwise return +/// false and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a unary floating-point call. - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. 
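What remains in the unary and binary float visitors is only the side-effect check: a libm call that may write errno cannot become a pure ISD node. A hedged source-level example, assuming a typical clang configuration (the function below is invented):

    #include <cmath>

    double root(double X) {
      // Unless math-errno handling is disabled (e.g. -fno-math-errno), the sqrt
      // call is not marked as only reading memory, so visitUnaryFloatCall returns
      // false and an ordinary libcall is emitted instead of ISD::FSQRT.
      return std::sqrt(X);
    }
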
+ if (!I.onlyReadsMemory()) return false; SDValue Tmp = getValue(I.getArgOperand(0)); @@ -6286,17 +6268,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, return true; } -/// visitBinaryFloatCall - If a call instruction is a binary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a binary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it. Otherwise return +/// false, and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a binary floating-point call. - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. + if (!I.onlyReadsMemory()) return false; SDValue Tmp0 = getValue(I.getArgOperand(0)); @@ -6336,20 +6316,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for // some reason. - LibFunc::Func Func; + LibFunc Func; if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && - LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - if (I.getNumArgOperands() == 2 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.getType() == I.getArgOperand(1)->getType() && - I.onlyReadsMemory()) { + case LibFunc_copysign: + case LibFunc_copysignf: + case LibFunc_copysignl: + // We already checked this call's prototype; verify it doesn't modify + // errno. 
+ if (I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), @@ -6357,122 +6335,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { return; } break; - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: + case LibFunc_fabs: + case LibFunc_fabsf: + case LibFunc_fabsl: if (visitUnaryFloatCall(I, ISD::FABS)) return; break; - case LibFunc::fmin: - case LibFunc::fminf: - case LibFunc::fminl: + case LibFunc_fmin: + case LibFunc_fminf: + case LibFunc_fminl: if (visitBinaryFloatCall(I, ISD::FMINNUM)) return; break; - case LibFunc::fmax: - case LibFunc::fmaxf: - case LibFunc::fmaxl: + case LibFunc_fmax: + case LibFunc_fmaxf: + case LibFunc_fmaxl: if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: + case LibFunc_sin: + case LibFunc_sinf: + case LibFunc_sinl: if (visitUnaryFloatCall(I, ISD::FSIN)) return; break; - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: + case LibFunc_cos: + case LibFunc_cosf: + case LibFunc_cosl: if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; - case LibFunc::sqrt: - case LibFunc::sqrtf: - case LibFunc::sqrtl: - case LibFunc::sqrt_finite: - case LibFunc::sqrtf_finite: - case LibFunc::sqrtl_finite: + case LibFunc_sqrt: + case LibFunc_sqrtf: + case LibFunc_sqrtl: + case LibFunc_sqrt_finite: + case LibFunc_sqrtf_finite: + case LibFunc_sqrtl_finite: if (visitUnaryFloatCall(I, ISD::FSQRT)) return; break; - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: + case LibFunc_floor: + case LibFunc_floorf: + case LibFunc_floorl: if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; break; - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: + case LibFunc_nearbyint: + case LibFunc_nearbyintf: + case LibFunc_nearbyintl: if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; break; - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: + case LibFunc_ceil: + case LibFunc_ceilf: + case LibFunc_ceill: if (visitUnaryFloatCall(I, ISD::FCEIL)) return; break; - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: + case LibFunc_rint: + case LibFunc_rintf: + case LibFunc_rintl: if (visitUnaryFloatCall(I, ISD::FRINT)) return; break; - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: + case LibFunc_round: + case LibFunc_roundf: + case LibFunc_roundl: if (visitUnaryFloatCall(I, ISD::FROUND)) return; break; - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: + case LibFunc_trunc: + case LibFunc_truncf: + case LibFunc_truncl: if (visitUnaryFloatCall(I, ISD::FTRUNC)) return; break; - case LibFunc::log2: - case LibFunc::log2f: - case LibFunc::log2l: + case LibFunc_log2: + case LibFunc_log2f: + case LibFunc_log2l: if (visitUnaryFloatCall(I, ISD::FLOG2)) return; break; - case LibFunc::exp2: - case LibFunc::exp2f: - case LibFunc::exp2l: + case LibFunc_exp2: + case LibFunc_exp2f: + case LibFunc_exp2l: if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; - case LibFunc::memcmp: + case LibFunc_memcmp: if (visitMemCmpCall(I)) return; break; - case LibFunc::mempcpy: + case LibFunc_mempcpy: if (visitMemPCpyCall(I)) return; break; - case LibFunc::memchr: + case LibFunc_memchr: if (visitMemChrCall(I)) return; break; - case LibFunc::strcpy: + case LibFunc_strcpy: if (visitStrCpyCall(I, false)) return; break; - case LibFunc::stpcpy: + case LibFunc_stpcpy: if (visitStrCpyCall(I, 
true)) return; break; - case LibFunc::strcmp: + case LibFunc_strcmp: if (visitStrCmpCall(I)) return; break; - case LibFunc::strlen: + case LibFunc_strlen: if (visitStrLenCall(I)) return; break; - case LibFunc::strnlen: + case LibFunc_strnlen: if (visitStrNLenCall(I)) return; break; @@ -7361,7 +7339,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( // Populate the argument list. // Attributes for args start at offset 1, after the return attribute. - for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { const Value *V = CS->getOperand(ArgI); @@ -7370,7 +7348,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo( TargetLowering::ArgListEntry Entry; Entry.Node = getValue(V); Entry.Ty = V->getType(); - Entry.setAttributes(&CS, AttrI); + Entry.setAttributes(&CS, ArgIdx); Args.push_back(Entry); } @@ -7631,9 +7609,9 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } -/// Returns an AttributeSet representing the attributes applied to the return +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. -static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -7642,8 +7620,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -7683,15 +7661,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ArgListEntry Entry; Entry.Node = DemoteStackSlot; Entry.Ty = StackSlotPtrType; - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isInReg = false; - Entry.isSRet = true; - Entry.isNest = false; - Entry.isByVal = false; - Entry.isReturned = false; - Entry.isSwiftSelf = false; - Entry.isSwiftError = false; + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsInReg = false; + Entry.IsSRet = true; + Entry.IsNest = false; + Entry.IsByVal = false; + Entry.IsReturned = false; + Entry.IsSwiftSelf = false; + Entry.IsSwiftError = false; Entry.Alignment = Align; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); @@ -7724,7 +7702,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { for (unsigned i = 0, e = Args.size(); i != e; ++i) { - if (Args[i].isSwiftError) { + if (Args[i].IsSwiftError) { ISD::InputArg MyFlags; MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); @@ -7741,7 +7719,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); Type *FinalType = Args[i].Ty; - if (Args[i].isByVal) + if (Args[i].IsByVal) FinalType = cast<PointerType>(Args[i].Ty)->getElementType(); bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg); @@ -7754,11 +7732,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ISD::ArgFlagsTy 
Flags; unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - if (Args[i].isZExt) + if (Args[i].IsZExt) Flags.setZExt(); - if (Args[i].isSExt) + if (Args[i].IsSExt) Flags.setSExt(); - if (Args[i].isInReg) { + if (Args[i].IsInReg) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (CLI.CallConv == CallingConv::X86_VectorCall && @@ -7771,15 +7749,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Set InReg Flag Flags.setInReg(); } - if (Args[i].isSRet) + if (Args[i].IsSRet) Flags.setSRet(); - if (Args[i].isSwiftSelf) + if (Args[i].IsSwiftSelf) Flags.setSwiftSelf(); - if (Args[i].isSwiftError) + if (Args[i].IsSwiftError) Flags.setSwiftError(); - if (Args[i].isByVal) + if (Args[i].IsByVal) Flags.setByVal(); - if (Args[i].isInAlloca) { + if (Args[i].IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -7788,7 +7766,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // in the various CC lowering callbacks. Flags.setByVal(); } - if (Args[i].isByVal || Args[i].isInAlloca) { + if (Args[i].IsByVal || Args[i].IsInAlloca) { PointerType *Ty = cast<PointerType>(Args[i].Ty); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); @@ -7801,7 +7779,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { FrameAlign = getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } - if (Args[i].isNest) + if (Args[i].IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); @@ -7812,13 +7790,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (Args[i].isSExt) + if (Args[i].IsSExt) ExtendKind = ISD::SIGN_EXTEND; - else if (Args[i].isZExt) + else if (Args[i].IsZExt) ExtendKind = ISD::ZERO_EXTEND; // Conservatively only handle 'returned' on non-vectors for now - if (Args[i].isReturned && !Op.getValueType().isVector()) { + if (Args[i].IsReturned && !Op.getValueType().isVector()) { assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that @@ -7832,9 +7810,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // parameter extension method is not compatible with the return // extension method if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || - (ExtendKind != ISD::ANY_EXTEND && - CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt)) - Flags.setReturned(); + (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt && + CLI.RetZExt == Args[i].IsZExt)) + Flags.setReturned(); } getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, @@ -8010,6 +7988,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { return true; } +typedef DenseMap<const Argument *, + std::pair<const AllocaInst *, const StoreInst *>> + ArgCopyElisionMapTy; + +/// Scan the entry block of the function in FuncInfo for arguments that look +/// like copies into a local alloca. Record any copied arguments in +/// ArgCopyElisionCandidates. 
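The scan defined below targets the idiom front ends emit for by-value aggregates: an entry-block static alloca fully initialized by one store of an incoming argument. A hedged example of source that produces the pattern (the struct and function are invented, and the stack-passing behaviour assumes a target ABI that passes such an aggregate in memory):

    struct Big { long V[4]; };

    long firstField(Big B) {  // 'B' arrives in a caller-allocated fixed stack object
      // The front end creates a local alloca for 'B' and copies the argument into
      // it with a single store; when nothing else writes or escapes that alloca,
      // the copy can be elided by reusing the argument's stack slot directly.
      return B.V[0];
    }
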
+static void +findArgumentCopyElisionCandidates(const DataLayout &DL, + FunctionLoweringInfo *FuncInfo, + ArgCopyElisionMapTy &ArgCopyElisionCandidates) { + // Record the state of every static alloca used in the entry block. Argument + // allocas are all used in the entry block, so we need approximately as many + // entries as we have arguments. + enum StaticAllocaInfo { Unknown, Clobbered, Elidable }; + SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas; + unsigned NumArgs = FuncInfo->Fn->arg_size(); + StaticAllocas.reserve(NumArgs * 2); + + auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * { + if (!V) + return nullptr; + V = V->stripPointerCasts(); + const auto *AI = dyn_cast<AllocaInst>(V); + if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI)) + return nullptr; + auto Iter = StaticAllocas.insert({AI, Unknown}); + return &Iter.first->second; + }; + + // Look for stores of arguments to static allocas. Look through bitcasts and + // GEPs to handle type coercions, as long as the alloca is fully initialized + // by the store. Any non-store use of an alloca escapes it and any subsequent + // unanalyzed store might write it. + // FIXME: Handle structs initialized with multiple stores. + for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) { + // Look for stores, and handle non-store uses conservatively. + const auto *SI = dyn_cast<StoreInst>(&I); + if (!SI) { + // We will look through cast uses, so ignore them completely. + if (I.isCast()) + continue; + // Ignore debug info intrinsics, they don't escape or store to allocas. + if (isa<DbgInfoIntrinsic>(I)) + continue; + // This is an unknown instruction. Assume it escapes or writes to all + // static alloca operands. + for (const Use &U : I.operands()) { + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U)) + *Info = StaticAllocaInfo::Clobbered; + } + continue; + } + + // If the stored value is a static alloca, mark it as escaped. + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand())) + *Info = StaticAllocaInfo::Clobbered; + + // Check if the destination is a static alloca. + const Value *Dst = SI->getPointerOperand()->stripPointerCasts(); + StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst); + if (!Info) + continue; + const AllocaInst *AI = cast<AllocaInst>(Dst); + + // Skip allocas that have been initialized or clobbered. + if (*Info != StaticAllocaInfo::Unknown) + continue; + + // Check if the stored value is an argument, and that this store fully + // initializes the alloca. Don't elide copies from the same argument twice. + const Value *Val = SI->getValueOperand()->stripPointerCasts(); + const auto *Arg = dyn_cast<Argument>(Val); + if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || + Arg->getType()->isEmptyTy() || + DL.getTypeStoreSize(Arg->getType()) != + DL.getTypeAllocSize(AI->getAllocatedType()) || + ArgCopyElisionCandidates.count(Arg)) { + *Info = StaticAllocaInfo::Clobbered; + continue; + } + + DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n'); + + // Mark this alloca and store for argument copy elision. + *Info = StaticAllocaInfo::Elidable; + ArgCopyElisionCandidates.insert({Arg, {AI, SI}}); + + // Stop scanning if we've seen all arguments. This will happen early in -O0 + // builds, which is useful, because -O0 builds have large entry blocks and + // many allocas. + if (ArgCopyElisionCandidates.size() == NumArgs) + break; + } +} + +/// Try to elide argument copies from memory into a local alloca. 
Succeeds if +/// ArgVal is a load from a suitable fixed stack object. +static void tryToElideArgumentCopy( + FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains, + DenseMap<int, int> &ArgCopyElisionFrameIndexMap, + SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs, + ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg, + SDValue ArgVal, bool &ArgHasUses) { + // Check if this is a load from a fixed stack object. + auto *LNode = dyn_cast<LoadSDNode>(ArgVal); + if (!LNode) + return; + auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()); + if (!FINode) + return; + + // Check that the fixed stack object is the right size and alignment. + // Look at the alignment that the user wrote on the alloca instead of looking + // at the stack object. + auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg); + assert(ArgCopyIter != ArgCopyElisionCandidates.end()); + const AllocaInst *AI = ArgCopyIter->second.first; + int FixedIndex = FINode->getIndex(); + int &AllocaIndex = FuncInfo->StaticAllocaMap[AI]; + int OldIndex = AllocaIndex; + MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo(); + if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) { + DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack " + "object size\n"); + return; + } + unsigned RequiredAlignment = AI->getAlignment(); + if (!RequiredAlignment) { + RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment( + AI->getAllocatedType()); + } + if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) { + DEBUG(dbgs() << " argument copy elision failed: alignment of alloca " + "greater than stack argument alignment (" + << RequiredAlignment << " vs " + << MFI.getObjectAlignment(FixedIndex) << ")\n"); + return; + } + + // Perform the elision. Delete the old stack object and replace its only use + // in the variable info map. Mark the stack object as mutable. + DEBUG({ + dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n' + << " Replacing frame index " << OldIndex << " with " << FixedIndex + << '\n'; + }); + MFI.RemoveStackObject(OldIndex); + MFI.setIsImmutableObjectIndex(FixedIndex, false); + AllocaIndex = FixedIndex; + ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex}); + Chains.push_back(ArgVal.getValue(1)); + + // Avoid emitting code for the store implementing the copy. + const StoreInst *SI = ArgCopyIter->second.second; + ElidedArgCopyInstrs.insert(SI); + + // Check for uses of the argument again so that we can avoid exporting ArgVal + // if it is't used by anything other than the store. + for (const Value *U : Arg.users()) { + if (U != SI) { + ArgHasUses = true; + break; + } + } +} + void SelectionDAGISel::LowerArguments(const Function &F) { SelectionDAG &DAG = SDB->DAG; SDLoc dl = SDB->getCurSDLoc(); @@ -8032,15 +8177,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) { Ins.push_back(RetArg); } + // Look for stores of arguments to static allocas. Mark such arguments with a + // flag to ask the target to give us the memory location of that argument if + // available. + ArgCopyElisionMapTy ArgCopyElisionCandidates; + findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates); + // Set up the incoming argument description vector. 
- unsigned Idx = 1; - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I, ++Idx) { + unsigned Idx = 0; + for (const Argument &Arg : F.args()) { + ++Idx; SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); - bool isArgValueUsed = !I->use_empty(); + ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); + bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; - Type *FinalType = I->getType(); + Type *FinalType = Arg.getType(); if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) FinalType = cast<PointerType>(FinalType)->getElementType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( @@ -8060,7 +8211,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (F.getCallingConv() == CallingConv::X86_VectorCall && - isa<StructType>(I->getType())) { + isa<StructType>(Arg.getType())) { // The first value of a structure is marked if (0 == Value) Flags.setHvaStart(); @@ -8092,7 +8243,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { Flags.setByVal(); } if (Flags.isByVal() || Flags.isInAlloca()) { - PointerType *Ty = cast<PointerType>(I->getType()); + PointerType *Ty = cast<PointerType>(Arg.getType()); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if @@ -8109,6 +8260,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { if (NeedsRegBlock) Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); + if (ArgCopyElisionCandidates.count(&Arg)) + Flags.setCopyElisionCandidate(); MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT); @@ -8155,7 +8308,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set up the argument values. unsigned i = 0; - Idx = 1; + Idx = 0; if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. @@ -8181,25 +8334,39 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ++i; } - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; - ++I, ++Idx) { + SmallVector<SDValue, 4> Chains; + DenseMap<int, int> ArgCopyElisionFrameIndexMap; + for (const Argument &Arg : F.args()) { + ++Idx; SmallVector<SDValue, 4> ArgValues; SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); + ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); + if (NumValues == 0) + continue; + + bool ArgHasUses = !Arg.use_empty(); + + // Elide the copying store if the target loaded this argument from a + // suitable fixed stack object. + if (Ins[i].Flags.isCopyElisionCandidate()) { + tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap, + ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg, + InVals[i], ArgHasUses); + } // If this argument is unused then remember its value. It is used to generate // debugging information. 
bool isSwiftErrorArg = TLI->supportSwiftError() && F.getAttributes().hasAttribute(Idx, Attribute::SwiftError); - if (I->use_empty() && NumValues && !isSwiftErrorArg) { - SDB->setUnusedArgValue(&*I, InVals[i]); + if (!ArgHasUses && !isSwiftErrorArg) { + SDB->setUnusedArgValue(&Arg, InVals[i]); // Also remember any frame index for use in FastISel. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(InVals[i].getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { @@ -8210,16 +8377,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Even an apparant 'unused' swifterror argument needs to be returned. So // we do generate a copy for it that can be used on return from the // function. - if (!I->use_empty() || isSwiftErrorArg) { + if (ArgHasUses || isSwiftErrorArg) { Optional<ISD::NodeType> AssertOp; if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) AssertOp = ISD::AssertSext; else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) AssertOp = ISD::AssertZext; - ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], - NumParts, PartVT, VT, - nullptr, AssertOp)); + ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, + PartVT, VT, nullptr, AssertOp)); } i += NumParts; @@ -8232,18 +8398,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); - SDB->setValue(&*I, Res); + SDB->setValue(&Arg, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } // Update the SwiftErrorVRegDefMap. @@ -8263,18 +8429,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // uses with vregs. unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - FuncInfo->ValueMap[&*I] = Reg; + FuncInfo->ValueMap[&Arg] = Reg; continue; } } - if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { - FuncInfo->InitializeRegForValue(&*I); - SDB->CopyToExportRegsIfNeeded(&*I); + if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) { + FuncInfo->InitializeRegForValue(&Arg); + SDB->CopyToExportRegsIfNeeded(&Arg); } } + if (!Chains.empty()) { + Chains.push_back(NewRoot); + NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + } + + DAG.setRoot(NewRoot); + assert(i == InVals.size() && "Argument register count mismatch!"); + // If any argument copy elisions occurred and we have debug info, update the + // stale frame indices used in the dbg.declare variable info table. 
+ MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); + if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { + for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { + auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); + if (I != ArgCopyElisionFrameIndexMap.end()) + VI.Slot = I->second; + } + } + // Finally, if the target has anything special to do, allow it to do so. EmitFunctionEntryCode(); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index abde8a89befce..c6acc09b66028 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -616,33 +616,27 @@ public: void init(GCFunctionInfo *gfi, AliasAnalysis &aa, const TargetLibraryInfo *li); - /// clear - Clear out the current SelectionDAG and the associated - /// state and prepare this SelectionDAGBuilder object to be used - /// for a new block. This doesn't clear out information about - /// additional blocks that are needed to complete switch lowering - /// or PHI node updating; that information is cleared out as it is - /// consumed. + /// Clear out the current SelectionDAG and the associated state and prepare + /// this SelectionDAGBuilder object to be used for a new block. This doesn't + /// clear out information about additional blocks that are needed to complete + /// switch lowering or PHI node updating; that information is cleared out as + /// it is consumed. void clear(); - /// clearDanglingDebugInfo - Clear the dangling debug information - /// map. This function is separated from the clear so that debug - /// information that is dangling in a basic block can be properly - /// resolved in a different basic block. This allows the - /// SelectionDAG to resolve dangling debug information attached - /// to PHI nodes. + /// Clear the dangling debug information map. This function is separated from + /// the clear so that debug information that is dangling in a basic block can + /// be properly resolved in a different basic block. This allows the + /// SelectionDAG to resolve dangling debug information attached to PHI nodes. void clearDanglingDebugInfo(); - /// getRoot - Return the current virtual root of the Selection DAG, - /// flushing any PendingLoad items. This must be done before emitting - /// a store or any other node that may need to be ordered after any - /// prior load instructions. - /// + /// Return the current virtual root of the Selection DAG, flushing any + /// PendingLoad items. This must be done before emitting a store or any other + /// node that may need to be ordered after any prior load instructions. SDValue getRoot(); - /// getControlRoot - Similar to getRoot, but instead of flushing all the - /// PendingLoad items, flush all the PendingExports items. It is necessary - /// to do this before emitting a terminator instruction. - /// + /// Similar to getRoot, but instead of flushing all the PendingLoad items, + /// flush all the PendingExports items. It is necessary to do this before + /// emitting a terminator instruction. 
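A short usage sketch of the two roots documented here (the getControlRoot declaration follows below); the store operands are placeholders for the example:

    SDValue LoadOrderedChain = getRoot();   // flushes PendingLoads before the store
    SDValue Store = DAG.getStore(LoadOrderedChain, dl, Val, Ptr, PtrInfo);
    DAG.setRoot(Store);
    // ...
    DAG.setRoot(getControlRoot());          // flushes PendingExports before a terminator
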
SDValue getControlRoot(); SDLoc getCurSDLoc() const { @@ -688,12 +682,13 @@ public: MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TW, - BranchProbability FW); + BranchProbability FW, bool InvertCond); void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - BranchProbability TW, BranchProbability FW); + BranchProbability TW, BranchProbability FW, + bool InvertCond); bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases); bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB); void CopyToExportRegsIfNeeded(const Value *V); @@ -900,6 +895,7 @@ private: void visitInlineAsm(ImmutableCallSite CS); const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); + void visitConstrainedFPIntrinsic(const CallInst &I, unsigned Intrinsic); void visitVAStart(const CallInst &I); void visitVAArg(const VAArgInst &I); @@ -944,8 +940,8 @@ private: /// Return the appropriate SDDbgValue based on N. SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable, - DIExpression *Expr, int64_t Offset, DebugLoc dl, - unsigned DbgSDNodeOrder); + DIExpression *Expr, int64_t Offset, + const DebugLoc &dl, unsigned DbgSDNodeOrder); }; /// RegsForValue - This struct represents the registers (physical or virtual) @@ -958,26 +954,23 @@ private: /// type. /// struct RegsForValue { - /// ValueVTs - The value types of the values, which may not be legal, and + /// The value types of the values, which may not be legal, and /// may need be promoted or synthesized from one or more registers. - /// SmallVector<EVT, 4> ValueVTs; - /// RegVTs - The value types of the registers. This is the same size as - /// ValueVTs and it records, for each value, what the type of the assigned - /// register or registers are. (Individual values are never synthesized - /// from more than one type of register.) + /// The value types of the registers. This is the same size as ValueVTs and it + /// records, for each value, what the type of the assigned register or + /// registers are. (Individual values are never synthesized from more than one + /// type of register.) /// /// With virtual registers, the contents of RegVTs is redundant with TLI's /// getRegisterType member function, however when with physical registers /// it is necessary to have a separate record of the types. - /// SmallVector<MVT, 4> RegVTs; - /// Regs - This list holds the registers assigned to the values. + /// This list holds the registers assigned to the values. /// Each legal or promoted value requires one register, and each /// expanded value requires multiple registers. - /// SmallVector<unsigned, 4> Regs; RegsForValue(); @@ -987,33 +980,33 @@ struct RegsForValue { RegsForValue(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL, unsigned Reg, Type *Ty); - /// append - Add the specified values to this one. + /// Add the specified values to this one. void append(const RegsForValue &RHS) { ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); Regs.append(RHS.Regs.begin(), RHS.Regs.end()); } - /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from - /// this value and returns the result as a ValueVTs value. This uses - /// Chain/Flag as the input and updates them for the output Chain/Flag. 
- /// If the Flag pointer is NULL, no flag is used. + /// Emit a series of CopyFromReg nodes that copies from this value and returns + /// the result as a ValueVTs value. This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no + /// flag is used. SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr) const; - /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified - /// value into the registers specified by this object. This uses Chain/Flag - /// as the input and updates them for the output Chain/Flag. If the Flag - /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used - /// in printing better diagnostic messages on error. + /// Emit a series of CopyToReg nodes that copies the specified value into the + /// registers specified by this object. This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no + /// flag is used. If V is not nullptr, then it is used in printing better + /// diagnostic messages on error. void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr, ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; - /// AddInlineAsmOperands - Add this value to the specified inlineasm node - /// operand list. This adds the code marker, matching input operand index - /// (if applicable), and includes the number of values added into it. + /// Add this value to the specified inlineasm node operand list. This adds the + /// code marker, matching input operand index (if applicable), and includes + /// the number of values added into it. 
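A condensed usage sketch of this struct, using the constructor and getCopyToRegs signatures shown above (variable names are placeholders; the AddInlineAsmOperands declaration continues below):

    RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), VReg, V->getType());
    SDValue Chain = DAG.getEntryNode();
    SDValue Glue;
    // Copy Val into the assigned registers, threading Chain and Glue through each
    // CopyToReg so the copies stay ordered.
    RFV.getCopyToRegs(Val, DAG, dl, Chain, &Glue, V);
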
void AddInlineAsmOperands(unsigned Kind, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, std::vector<SDValue> &Ops) const; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0faaad8a21b76..488c60a28ffbc 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -300,6 +300,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation + case ISD::ABS: return "abs"; case ISD::BITREVERSE: return "bitreverse"; case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; @@ -366,11 +367,13 @@ static Printable PrintNodeId(const SDNode &Node) { }); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); } -void SDNode::dump(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); dbgs() << '\n'; } +#endif void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { for (unsigned i = 0, e = getNumValues(); i != e; ++i) { @@ -416,7 +419,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { OS << '<' << CSDN->getValueAPF().convertToDouble() << '>'; else { OS << "<APFloat("; - CSDN->getValueAPF().bitcastToAPInt().dump(); + CSDN->getValueAPF().bitcastToAPInt().print(OS, false); OS << ")>"; } } else if (const GlobalAddressSDNode *GADN = @@ -566,6 +569,7 @@ static bool shouldPrintInline(const SDNode &Node) { return Node.getNumOperands() == 0; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { for (const SDValue &Op : N->op_values()) { if (shouldPrintInline(*Op.getNode())) @@ -592,6 +596,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const { if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); dbgs() << "\n\n"; } +#endif void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { OS << PrintNodeId(*this) << ": "; @@ -618,6 +623,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G, } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -646,15 +652,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, DumpNodesr(OS, Op.getNode(), indent+2, G, once); } -void SDNode::dumpr() const { +LLVM_DUMP_METHOD void SDNode::dumpr() const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, nullptr, once); } -void SDNode::dumpr(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, G, once); } +#endif static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N, const SelectionDAG *G, unsigned depth, @@ -688,14 +695,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const { printrWithDepth(OS, G, 10); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const { printrWithDepth(dbgs(), G, depth); } -void SDNode::dumprFull(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const { // Don't print impossibly deep things. 
dumprWithDepth(G, 10); } +#endif void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { printr(OS, G); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 64e6c221229b2..e21204dbb9661 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -11,40 +11,65 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackProtector.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -59,6 +84,13 @@ #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <memory> +#include <string> +#include <utility> +#include <vector> using namespace llvm; @@ -73,104 +105,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered"); 
STATISTIC(NumFastIselFailLowerArguments, "Number of entry blocks where fast isel failed to lower arguments"); -#ifndef NDEBUG -static cl::opt<bool> -EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden, - cl::desc("Enable extra verbose messages in the \"fast\" " - "instruction selector")); - - // Terminators -STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret"); -STATISTIC(NumFastIselFailBr,"Fast isel fails on Br"); -STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch"); -STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr"); -STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke"); -STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume"); -STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable"); - - // Standard binary operators... -STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add"); -STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd"); -STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub"); -STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub"); -STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul"); -STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul"); -STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv"); -STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv"); -STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv"); -STATISTIC(NumFastIselFailURem,"Fast isel fails on URem"); -STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem"); -STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem"); - - // Logical operators... -STATISTIC(NumFastIselFailAnd,"Fast isel fails on And"); -STATISTIC(NumFastIselFailOr,"Fast isel fails on Or"); -STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor"); - - // Memory instructions... -STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca"); -STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load"); -STATISTIC(NumFastIselFailStore,"Fast isel fails on Store"); -STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg"); -STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM"); -STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence"); -STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr"); - - // Convert instructions... -STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc"); -STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt"); -STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt"); -STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc"); -STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt"); -STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI"); -STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI"); -STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP"); -STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP"); -STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr"); -STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt"); -STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast"); - - // Other instructions... 
-STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp"); -STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp"); -STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI"); -STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select"); -STATISTIC(NumFastIselFailCall,"Fast isel fails on Call"); -STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl"); -STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr"); -STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr"); -STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg"); -STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement"); -STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement"); -STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector"); -STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue"); -STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue"); -STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad"); - -// Intrinsic instructions... -STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call"); -STATISTIC(NumFastIselFailSAddWithOverflow, - "Fast isel fails on sadd.with.overflow"); -STATISTIC(NumFastIselFailUAddWithOverflow, - "Fast isel fails on uadd.with.overflow"); -STATISTIC(NumFastIselFailSSubWithOverflow, - "Fast isel fails on ssub.with.overflow"); -STATISTIC(NumFastIselFailUSubWithOverflow, - "Fast isel fails on usub.with.overflow"); -STATISTIC(NumFastIselFailSMulWithOverflow, - "Fast isel fails on smul.with.overflow"); -STATISTIC(NumFastIselFailUMulWithOverflow, - "Fast isel fails on umul.with.overflow"); -STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress"); -STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call"); -STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call"); -STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call"); -#endif - -static cl::opt<bool> -EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, - cl::desc("Enable verbose messages in the \"fast\" " - "instruction selector")); static cl::opt<int> EnableFastISelAbort( "fast-isel-abort", cl::Hidden, cl::desc("Enable abort calls when \"fast\" instruction selection " @@ -179,6 +113,11 @@ static cl::opt<int> EnableFastISelAbort( "abort for argument lowering, and 3 will never fallback " "to SelectionDAG.")); +static cl::opt<bool> EnableFastISelFallbackReport( + "fast-isel-report-on-fallback", cl::Hidden, + cl::desc("Emit a diagnostic when \"fast\" instruction selection " + "falls back to SelectionDAG.")); + static cl::opt<bool> UseMBPI("use-mbpi", cl::desc("use Machine Branch Probability Info"), @@ -238,7 +177,7 @@ MachinePassRegistry RegisterScheduler::Registry; /// //===---------------------------------------------------------------------===// static cl::opt<RegisterScheduler::FunctionPassCtor, false, - RegisterPassParser<RegisterScheduler> > + RegisterPassParser<RegisterScheduler>> ISHeuristic("pre-RA-sched", cl::init(&createDefaultScheduler), cl::Hidden, cl::desc("Instruction schedulers available (before register" @@ -249,6 +188,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target", createDefaultScheduler); namespace llvm { + //===--------------------------------------------------------------------===// /// \brief This class is used by SelectionDAGISel to temporarily override /// the optimization level on a per-function basis. 
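The class documented above exists for situations like the following, where a single function in an otherwise optimized module forces instruction selection back to the fast, -O0 style path (attribute spelling per GCC/Clang; the function is invented):

    // Compiled at -O2, but the optnone attribute makes SelectionDAGISel lower the
    // optimization level for just this function while it is being selected.
    __attribute__((optnone)) int debugMe(int X) {
      return X * 3 + 1;
    }
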
@@ -318,6 +258,7 @@ namespace llvm { "Unknown sched type!"); return createILPListDAGScheduler(IS, OptLevel); } + } // end namespace llvm // EmitInstrWithCustomInserter - This method should be implemented by targets @@ -431,8 +372,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MachineFunctionProperties::Property::Selected)) return false; // Do some sanity-checking on the command-line options. - assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) && - "-fast-isel-verbose requires -fast-isel"); assert((!EnableFastISelAbort || TM.Options.EnableFastISel) && "-fast-isel-abort > 0 requires -fast-isel"); @@ -457,12 +396,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; + ORE = make_unique<OptimizationRemarkEmitter>(&Fn); DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); SplitCriticalSideEffectEdges(const_cast<Function &>(Fn)); - CurDAG->init(*MF); + CurDAG->init(*MF, *ORE); FuncInfo->set(Fn, *MF, CurDAG); if (UseMBPI && OptLevel != CodeGenOpt::None) @@ -502,6 +442,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TLI->initializeSplitCSR(EntryMBB); SelectAllBasicBlocks(Fn); + if (FastISelFailed && EnableFastISelFallbackReport) { + DiagnosticInfoISelFallback DiagFallback(Fn); + Fn.getContext().diagnose(DiagFallback); + } // If the first basic block in the function has live ins that need to be // copied into vregs, emit the copies into the top of the block before @@ -628,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { unsigned To = I->second; // If To is also scheduled to be replaced, find what its ultimate // replacement is. - for (;;) { + while (true) { DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To); if (J == E) break; To = J->second; @@ -666,13 +610,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { return true; } +static void reportFastISelFailure(MachineFunction &MF, + OptimizationRemarkEmitter &ORE, + OptimizationRemarkMissed &R, + bool ShouldAbort) { + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || ShouldAbort) + R << (" (in function: " + MF.getName() + ")").str(); + + if (ShouldAbort) + report_fatal_error(R.getMsg()); + + ORE.emit(R); +} + void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { // Lower the instructions. If a call is emitted as a tail call, cease emitting // nodes for this block. - for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) - SDB->visit(*I); + for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) { + if (!ElidedArgCopyInstrs.count(&*I)) + SDB->visit(*I); + } // Make sure the root of the DAG is up-to-date. CurDAG->setRoot(SDB->getControlRoot()); @@ -731,6 +692,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { int BlockNumber = -1; (void)BlockNumber; bool MatchFilterBB = false; (void)MatchFilterBB; + + // Pre-type legalization allow creation of any node types. 
+ CurDAG->NewNodesMustHaveLegalTypes = false; + #ifndef NDEBUG MatchFilterBB = (FilterDAGBasicBlockName.empty() || FilterDAGBasicBlockName == @@ -777,6 +742,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); + // Only allow creation of legal node types. CurDAG->NewNodesMustHaveLegalTypes = true; if (Changed) { @@ -802,12 +768,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { } if (Changed) { + DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber + << " '" << BlockName << "'\n"; CurDAG->dump()); + { NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } + DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber + << " '" << BlockName << "'\n"; CurDAG->dump()); + if (ViewDAGCombineLT && MatchFilterBB) CurDAG->viewGraph("dag-combine-lv input for " + BlockName); @@ -907,10 +879,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { } namespace { + /// ISelUpdater - helper class to handle updates of the instruction selection /// graph. class ISelUpdater : public SelectionDAG::DAGUpdateListener { SelectionDAG::allnodes_iterator &ISelPosition; + public: ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp) : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} @@ -923,8 +897,53 @@ public: ++ISelPosition; } }; + } // end anonymous namespace +static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) { + unsigned OrigOpc = Node->getOpcode(); + switch (OrigOpc) { + case ISD::STRICT_FADD: NewOpc = ISD::FADD; return true; + case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; return true; + case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; return true; + case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; return true; + case ISD::STRICT_FREM: NewOpc = ISD::FREM; return true; + default: return false; + } +} + +SDNode* SelectionDAGISel::MutateStrictFPToFP(SDNode *Node, unsigned NewOpc) { + assert(((Node->getOpcode() == ISD::STRICT_FADD && NewOpc == ISD::FADD) || + (Node->getOpcode() == ISD::STRICT_FSUB && NewOpc == ISD::FSUB) || + (Node->getOpcode() == ISD::STRICT_FMUL && NewOpc == ISD::FMUL) || + (Node->getOpcode() == ISD::STRICT_FDIV && NewOpc == ISD::FDIV) || + (Node->getOpcode() == ISD::STRICT_FREM && NewOpc == ISD::FREM)) && + "Unexpected StrictFP opcode!"); + + // We're taking this node out of the chain, so we need to re-link things. + SDValue InputChain = Node->getOperand(0); + SDValue OutputChain = SDValue(Node, 1); + CurDAG->ReplaceAllUsesOfValueWith(OutputChain, InputChain); + + SDVTList VTs = CurDAG->getVTList(Node->getOperand(1).getValueType()); + SDValue Ops[2] = { Node->getOperand(1), Node->getOperand(2) }; + SDNode *Res = CurDAG->MorphNodeTo(Node, NewOpc, VTs, Ops); + + // MorphNodeTo can operate in two ways: if an existing node with the + // specified operands exists, it can just return it. Otherwise, it + // updates the node in place to have the requested operands. + if (Res == Node) { + // If we updated the node in place, reset the node ID. To the isel, + // this should be just like a newly allocated machine node. 
+ Res->setNodeId(-1); + } else { + CurDAG->ReplaceAllUsesWith(Node, Res); + CurDAG->RemoveDeadNode(Node); + } + + return Res; +} + void SelectionDAGISel::DoInstructionSelection() { DEBUG(dbgs() << "===== Instruction selection begins: BB#" << FuncInfo->MBB->getNumber() @@ -960,7 +979,23 @@ void SelectionDAGISel::DoInstructionSelection() { if (Node->use_empty()) continue; + // When we are using non-default rounding modes or FP exception behavior + // FP operations are represented by StrictFP pseudo-operations. They + // need to be simplified here so that the target-specific instruction + // selectors know how to handle them. + // + // If the current node is a strict FP pseudo-op, the isStrictFPOp() + // function will provide the corresponding normal FP opcode to which the + // node should be mutated. + unsigned NormalFPOpc = ISD::UNDEF; + bool IsStrictFPOp = isStrictFPOp(Node, NormalFPOpc); + if (IsStrictFPOp) + Node = MutateStrictFPToFP(Node, NormalFPOpc); + Select(Node); + + // FIXME: Add code here to attach an implicit def and use of + // target-specific FP environment registers. } CurDAG->setRoot(Dummy.getValue()); @@ -1046,116 +1081,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I, !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } -#ifndef NDEBUG -// Collect per Instruction statistics for fast-isel misses. Only those -// instructions that cause the bail are accounted for. It does not account for -// instructions higher in the block. Thus, summing the per instructions stats -// will not add up to what is reported by NumFastIselFailures. -static void collectFailStats(const Instruction *I) { - switch (I->getOpcode()) { - default: assert (0 && "<Invalid operator> "); - - // Terminators - case Instruction::Ret: NumFastIselFailRet++; return; - case Instruction::Br: NumFastIselFailBr++; return; - case Instruction::Switch: NumFastIselFailSwitch++; return; - case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return; - case Instruction::Invoke: NumFastIselFailInvoke++; return; - case Instruction::Resume: NumFastIselFailResume++; return; - case Instruction::Unreachable: NumFastIselFailUnreachable++; return; - - // Standard binary operators... - case Instruction::Add: NumFastIselFailAdd++; return; - case Instruction::FAdd: NumFastIselFailFAdd++; return; - case Instruction::Sub: NumFastIselFailSub++; return; - case Instruction::FSub: NumFastIselFailFSub++; return; - case Instruction::Mul: NumFastIselFailMul++; return; - case Instruction::FMul: NumFastIselFailFMul++; return; - case Instruction::UDiv: NumFastIselFailUDiv++; return; - case Instruction::SDiv: NumFastIselFailSDiv++; return; - case Instruction::FDiv: NumFastIselFailFDiv++; return; - case Instruction::URem: NumFastIselFailURem++; return; - case Instruction::SRem: NumFastIselFailSRem++; return; - case Instruction::FRem: NumFastIselFailFRem++; return; - - // Logical operators... - case Instruction::And: NumFastIselFailAnd++; return; - case Instruction::Or: NumFastIselFailOr++; return; - case Instruction::Xor: NumFastIselFailXor++; return; - - // Memory instructions... 
- case Instruction::Alloca: NumFastIselFailAlloca++; return; - case Instruction::Load: NumFastIselFailLoad++; return; - case Instruction::Store: NumFastIselFailStore++; return; - case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return; - case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return; - case Instruction::Fence: NumFastIselFailFence++; return; - case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return; - - // Convert instructions... - case Instruction::Trunc: NumFastIselFailTrunc++; return; - case Instruction::ZExt: NumFastIselFailZExt++; return; - case Instruction::SExt: NumFastIselFailSExt++; return; - case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return; - case Instruction::FPExt: NumFastIselFailFPExt++; return; - case Instruction::FPToUI: NumFastIselFailFPToUI++; return; - case Instruction::FPToSI: NumFastIselFailFPToSI++; return; - case Instruction::UIToFP: NumFastIselFailUIToFP++; return; - case Instruction::SIToFP: NumFastIselFailSIToFP++; return; - case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; - case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return; - case Instruction::BitCast: NumFastIselFailBitCast++; return; - - // Other instructions... - case Instruction::ICmp: NumFastIselFailICmp++; return; - case Instruction::FCmp: NumFastIselFailFCmp++; return; - case Instruction::PHI: NumFastIselFailPHI++; return; - case Instruction::Select: NumFastIselFailSelect++; return; - case Instruction::Call: { - if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) { - switch (Intrinsic->getIntrinsicID()) { - default: - NumFastIselFailIntrinsicCall++; return; - case Intrinsic::sadd_with_overflow: - NumFastIselFailSAddWithOverflow++; return; - case Intrinsic::uadd_with_overflow: - NumFastIselFailUAddWithOverflow++; return; - case Intrinsic::ssub_with_overflow: - NumFastIselFailSSubWithOverflow++; return; - case Intrinsic::usub_with_overflow: - NumFastIselFailUSubWithOverflow++; return; - case Intrinsic::smul_with_overflow: - NumFastIselFailSMulWithOverflow++; return; - case Intrinsic::umul_with_overflow: - NumFastIselFailUMulWithOverflow++; return; - case Intrinsic::frameaddress: - NumFastIselFailFrameaddress++; return; - case Intrinsic::sqrt: - NumFastIselFailSqrt++; return; - case Intrinsic::experimental_stackmap: - NumFastIselFailStackMap++; return; - case Intrinsic::experimental_patchpoint_void: // fall-through - case Intrinsic::experimental_patchpoint_i64: - NumFastIselFailPatchPoint++; return; - } - } - NumFastIselFailCall++; - return; - } - case Instruction::Shl: NumFastIselFailShl++; return; - case Instruction::LShr: NumFastIselFailLShr++; return; - case Instruction::AShr: NumFastIselFailAShr++; return; - case Instruction::VAArg: NumFastIselFailVAArg++; return; - case Instruction::ExtractElement: NumFastIselFailExtractElement++; return; - case Instruction::InsertElement: NumFastIselFailInsertElement++; return; - case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return; - case Instruction::ExtractValue: NumFastIselFailExtractValue++; return; - case Instruction::InsertValue: NumFastIselFailInsertValue++; return; - case Instruction::LandingPad: NumFastIselFailLandingPad++; return; - } -} -#endif // NDEBUG - /// Set up SwiftErrorVals by going through the function. If the function has /// swifterror argument, it will be the first entry. 
static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, @@ -1190,9 +1115,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, } static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, + FastISel *FastIS, const TargetLowering *TLI, const TargetInstrInfo *TII, - const BasicBlock *LLVMBB, SelectionDAGBuilder *SDB) { if (!TLI->supportSwiftError()) return; @@ -1202,22 +1127,27 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, if (FuncInfo->SwiftErrorVals.empty()) return; - if (pred_begin(LLVMBB) == pred_end(LLVMBB)) { - auto &DL = FuncInfo->MF->getDataLayout(); - auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); - for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { - // We will always generate a copy from the argument. It is always used at - // least by the 'return' of the swifterror. - if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) - continue; - unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); - // Assign Undef to Vreg. We construct MI directly to make sure it works - // with FastISel. - BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), - SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), - VReg); - FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); - } + assert(FuncInfo->MBB == &*FuncInfo->MF->begin() && + "expected to insert into entry block"); + auto &DL = FuncInfo->MF->getDataLayout(); + auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); + for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { + // We will always generate a copy from the argument. It is always used at + // least by the 'return' of the swifterror. + if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) + continue; + unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); + // Assign Undef to Vreg. We construct MI directly to make sure it works + // with FastISel. + BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), + SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), + VReg); + + // Keep FastIS informed about the value we just inserted. + if (FastIS) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + + FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); } } @@ -1340,6 +1270,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { } void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { + FastISelFailed = false; // Initialize the Fast-ISel state, if needed. FastISel *FastIS = nullptr; if (TM.Options.EnableFastISel) @@ -1347,12 +1278,53 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { setupSwiftErrorVals(Fn, TLI, FuncInfo); - // Iterate over all basic blocks in the function. ReversePostOrderTraversal<const Function*> RPOT(&Fn); - for (ReversePostOrderTraversal<const Function*>::rpo_iterator - I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { - const BasicBlock *LLVMBB = *I; + // Lower arguments up front. An RPO iteration always visits the entry block + // first. + assert(*RPOT.begin() == &Fn.getEntryBlock()); + ++NumEntryBlocks; + + // Set up FuncInfo for ISel. Entry blocks never have PHIs. + FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; + FuncInfo->InsertPt = FuncInfo->MBB->begin(); + + if (!FastIS) { + LowerArguments(Fn); + } else { + // See if fast isel can lower the arguments. 
+ FastIS->startNewBlock(); + if (!FastIS->lowerArguments()) { + FastISelFailed = true; + // Fast isel failed to lower these arguments + ++NumFastIselFailLowerArguments; + + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Fn.getSubprogram(), + &Fn.getEntryBlock()); + R << "FastISel didn't lower all arguments: " + << ore::NV("Prototype", Fn.getType()); + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); + + // Use SelectionDAG argument lowering + LowerArguments(Fn); + CurDAG->setRoot(SDB->getControlRoot()); + SDB->clear(); + CodeGenAndEmitDAG(); + } + + // If we inserted any instructions at the beginning, make a note of + // where they are, so we can be sure to emit subsequent instructions + // after them. + if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + else + FastIS->setLastLocalValue(nullptr); + } + createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + + // Iterate over all basic blocks in the function. + for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); @@ -1384,8 +1356,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; if (!FuncInfo->MBB) continue; // Some blocks like catchpads have no code or MBB. - FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); - createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB); + + // Insert new instructions after any phi or argument setup code. + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; @@ -1396,35 +1369,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { - FastIS->startNewBlock(); - - // Emit code for any incoming arguments. This must happen before - // beginning FastISel on the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - - // Lower any arguments needed in this block if this is the entry block. - if (!FastIS->lowerArguments()) { - // Fast isel failed to lower these arguments - ++NumFastIselFailLowerArguments; - if (EnableFastISelAbort > 1) - report_fatal_error("FastISel didn't lower all arguments"); - - // Use SelectionDAG argument lowering - LowerArguments(Fn); - CurDAG->setRoot(SDB->getControlRoot()); - SDB->clear(); - CodeGenAndEmitDAG(); - } - - // If we inserted any instructions at the beginning, make a note of - // where they are, so we can be sure to emit subsequent instructions - // after them. - if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) - FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); - else - FastIS->setLastLocalValue(nullptr); - } + if (LLVMBB != &Fn.getEntryBlock()) + FastIS->startNewBlock(); unsigned NumFastIselRemaining = std::distance(Begin, End); // Do FastISel on as many instructions as possible. @@ -1432,7 +1378,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. - if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { + if (isFoldedOrDeadInstruction(Inst, FuncInfo) || + ElidedArgCopyInstrs.count(Inst)) { --NumFastIselRemaining; continue; } @@ -1443,6 +1390,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Try to select the instruction with FastISel. 
if (FastIS->selectInstruction(Inst)) { + FastISelFailed = true; --NumFastIselRemaining; ++NumFastIselSuccess; // If fast isel succeeded, skip over all the folded instructions, and @@ -1465,22 +1413,22 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } -#ifndef NDEBUG - if (EnableFastISelVerbose2) - collectFailStats(Inst); -#endif - // Then handle certain instructions as single-LLVM-Instruction blocks. if (isa<CallInst>(Inst)) { + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); + + R << "FastISel missed call"; + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; - if (EnableFastISelVerbose || EnableFastISelAbort) { - dbgs() << "FastISel missed call: "; - Inst->dump(); + R << ": " << InstStr.str(); } - if (EnableFastISelAbort > 2) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. - report_fatal_error("FastISel didn't select the entire block"); + + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2); if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && !Inst->use_empty()) { @@ -1509,35 +1457,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); + bool ShouldAbort = EnableFastISelAbort; - if (EnableFastISelVerbose || EnableFastISelAbort) { - if (isa<TerminatorInst>(Inst)) { - // Use a different message for terminator misses. - dbgs() << "FastISel missed terminator: "; - // Don't abort unless for terminator unless the level is really high - ShouldAbort = (EnableFastISelAbort > 2); - } else { - dbgs() << "FastISel miss: "; - } - Inst->dump(); + if (isa<TerminatorInst>(Inst)) { + // Use a different message for terminator misses. + R << "FastISel missed terminator"; + // Don't abort for terminator unless the level is really high + ShouldAbort = (EnableFastISelAbort > 2); + } else { + R << "FastISel missed"; + } + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; + R << ": " << InstStr.str(); } - if (ShouldAbort) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. - report_fatal_error("FastISel didn't select the entire block"); + + reportFastISelFailure(*MF, *ORE, R, ShouldAbort); NumFastIselFailures += NumFastIselRemaining; break; } FastIS->recomputeInsertPt(); - } else { - // Lower any arguments needed in this block if this is the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - LowerArguments(Fn); - } } + if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) { bool FunctionBasedInstrumentation = TLI->getSSPStackGuardCheck(*Fn.getParent()); @@ -1556,10 +1504,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // block. bool HadTailCall; SelectBasicBlock(Begin, BI, HadTailCall); + + // But if FastISel was run, we already selected some of the block. + // If we emitted a tail-call, we need to delete any previously emitted + // instruction that follows it. 
+ if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end()) + FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end()); } FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); + ElidedArgCopyInstrs.clear(); } propagateSwiftErrorVRegs(FuncInfo); @@ -2177,7 +2132,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, IgnoreChains = false; } - SmallPtrSet<SDNode*, 16> Visited; return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); } @@ -2554,7 +2508,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { // Accept if it is exactly the same as a previously recorded node. unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid CheckSame"); @@ -2564,9 +2518,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// CheckChildSame - Implements OP_CheckChildXSame. LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, - unsigned ChildNo) { + SDValue N, + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes, + unsigned ChildNo) { if (ChildNo >= N.getNumOperands()) return false; // Match fails if out of range child #. return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo), @@ -2688,7 +2642,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, unsigned Index, SDValue N, bool &Result, const SelectionDAGISel &SDISel, - SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { switch (Table[Index++]) { default: Result = false; @@ -2756,6 +2710,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, } namespace { + struct MatchScope { /// FailIndex - If this match fails, this is the index to continue with. unsigned FailIndex; @@ -2785,6 +2740,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener SDNode **NodeToMatch; SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes; SmallVectorImpl<MatchScope> &MatchScopes; + public: MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch, SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN, @@ -2816,6 +2772,7 @@ public: J.setNode(E); } }; + } // end anonymous namespace void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, @@ -2921,7 +2878,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // with an OPC_SwitchOpcode instruction. Populate the table now, since this // is the first time we're selecting an instruction. unsigned Idx = 1; - while (1) { + while (true) { // Get the size of this case. unsigned CaseSize = MatcherTable[Idx++]; if (CaseSize & 128) @@ -2942,7 +2899,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MatcherIndex = OpcodeOffset[N.getOpcode()]; } - while (1) { + while (true) { assert(MatcherIndex < TableSize && "Invalid index"); #ifndef NDEBUG unsigned CurrentOpcodeIndex = MatcherIndex; @@ -2957,7 +2914,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // immediately fail, don't even bother pushing a scope for them. 
unsigned FailIndex; - while (1) { + while (true) { unsigned NumToSkip = MatcherTable[MatcherIndex++]; if (NumToSkip & 128) NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); @@ -3118,7 +3075,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, unsigned CurNodeOpcode = N.getOpcode(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3149,7 +3106,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MVT CurNodeVT = N.getSimpleValueType(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3215,7 +3172,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // a single use. bool HasMultipleUses = false; for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) - if (!NodeStack[i].hasOneUse()) { + if (!NodeStack[i].getNode()->hasOneUse()) { HasMultipleUses = true; break; } @@ -3381,6 +3338,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr)); continue; } + case OPC_Coverage: { + // This is emitted right before MorphNode/EmitNode. + // So it should be safe to assume that this node has been selected + unsigned index = MatcherTable[MatcherIndex++]; + index |= (MatcherTable[MatcherIndex++] << 8); + dbgs() << "COVERED: " << getPatternForIndex(index) << "\n"; + dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n"; + continue; + } case OPC_EmitNode: case OPC_MorphNodeTo: case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2: @@ -3473,7 +3439,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i), nullptr)); } - } else { assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE && "NodeToMatch was removed partway through selection"); @@ -3610,7 +3575,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // find a case to check. DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n"); ++NumDAGIselRetries; - while (1) { + while (true) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); return; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 690f0d2c80821..2756e276c6a91 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -55,14 +55,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, // Conservatively require the attributes of the call to match those of // the return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CallerAttrs = F->getAttributes(); - if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex) - .removeAttribute(Attribute::NoAlias).hasAttributes()) + AttributeList CallerAttrs = F->getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) + .hasAttributes()) return false; // It's not safe to eliminate the sign / zero extension of the return value. 
- if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) return false; // Check if the only use is a function return node. @@ -96,19 +97,20 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, /// \brief Set CallLoweringInfo attribute flags based on a call instruction /// and called function attributes. -void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); - isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); - Alignment = CS->getParamAlignment(AttrIdx); +void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS, + unsigned ArgIdx) { + IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt); + IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt); + IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg); + IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet); + IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest); + IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal); + IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca); + IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned); + IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf); + IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError); + // FIXME: getParamAlignment is off by one from argument index. 
+ Alignment = CS->getParamAlignment(ArgIdx + 1); } /// Generate a libcall taking the given operands as arguments and returning a @@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, for (SDValue Op : Ops) { Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); - Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); Args.push_back(Entry); } @@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed) - .setSExtResult(signExtend).setZExtResult(!signExtend); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setNoReturn(doesNotReturn) + .setDiscardResult(!isReturnValueUsed) + .setSExtResult(signExtend) + .setZExtResult(!signExtend); return LowerCallTo(CLI); } @@ -334,34 +339,35 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // Optimization Methods //===----------------------------------------------------------------------===// -/// Check to see if the specified operand of the specified instruction is a -/// constant integer. If so, check to see if there are any bits set in the -/// constant that are not demanded. If so, shrink the constant and return true. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, - const APInt &Demanded) { - SDLoc dl(Op); +/// If the specified instruction has a constant integer operand and there are +/// bits set in that constant that are not demanded, then clear those bits and +/// return true. +bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant( + SDValue Op, const APInt &Demanded) { + SDLoc DL(Op); + unsigned Opcode = Op.getOpcode(); // FIXME: ISD::SELECT, ISD::SELECT_CC - switch (Op.getOpcode()) { - default: break; + switch (Opcode) { + default: + break; case ISD::XOR: case ISD::AND: case ISD::OR: { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!C) return false; + auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Op1C) + return false; - if (Op.getOpcode() == ISD::XOR && - (C->getAPIntValue() | (~Demanded)).isAllOnesValue()) + // If this is a 'not' op, don't touch it because that's a canonical form. 
+ const APInt &C = Op1C->getAPIntValue(); + if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue()) return false; - // if we can expand it to have all bits set, do it - if (C->getAPIntValue().intersects(~Demanded)) { + if (C.intersects(~Demanded)) { EVT VT = Op.getValueType(); - SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0), - DAG.getConstant(Demanded & - C->getAPIntValue(), - dl, VT)); - return CombineTo(Op, New); + SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); + SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); + return CombineTo(Op, NewOp); } break; @@ -470,6 +476,21 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, return true; } +bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask, + DAGCombinerInfo &DCI) const { + + SelectionDAG &DAG = DCI.DAG; + TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + APInt KnownZero, KnownOne; + + bool Simplified = SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne, + TLO); + if (Simplified) + DCI.CommitTargetLoweringOpt(TLO); + return Simplified; +} + /// Look at Op. At this point, we know that only the DemandedMask bits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the @@ -711,8 +732,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - KnownZero = KnownZeroOut; - KnownOne = KnownOneOut; + KnownZero = std::move(KnownZeroOut); + KnownOne = std::move(KnownOneOut); break; case ISD::SELECT: if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero, @@ -750,6 +771,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; + case ISD::SETCC: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // If (1) we only need the sign-bit, (2) the setcc operands are the same + // width as the setcc result, and (3) the result of a setcc conforms to 0 or + // -1, we may be able to bypass the setcc. + if (NewMask.isSignBit() && Op0.getScalarValueSizeInBits() == BitWidth && + getBooleanContents(Op.getValueType()) == + BooleanContent::ZeroOrNegativeOneBooleanContent) { + // If we're testing X < 0, then this compare isn't needed - just use X! + // FIXME: We're limiting to integer types here, but this should also work + // if we don't care about FP signed-zero. The use of SETLT with FP means + // that we don't care about NaNs. + if (CC == ISD::SETLT && Op1.getValueType().isInteger() && + (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode()))) + return TLO.CombineTo(Op, Op0); + + // TODO: Should we check for other forms of sign-bit comparisons? + // Examples: X <= -1, X >= 0 + } + if (getBooleanContents(Op0.getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + KnownZero.setBitsFrom(1); + break; + } case ISD::SHL: if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); @@ -834,7 +882,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, KnownZero <<= SA->getZExtValue(); KnownOne <<= SA->getZExtValue(); // low bits known zero. 
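A minimal sketch (illustration only, not from this commit) of the rule the rewritten ShrinkDemandedConstant above applies: for AND/OR/XOR with a constant right-hand side, bits of the constant that are never demanded downstream can be cleared, except when the node is a canonical 'not' (an XOR whose constant covers all demanded bits). Plain 32-bit integers stand in for APInt; the helper name and mask values are made up for the example.

#include <cassert>
#include <cstdint>

// Clear undemanded bits of the constant C, unless doing so would touch a
// canonical 'not' (XOR with all demanded bits set), which is left alone.
static bool shrinkDemandedConstant(bool IsXor, uint32_t &C, uint32_t Demanded) {
  if (IsXor && (C | ~Demanded) == 0xFFFFFFFFu)
    return false;                 // canonical 'not', keep it
  if ((C & ~Demanded) == 0)
    return false;                 // nothing to shrink
  C &= Demanded;                  // clear the undemanded bits
  return true;
}

int main() {
  uint32_t C = 0x00FF00FFu;
  bool Changed = shrinkDemandedConstant(/*IsXor=*/false, C, /*Demanded=*/0xFFu);
  assert(Changed && C == 0xFFu);  // only the low byte is demanded
  (void)Changed;
  return 0;
}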
- KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue()); + KnownZero.setLowBits(SA->getZExtValue()); } break; case ISD::SRL: @@ -853,7 +901,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + InDemandedMask.setLowBits(ShAmt); // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a // single shift. We can do this if the top bits (which are shifted out) @@ -884,8 +932,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - KnownZero |= HighBits; // High bits known zero. + KnownZero.setHighBits(ShAmt); // High bits known zero. } break; case ISD::SRA: @@ -911,7 +958,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + InDemandedMask.setLowBits(ShAmt); // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. @@ -1075,7 +1122,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); APInt InMask = APInt::getLowBitsSet(BitWidth, InBits); - APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits); + APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1); APInt NewBits = ~InMask & NewMask; // If none of the top bits are demanded, convert this into an any_extend. @@ -1191,7 +1238,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return true; assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero |= ~InMask & NewMask; + KnownZero |= ~InMask; break; } case ISD::BITCAST: @@ -1281,6 +1328,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1295,6 +1343,7 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, /// This method can be implemented by targets that want to expose additional /// information about sign bits to the DAG Combiner. unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + const APInt &, const SelectionDAG &, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -2050,6 +2099,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (Cond == ISD::SETO || Cond == ISD::SETUO) return DAG.getSetCC(dl, VT, N0, N0, Cond); + // setcc (fneg x), C -> setcc swap(pred) x, -C + if (N0.getOpcode() == ISD::FNEG) { + ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(SwapCond, N0.getSimpleValueType())) { + SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1); + return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond); + } + } + // If the condition is not legal, see if we can find an equivalent one // which is legal. 
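A quick standalone check (illustration only, not from this commit) of the identity behind the new SimplifySetCC fold above, which rewrites setcc (fneg x), C into setcc x, -C with the predicate swapped. Names are invented for the example; LT/GT stand in for the swapped condition codes.

#include <cassert>

// Negation is exact, so comparing -x against C gives the same answer as
// comparing x against -C with the predicate swapped (LT becomes GT here).
// NaN operands make both ordered comparisons false, so the sides still agree.
static bool negLhsLt(double X, double C) { return -X < C; }
static bool swappedGt(double X, double C) { return X > -C; }

int main() {
  const double Xs[] = {-2.5, -0.0, 0.0, 1.0, 3.75};
  const double Cs[] = {-1.0, 0.0, 2.0};
  for (double X : Xs)
    for (double C : Cs)
      assert(negLhsLt(X, C) == swappedGt(X, C));
  return 0;
}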
if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) { @@ -2470,10 +2529,7 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr)); // Figure out which register class contains this reg. - for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), - E = RI->regclass_end(); RCI != E; ++RCI) { - const TargetRegisterClass *RC = *RCI; - + for (const TargetRegisterClass *RC : RI->regclasses()) { // If none of the value types for this register class are valid, we // can't use it. For example, 64-bit reg classes on 32-bit targets. if (!isLegalRC(RC)) @@ -2933,7 +2989,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -3808,7 +3864,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); - CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); + CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 209bbe54ea236..ab578df4069d5 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -64,6 +64,7 @@ public: private: bool setupEntryBlockAndCallSites(Function &F); + bool undoSwiftErrorSelect(Function &F); void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal); Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads); void lowerIncomingArguments(Function &F); @@ -174,8 +175,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, // because the value needs to be added to the global context list. auto &DL = F.getParent()->getDataLayout(); unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); - FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", - &EntryBB->front()); + FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), + nullptr, Align, "fn_context", &EntryBB->front()); // Fill in the function context structure. for (LandingPadInst *LPI : LPads) { @@ -458,14 +459,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { return true; } +bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) { + // We have inserted dummy copies 'select true, arg, undef' in the entry block + // for arguments to simplify this pass. + // swifterror arguments cannot be used in this way. Undo the select for the + // swifterror argument. 
+ for (auto &AI : F.args()) { + if (AI.isSwiftError()) { + assert(AI.hasOneUse() && "Must have converted the argument to a select"); + auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser()); + assert(Select && "There must be single select user"); + auto *OrigSwiftError = cast<Argument>(Select->getTrueValue()); + Select->replaceAllUsesWith(OrigSwiftError); + Select->eraseFromParent(); + return true; + } + } + return false; +} + bool SjLjEHPrepare::runOnFunction(Function &F) { Module &M = *F.getParent(); RegisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), nullptr); + PointerType::getUnqual(FunctionContextTy)); UnregisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), nullptr); + PointerType::getUnqual(FunctionContextTy)); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); @@ -476,5 +496,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) { FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); bool Res = setupEntryBlockAndCallSites(F); + if (Res) + Res |= undoSwiftErrorSelect(F); return Res; } diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index dba103e9bfb15..bc2a1d09056bd 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { return false; } +void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) { + assert(!MI.isBundledWithPred() && + "Use removeSingleMachineInstrFromMaps() instread"); + Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); + if (mi2iItr == mi2iMap.end()) + return; + + SlotIndex MIIndex = mi2iItr->second; + IndexListEntry &MIEntry = *MIIndex.listEntry(); + assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); + mi2iMap.erase(mi2iItr); + // FIXME: Eventually we want to actually delete these indexes. + MIEntry.setInstr(nullptr); +} + +void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) { + Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); + if (mi2iItr == mi2iMap.end()) + return; + + SlotIndex MIIndex = mi2iItr->second; + IndexListEntry &MIEntry = *MIIndex.listEntry(); + assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); + mi2iMap.erase(mi2iItr); + + // When removing the first instruction of a bundle update mapping to next + // instruction. + if (MI.isBundledWithSucc()) { + // Only the first instruction of a bundle should have an index assigned. + assert(!MI.isBundledWithPred() && "Should have first bundle isntruction"); + + MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator()); + MachineInstr &NextMI = *Next; + MIEntry.setInstr(&NextMI); + mi2iMap.insert(std::make_pair(&NextMI, MIIndex)); + return; + } else { + // FIXME: Eventually we want to actually delete these indexes. + MIEntry.setInstr(nullptr); + } +} + void SlotIndexes::renumberIndexes() { // Renumber updates the index of every element of the index list. 
DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n"); diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 1c6a84e539440..3a50aaa69985d 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -487,12 +488,126 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { VFP = ValueForcePair(nullptr, true); } +SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + bool FirstCopy = !Def.isValid(); + MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) + .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) + | getInternalReadRegState(!FirstCopy), SubIdx) + .addReg(FromReg, 0, SubIdx); + + BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); + if (FirstCopy) { + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } else { + CopyMI->bundleWithPred(); + } + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx); + DestLI.refineSubRanges(Allocator, LaneMask, + [Def, &Allocator](LiveInterval::SubRange& SR) { + SR.createDeadDef(Def, Allocator); + }); + return Def; +} + +SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg, + LaneBitmask LaneMask, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { + // The full vreg is copied. + MachineInstr *CopyMI = + BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg); + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } + + // Only a subset of lanes needs to be copied. The following is a simple + // heuristic to construct a sequence of COPYs. We could add a target + // specific callback if this turns out to be suboptimal. + LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx)); + + // First pass: Try to find a perfectly matching subregister index. If none + // exists find the one covering the most lanemask bits. + SmallVector<unsigned, 8> PossibleIndexes; + unsigned BestIdx = 0; + unsigned BestCover = 0; + const TargetRegisterClass *RC = MRI.getRegClass(FromReg); + assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class"); + for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI.getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LaneMask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside \p LaneMask. 
+ if ((SubRegMask & ~LaneMask).any()) + continue; + + unsigned PopCount = countPopulation(SubRegMask.getAsInteger()); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, + BestIdx, DestLI, Late, SlotIndex()); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + LaneBitmask LanesLeft = + LaneMask & ~(TRI.getSubRegIndexLaneMask(BestCover)); + while (LanesLeft.any()) { + unsigned BestIdx = 0; + int BestCover = INT_MIN; + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LanesLeft) { + BestIdx = Idx; + break; + } + + // Try to cover as much of the remaining lanes as possible but + // as few of the already covered lanes as possible. + int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger()) + - countPopulation((SubRegMask & ~LanesLeft).getAsInteger()); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, + DestLI, Late, Def); + LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx); + } + + return Def; +} + VNInfo *SplitEditor::defFromParent(unsigned RegIdx, VNInfo *ParentVNI, SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineInstr *CopyMI = nullptr; SlotIndex Def; LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -505,24 +620,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); + unsigned Reg = LI->reg; bool DidRemat = false; if (OrigVNI) { LiveRangeEdit::Remat RM(ParentVNI); RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def); if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) { - Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late); + Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late); ++NumRemats; DidRemat = true; } } if (!DidRemat) { - // Can't remat, just insert a copy from parent. - CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg) - .addReg(Edit->getReg()); - Def = LIS.getSlotIndexes() - ->insertMachineInstrInMaps(*CopyMI, Late) - .getRegSlot(); + LaneBitmask LaneMask; + if (LI->hasSubRanges()) { + LaneMask = LaneBitmask::getNone(); + for (LiveInterval::SubRange &S : LI->subranges()) + LaneMask |= S.LaneMask; + } else { + LaneMask = LaneBitmask::getAll(); + } + ++NumCopies; + Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx); } // Define the value in Reg. diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index a75738aaf4467..9d409e924a3d8 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -405,6 +405,17 @@ private: /// deleteRematVictims - Delete defs that are dead after rematerializing. void deleteRematVictims(); + /// Add a copy instruction copying \p FromReg to \p ToReg before + /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it + /// necessary to construct a sequence of copies to cover it exactly. 
+ SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + bool Late, unsigned RegIdx); + + SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy); + public: /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. /// Newly created intervals will be appended to newIntervals. diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 89c4b574f17f4..f51d959a089aa 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -23,7 +23,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -385,14 +384,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -#ifndef NDEBUG - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag, const BitVector &BV) const { - DEBUG(dbgs() << tag << " : { "); + dbgs() << tag << " : { "; for (unsigned I = 0, E = BV.size(); I != E; ++I) - DEBUG(dbgs() << BV.test(I) << " "); - DEBUG(dbgs() << "}\n"); + dbgs() << BV.test(I) << " "; + dbgs() << "}\n"; } LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { @@ -408,20 +406,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { LLVM_DUMP_METHOD void StackColoring::dump() const { for (MachineBasicBlock *MBB : depth_first(MF)) { - DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " [" - << MBB->getName() << "]\n"); - DEBUG(dumpBB(MBB)); + dbgs() << "Inspecting block #" << MBB->getNumber() << " [" + << MBB->getName() << "]\n"; + dumpBB(MBB); } } LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const { for (unsigned I = 0, E = Intervals.size(); I != E; ++I) { - DEBUG(dbgs() << "Interval[" << I << "]:\n"); - DEBUG(Intervals[I]->dump()); + dbgs() << "Interval[" << I << "]:\n"; + Intervals[I]->dump(); } } - -#endif // not NDEBUG +#endif static inline int getStartOrEndSlot(const MachineInstr &MI) { @@ -570,9 +567,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) // Step 2: compute begin/end sets for each block - // NOTE: We use a reverse-post-order iteration to ensure that we obtain a - // deterministic numbering, and because we'll need a post-order iteration - // later for solving the liveness dataflow problem. + // NOTE: We use a depth-first iteration to ensure that we obtain a + // deterministic numbering. for (MachineBasicBlock *MBB : depth_first(MF)) { // Assign a serial number to this basic block. 
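A minimal stand-alone version (illustration only, not from this commit) of the greedy heuristic the new SplitEditor::buildCopy above uses to cover a partial lane mask with a sequence of sub-register COPYs: each step prefers an exact match for the still-missing lanes, otherwise the candidate that adds the most missing lanes while re-copying the fewest already-covered ones. Plain 64-bit masks stand in for LaneBitmask; the candidate masks and helper names are made up for the example.

#include <bitset>
#include <cassert>
#include <climits>
#include <cstdint>
#include <vector>

static int popcount(uint64_t V) { return (int)std::bitset<64>(V).count(); }

// Greedy lane-mask covering, mirroring the scoring used by buildCopy.
static std::vector<uint64_t> coverLanes(uint64_t LaneMask,
                                        const std::vector<uint64_t> &Cands) {
  std::vector<uint64_t> Picked;
  uint64_t LanesLeft = LaneMask;
  while (LanesLeft != 0) {
    uint64_t Best = 0;
    int BestScore = INT_MIN;
    for (uint64_t M : Cands) {
      if ((M & ~LaneMask) != 0)
        continue;                 // may not touch lanes outside the request
      if (M == LanesLeft) {       // exact match for what is still missing
        Best = M;
        break;
      }
      // Favor new coverage, penalize lanes that would be copied twice.
      int Score = popcount(M & LanesLeft) - popcount(M & ~LanesLeft);
      if (Score > BestScore) {
        BestScore = Score;
        Best = M;
      }
    }
    assert(Best != 0 && "no candidate covers the remaining lanes");
    if (Best == 0)
      break;                      // the real code reports a fatal error here
    Picked.push_back(Best);
    LanesLeft &= ~Best;
  }
  return Picked;
}

int main() {
  // Ask for lanes 0-3 given candidates {0,1}, {2}, {3}, {2,3}.
  std::vector<uint64_t> Picks = coverLanes(0xF, {0x3, 0x4, 0x8, 0xC});
  assert(Picks.size() == 2);      // expect 0x3 followed by 0xC
  (void)Picks;
  return 0;
}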
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp index 9b7dd400fc92e..1a8ec5bff3229 100644 --- a/lib/CodeGen/StackMaps.cpp +++ b/lib/CodeGen/StackMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- StackMaps.cpp ---------------------------===// +//===- StackMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,23 +7,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackMaps.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> #include <iterator> +#include <utility> using namespace llvm; @@ -276,7 +287,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const { } LiveOuts.erase( - remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }), + llvm::remove_if(LiveOuts, + [](const LiveOutReg &LO) { return LO.Reg == 0; }), LiveOuts.end()); return LiveOuts; @@ -286,7 +298,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID, MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, bool recordResult) { - MCContext &OutContext = AP.OutStreamer->getContext(); MCSymbol *MILabel = OutContext.createTempSymbol(); AP.OutStreamer->EmitLabel(MILabel); @@ -378,6 +389,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) { } #endif } + void StackMaps::recordStatepoint(const MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint"); diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index c2c010a29d44b..a8aafe78748dc 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -1,4 +1,4 @@ -//===-- StackProtector.cpp - Stack Protector Insertion --------------------===// +//===- StackProtector.cpp - Stack Protector Insertion ---------------------===// // // The LLVM Compiler Infrastructure // @@ -14,30 +14,38 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" #include 
"llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <cstdlib> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "stack-protector" @@ -51,7 +59,7 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp", char StackProtector::ID = 0; INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors", - false, true) + false, true) FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) { return new StackProtector(TM); @@ -222,7 +230,16 @@ bool StackProtector::RequiresStackProtector() { if (F->hasFnAttribute(Attribute::SafeStack)) return false; + // We are constructing the OptimizationRemarkEmitter on the fly rather than + // using the analysis pass to avoid building DominatorTree and LoopInfo which + // are not available this late in the IR pipeline. + OptimizationRemarkEmitter ORE(F); + if (F->hasFnAttribute(Attribute::StackProtectReq)) { + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a function attribute or command-line switch"); NeedsProtector = true; Strong = true; // Use the same heuristic as strong to determine SSPLayout } else if (F->hasFnAttribute(Attribute::StackProtectStrong)) @@ -236,20 +253,29 @@ bool StackProtector::RequiresStackProtector() { for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { if (AI->isArrayAllocation()) { + OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray", + &I); + Remark + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a call to alloca or use of a variable length array"; if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) { // A call to alloca with size >= SSPBufferSize requires // stack protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } else if (Strong) { // Require protectors for all alloca calls in strong mode. Layout.insert(std::make_pair(AI, SSPLK_SmallArray)); + ORE.emit(Remark); NeedsProtector = true; } } else { // A call to alloca with a variable size requires protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } continue; @@ -259,6 +285,11 @@ bool StackProtector::RequiresStackProtector() { if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) { Layout.insert(std::make_pair(AI, IsLarge ? 
SSPLK_LargeArray : SSPLK_SmallArray)); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a stack allocated buffer or struct containing a " + "buffer"); NeedsProtector = true; continue; } @@ -266,6 +297,11 @@ bool StackProtector::RequiresStackProtector() { if (Strong && HasAddressTaken(AI)) { ++NumAddrTaken; Layout.insert(std::make_pair(AI, SSPLK_AddrOf)); + ORE.emit( + OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to the address of a local variable being taken"); NeedsProtector = true; } } @@ -448,13 +484,13 @@ BasicBlock *StackProtector::CreateFailBB() { Constant *StackChkFail = M->getOrInsertFunction("__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context), nullptr); + Type::getInt8PtrTy(Context)); B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { Constant *StackChkFail = - M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context), - nullptr); + M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); + B.CreateCall(StackChkFail, {}); } B.CreateUnreachable(); diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp index 7709236bbaa83..d2414200e9d57 100644 --- a/lib/CodeGen/TailDuplicator.cpp +++ b/lib/CodeGen/TailDuplicator.cpp @@ -725,6 +725,7 @@ bool TailDuplicator::duplicateSimpleBB( if (PredTBB == NextBB && PredFBB == nullptr) PredTBB = nullptr; + auto DL = PredBB->findBranchDebugLoc(); TII->removeBranch(*PredBB); if (!PredBB->isSuccessor(NewTarget)) @@ -735,7 +736,7 @@ bool TailDuplicator::duplicateSimpleBB( } if (PredTBB) - TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); + TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL); TDBBs.push_back(PredBB); } diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index f082add8c7dd5..e5def6752e071 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -73,7 +73,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, return; // Get the callee saved register list... - const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); // Early exit if there are no callee saved registers. 
if (!CSRegs || CSRegs[0] == 0) diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 01f91b96b58a7..711144a347430 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, // No need to fold return, the meta data, and function arguments for (unsigned i = 0; i < StartIdx; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) { MachineOperand &MO = MI.getOperand(i); @@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, MIB.addImm(SpillOffset); } else - MIB.addOperand(MO); + MIB.add(MO); } return NewMI; } @@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const { unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); - if (MI.getOpcode() != FrameSetupOpcode && - MI.getOpcode() != FrameDestroyOpcode) + if (!isFrameInstr(MI)) return 0; - int SPAdj = MI.getOperand(0).getImm(); - SPAdj = TFI->alignSPAdjust(SPAdj); + int SPAdj = TFI->alignSPAdjust(getFrameSize(MI)); if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) || (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode)) diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 003311b157fc2..27630a3055cb3 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { HasExtractBitsInsn = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; - MaskAndBranchFoldingIsLegal = false; EnableExtLdPromotion = false; HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; @@ -851,7 +850,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinFunctionAlignment = 0; PrefFunctionAlignment = 0; PrefLoopAlignment = 0; - GatherAllAliasesMaxDepth = 6; + GatherAllAliasesMaxDepth = 18; MinStackArgumentAlignment = 1; // TODO: the default will be switched to 0 in the next commit, along // with the Target-specific changes necessary. @@ -901,6 +900,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMAX, VT, Expand); setOperationAction(ISD::UMIN, VT, Expand); setOperationAction(ISD::UMAX, VT, Expand); + setOperationAction(ISD::ABS, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); @@ -1227,7 +1227,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // Copy operands before the frame-index. for (unsigned i = 0; i < OperIdx; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Add frame index operands recognized by stackmaps.cpp if (MFI.isStatepointSpillSlotObjectIndex(FI)) { // indirect-mem-ref tag, size, #FI, offset. @@ -1237,18 +1237,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity"); MIB.addImm(StackMaps::IndirectMemRefOp); MIB.addImm(MFI.getObjectSize(FI)); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } else { // direct-mem-ref tag, #FI, offset. 
// Used by patchpoint, and direct alloca arguments to statepoints MIB.addImm(StackMaps::DirectMemRefOp); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } // Copy the operands after the frame index. for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Inherit previous memory operands. MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -1589,7 +1589,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT /// type of the given function. This does not require a DAG or a return value, /// and is suitable for use before any DAGs for the function are constructed. /// TODO: Move this out of TargetLowering.cpp. -void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, +void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL) { SmallVector<EVT, 4> ValueVTs; @@ -1601,9 +1601,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, EVT VT = ValueVTs[j]; ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; // FIXME: C calling convention requires the return type to be promoted to @@ -1621,13 +1621,13 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg)) Flags.setInReg(); // Propagate extension type if any - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) Flags.setSExt(); - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) @@ -1818,7 +1818,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Type *StackPtrTy = Type::getInt8PtrTy(M->getContext()); Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", - StackPtrTy->getPointerTo(0), nullptr); + StackPtrTy->getPointerTo(0)); return IRB.CreateCall(Fn); } @@ -1918,11 +1918,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) { /// override the target defaults. static StringRef getRecipEstimateForFunc(MachineFunction &MF) { const Function *F = MF.getFunction(); - StringRef RecipAttrName = "reciprocal-estimates"; - if (!F->hasFnAttribute(RecipAttrName)) - return StringRef(); - - return F->getFnAttribute(RecipAttrName).getValueAsString(); + return F->getFnAttribute("reciprocal-estimates").getValueAsString(); } /// Construct a string for the given reciprocal operation of the given type. 
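The StackProtector and safe-stack hunks above drop the trailing nullptr from Module::getOrInsertFunction: the old C-style variadic overload needed a null sentinel to terminate the parameter-type list, while the replacement overload takes the return type followed by the parameter types directly, so the sentinel simply goes away. A minimal sketch of the new call form, assuming a tree with this change applied; emitStackChkFailCall is a hypothetical helper written only for illustration, not something added by the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Declare (or look up) void __stack_chk_fail() and emit a call to it.
static CallInst *emitStackChkFailCall(Module &M, IRBuilder<> &B) {
  LLVMContext &Ctx = M.getContext();
  // Old form: M.getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Ctx), nullptr);
  Constant *StackChkFail =
      M.getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Ctx));
  return B.CreateCall(StackChkFail, {});
}

The sentinel was only a terminator for the old varargs signature; removing it does not change the declared function type.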
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index eb2a28f574a54..34892680aceb3 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===// +//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===// // // The LLVM Compiler Infrastructure // @@ -12,36 +12,52 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <string> + using namespace llvm; using namespace dwarf; @@ -53,10 +69,10 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( const GlobalValue *GV, const TargetMachine &TM, MachineModuleInfo *MMI) const { unsigned Encoding = getPersonalityEncoding(); - if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect) + if ((Encoding & 0x80) == DW_EH_PE_indirect) return getContext().getOrCreateSymbol(StringRef("DW.ref.") + TM.getSymbol(GV)->getName()); - if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr) + if ((Encoding & 0x70) == DW_EH_PE_absptr) return TM.getSymbol(GV); report_fatal_error("We do not support this DWARF encoding yet!"); } @@ -86,8 +102,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue( const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { - - if (Encoding & dwarf::DW_EH_PE_indirect) { + if (Encoding & DW_EH_PE_indirect) { MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>(); MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM); @@ -102,7 +117,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & 
~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -117,8 +132,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // section(".eh_frame") gcc will produce: // // .section .eh_frame,"a",@progbits - - if (Name == getInstrProfCoverageSectionName(false)) + + if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF, + /*AddSegmentInfo=*/false)) return SectionKind::getMetadata(); if (Name.empty() || Name[0] != '.') return K; @@ -149,7 +165,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { return K; } - static unsigned getELFSectionType(StringRef Name, SectionKind K) { // Use SHT_NOTE for section whose name starts with ".note" to allow // emitting ELF notes from C variable declaration. @@ -211,6 +226,20 @@ static const Comdat *getELFComdat(const GlobalValue *GV) { return C; } +static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, + const TargetMachine &TM) { + MDNode *MD = GO->getMetadata(LLVMContext::MD_associated); + if (!MD) + return nullptr; + + auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0)); + if (!VM) + report_fatal_error("MD_associated operand is not ValueAsMetadata"); + + GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue()); + return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr; +} + MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); @@ -224,9 +253,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( Group = C->getName(); Flags |= ELF::SHF_GROUP; } - return getContext().getELFSection(SectionName, - getELFSectionType(SectionName, Kind), Flags, - /*EntrySize=*/0, Group); + + // A section can have at most one associated section. Put each global with + // MD_associated in a unique section. + unsigned UniqueID = MCContext::GenericSectionID; + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + UniqueID = NextUniqueID++; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = getContext().getELFSection( + SectionName, getELFSectionType(SectionName, Kind), Flags, + /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); + // Make sure that we did not get some other section with incompatible sh_link. + // This should not be possible due to UniqueID code above. 
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } /// Return the section prefix name used by options FunctionsSections and @@ -248,11 +291,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } -static MCSectionELF * -selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, - SectionKind Kind, Mangler &Mang, - const TargetMachine &TM, bool EmitUniqueSection, - unsigned Flags, unsigned *NextUniqueID) { +static MCSectionELF *selectELFSectionForGlobal( + MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, + unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) { unsigned EntrySize = 0; if (Kind.isMergeableCString()) { if (Kind.isMergeable2ByteCString()) { @@ -319,7 +361,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, if (Kind.isExecuteOnly()) UniqueID = 0; return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags, - EntrySize, Group, UniqueID); + EntrySize, Group, UniqueID, AssociatedSymbol); } MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( @@ -337,8 +379,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( } EmitUniqueSection |= GO->hasComdat(); - return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, - EmitUniqueSection, Flags, &NextUniqueID); + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + EmitUniqueSection = true; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = selectELFSectionForGlobal( + getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags, + &NextUniqueID, AssociatedSymbol); + assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( @@ -351,8 +402,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( return ReadOnlySection; return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(), - getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC, - &NextUniqueID); + getMangler(), TM, EmitUniqueSection, + ELF::SHF_ALLOC, &NextUniqueID, + /* AssociatedSymbol */ nullptr); } bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( @@ -723,7 +775,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -1122,33 +1174,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal( raw_ostream &OS, const GlobalValue *GV) const { - if (!GV->hasDLLExportStorageClass() || GV->isDeclaration()) - return; + emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler()); +} - const Triple &TT = getTargetTriple(); +//===----------------------------------------------------------------------===// +// Wasm +//===----------------------------------------------------------------------===// - if (TT.isKnownWindowsMSVCEnvironment()) - OS << " /EXPORT:"; - else - OS << " -export:"; - - if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) { - std::string Flag; - raw_string_ostream FlagOS(Flag); - getMangler().getNameWithPrefix(FlagOS, GV, false); - FlagOS.flush(); - if (Flag[0] == 
GV->getParent()->getDataLayout().getGlobalPrefix()) - OS << Flag.substr(1); - else - OS << Flag; - } else { - getMangler().getNameWithPrefix(OS, GV, false); +static const Comdat *getWasmComdat(const GlobalValue *GV) { + const Comdat *C = GV->getComdat(); + if (!C) + return nullptr; + + if (C->getSelectionKind() != Comdat::Any) + report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" + + C->getName() + "' cannot be lowered."); + + return C; +} + +MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + llvm_unreachable("getExplicitSectionGlobal not yet implemented"); + return nullptr; +} + +static MCSectionWasm * +selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, + unsigned Flags, unsigned *NextUniqueID) { + StringRef Group = ""; + if (getWasmComdat(GO)) + llvm_unreachable("comdat not yet supported for wasm"); + + bool UniqueSectionNames = TM.getUniqueSectionNames(); + SmallString<128> Name = getSectionPrefixForGlobal(Kind); + + if (const auto *F = dyn_cast<Function>(GO)) { + const auto &OptionalPrefix = F->getSectionPrefix(); + if (OptionalPrefix) + Name += *OptionalPrefix; } - if (!GV->getValueType()->isFunctionTy()) { - if (TT.isKnownWindowsMSVCEnvironment()) - OS << ",DATA"; - else - OS << ",data"; + if (EmitUniqueSection && UniqueSectionNames) { + Name.push_back('.'); + TM.getNameWithPrefix(Name, GO, Mang, true); + } + unsigned UniqueID = MCContext::GenericSectionID; + if (EmitUniqueSection && !UniqueSectionNames) { + UniqueID = *NextUniqueID; + (*NextUniqueID)++; } + return Ctx.getWasmSection(Name, /*Type=*/0, Flags, + Group, UniqueID); +} + +MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + + if (Kind.isCommon()) + report_fatal_error("mergable sections not supported yet on wasm"); + + // If we have -ffunction-section or -fdata-section then we should emit the + // global value to a uniqued section specifically for it. + bool EmitUniqueSection = false; + if (Kind.isText()) + EmitUniqueSection = TM.getFunctionSections(); + else + EmitUniqueSection = TM.getDataSections(); + EmitUniqueSection |= GO->hasComdat(); + + return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, + EmitUniqueSection, /*Flags=*/0, + &NextUniqueID); +} + +bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + // We can always create relative relocations, so use another section + // that can be marked non-executable. + return false; +} + +const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( + const GlobalValue *LHS, const GlobalValue *RHS, + const TargetMachine &TM) const { + // We may only use a PLT-relative relocation to refer to unnamed_addr + // functions. + if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) + return nullptr; + + // Basic sanity checks. 
+ if (LHS->getType()->getPointerAddressSpace() != 0 || + RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || + RHS->isThreadLocal()) + return nullptr; + + return MCBinaryExpr::createSub( + MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None, + getContext()), + MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext()); +} + +void +TargetLoweringObjectFileWasm::InitializeWasm() { + // TODO: Initialize StaticCtorSection and StaticDtorSection. } diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp index b6da8e0aa60db..c20d5ab814f82 100644 --- a/lib/CodeGen/TargetOptionsImpl.cpp +++ b/lib/CodeGen/TargetOptionsImpl.cpp @@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { return false; } -/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option -/// is specified on the command line. When this flag is off(default), the -/// code generator is not allowed to generate mad (multiply add) if the -/// result is "less precise" than doing those operations individually. -bool TargetOptions::LessPreciseFPMAD() const { - return UnsafeFPMath || LessPreciseFPMADOption; -} - /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. bool TargetOptions::HonorSignDependentRoundingFPMath() const { diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index e7ea2b4563f96..150195f5f85bc 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -92,6 +92,9 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), cl::init(false), cl::ZeroOrMore); +static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner", + cl::Hidden, + cl::desc("Enable machine outliner")); static cl::opt<std::string> PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, @@ -261,7 +264,8 @@ TargetPassConfig::~TargetPassConfig() { TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true) { + DisableVerify(false), EnableTailMerge(true), + RequireCodeGenSCCOrder(false) { Impl = new PassConfigImpl(); @@ -279,6 +283,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) if (StringRef(PrintMachineInstrs.getValue()).equals("")) TM->Options.PrintMachineCode = true; + + if (TM->Options.EnableIPRA) + setRequiresCodeGenSCCOrder(); } CodeGenOpt::Level TargetPassConfig::getOptLevel() const { @@ -531,7 +538,7 @@ void TargetPassConfig::addISelPrepare() { addPreISel(); // Force codegen to run according to the callgraph. - if (TM->Options.EnableIPRA) + if (requiresCodeGenSCCOrder()) addPass(new DummyCGSCCPass); // Add both the safe stack and the stack protection passes: each of them will @@ -668,9 +675,15 @@ void TargetPassConfig::addMachinePasses() { addPass(&StackMapLivenessID, false); addPass(&LiveDebugValuesID, false); + // Insert before XRay Instrumentation. 
+ addPass(&FEntryInserterID, false); + addPass(&XRayInstrumentationID, false); addPass(&PatchableFunctionID, false); + if (EnableMachineOutliner) + PM->add(createMachineOutlinerPass()); + AddingMachinePasses = false; } @@ -704,6 +717,10 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&MachineLICMID, false); addPass(&MachineCSEID, false); + + // Coalesce basic blocks with the same branch condition + addPass(&BranchCoalescingID); + addPass(&MachineSinkingID); addPass(&PeepholeOptimizerID); @@ -730,7 +747,7 @@ MachinePassRegistry RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. -LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag); +static llvm::once_flag InitializeDefaultRegisterAllocatorFlag; static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } static RegisterRegAlloc defaultRegAlloc("default", @@ -903,6 +920,11 @@ void TargetPassConfig::addBlockPlacement() { //===---------------------------------------------------------------------===// /// GlobalISel Configuration //===---------------------------------------------------------------------===// + +bool TargetPassConfig::isGlobalISelEnabled() const { + return false; +} + bool TargetPassConfig::isGlobalISelAbortEnabled() const { return EnableGlobalISelAbort == 1; } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index cd50c5b6571dc..66cdad278e8da 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -155,8 +155,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { // Pick the most sub register class of the right type that contains // this physreg. const TargetRegisterClass* BestRC = nullptr; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){ - const TargetRegisterClass* RC = *I; + for (const TargetRegisterClass* RC : regclasses()) { if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; @@ -185,10 +184,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, if (SubClass) getAllocatableSetForRC(MF, SubClass, Allocatable); } else { - for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), - E = regclass_end(); I != E; ++I) - if ((*I)->isAllocatable()) - getAllocatableSetForRC(MF, *I, Allocatable); + for (const TargetRegisterClass *C : regclasses()) + if (C->isAllocatable()) + getAllocatableSetForRC(MF, C, Allocatable); } // Mask out the reserved registers @@ -415,9 +413,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void -TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, - const TargetRegisterInfo *TRI) { +LLVM_DUMP_METHOD +void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, + const TargetRegisterInfo *TRI) { dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n"; } #endif diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 83e52d3353548..0df34ce43112a 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -1,4 +1,4 @@ -//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===// +//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +12,22 @@ // //===----------------------------------------------------------------------===// 
+#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; @@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const { static unsigned gcd(unsigned Dividend, unsigned Divisor) { // Dividend and Divisor will be naturally swapped as needed. - while(Divisor) { + while (Divisor) { unsigned Rem = Dividend % Divisor; Dividend = Divisor; Divisor = Rem; }; return Dividend; } + static unsigned lcm(unsigned A, unsigned B) { unsigned LCM = (uint64_t(A) * B) / gcd(A, B); assert((LCM >= A && LCM >= B) && "LCM overflow"); @@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm, } } +/// Returns true only if instruction is specified as single issue. +bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->BeginGroup; + } + return false; +} + +bool TargetSchedModel::mustEndGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->EndGroup; + } + return false; +} + unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC) const { if (hasInstrItineraries()) { @@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) { /// evaluation of predicates that depend on instruction operands or flags. const MCSchedClassDesc *TargetSchedModel:: resolveSchedClass(const MachineInstr *MI) const { - // Get the definition's scheduling class descriptor from this machine model. unsigned SchedClass = MI->getDesc().getSchedClass(); const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); @@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { if (SCDesc->isValid() && !SCDesc->isVariant()) return computeInstrLatency(*SCDesc); - llvm_unreachable("No MI sched latency"); + if (SCDesc->isValid()) { + assert (!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()"); + return computeInstrLatency(*SCDesc); + } + return 0; } unsigned @@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, } return 0; } + +static Optional<double> +getRTroughputFromItineraries(unsigned schedClass, + const InstrItineraryData *IID){ + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const InstrStage *IS = IID->beginStage(schedClass), + *E = IID->endStage(schedClass); + IS != E; ++IS) { + unsigned Cycles = IS->getCycles(); + if (!Cycles) + continue; + Throughput = + std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. 
+ return 1 / Throughput; +} + +static Optional<double> +getRTroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc, + const TargetSubtargetInfo *STI, + const MCSchedModel &SchedModel) { + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc), + *WEnd = STI->getWriteProcResEnd(SCDesc); + WPR != WEnd; ++WPR) { + unsigned Cycles = WPR->Cycles; + if (!Cycles) + return Optional<double>(); + + unsigned NumUnits = + SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits; + Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. + return 1 / Throughput; +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const { + if (hasInstrItineraries()) + return getRTroughputFromItineraries(MI->getDesc().getSchedClass(), + getInstrItineraries()); + if (hasInstrSchedModel()) + return getRTroughputFromInstrSchedModel(resolveSchedClass(MI), STI, + SchedModel); + return Optional<double>(); +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const { + unsigned SchedClass = TII->get(Opcode).getSchedClass(); + if (hasInstrItineraries()) + return getRTroughputFromItineraries(SchedClass, getInstrItineraries()); + if (hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); + if (SCDesc->isValid() && !SCDesc->isVariant()) + return getRTroughputFromInstrSchedModel(SCDesc, STI, SchedModel); + } + return Optional<double>(); +} diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp index c74707d95b9e7..0a444e0fff07e 100644 --- a/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/lib/CodeGen/TargetSubtargetInfo.cpp @@ -11,6 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -52,3 +55,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const { bool TargetSubtargetInfo::useAA() const { return false; } + +static std::string createSchedInfoStr(unsigned Latency, + Optional<double> RThroughput) { + static const char *SchedPrefix = " sched: ["; + std::string Comment; + raw_string_ostream CS(Comment); + if (Latency > 0 && RThroughput.hasValue()) + CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue()) + << "]"; + else if (Latency > 0) + CS << SchedPrefix << Latency << ":?]"; + else if (RThroughput.hasValue()) + CS << SchedPrefix << "?:" << RThroughput.getValue() << "]"; + CS.flush(); + return Comment; +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { + if (MI.isPseudo() || MI.isTerminator()) + return std::string(); + // We don't cache TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI); + return createSchedInfoStr(Latency, RThroughput); +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { + // We don't cache 
TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + if (!TSchedModel.hasInstrSchedModel()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); + Optional<double> RThroughput = + TSchedModel.computeInstrRThroughput(MCI.getOpcode()); + return createSchedInfoStr(Latency, RThroughput); +} diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 0f1b2ed994b7e..75359fe3c0ea6 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -905,7 +905,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, ++End; } - // Check if the reschedule will not break depedencies. + // Check if the reschedule will not break dependencies. unsigned NumVisited = 0; MachineBasicBlock::iterator KillPos = KillMI; ++KillPos; @@ -1785,7 +1785,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY)) .addReg(DstReg, RegState::Define, SubIdx) - .addOperand(UseMO); + .add(UseMO); // The first def needs an <undef> flag because there is no live register // before it. diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 0d506d6466598..c8946010e9d15 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -167,6 +167,7 @@ class VirtRegRewriter : public MachineFunctionPass { bool readsUndefSubreg(const MachineOperand &MO) const; void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; void handleIdentityCopy(MachineInstr &MI) const; + void expandCopyBundle(MachineInstr &MI) const; public: static char ID; @@ -367,11 +368,41 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const { } if (Indexes) - Indexes->removeMachineInstrFromMaps(MI); - MI.eraseFromParent(); + Indexes->removeSingleMachineInstrFromMaps(MI); + MI.eraseFromBundle(); DEBUG(dbgs() << " deleted.\n"); } +/// The liverange splitting logic sometimes produces bundles of copies when +/// subregisters are involved. Expand these into a sequence of copy instructions +/// after processing the last in the bundle. Does not update LiveIntervals +/// which we shouldn't need for this instruction anymore. +void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const { + if (!MI.isCopy()) + return; + + if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) { + // Only do this when the complete bundle is made out of COPYs. + MachineBasicBlock &MBB = *MI.getParent(); + for (MachineBasicBlock::reverse_instr_iterator I = + std::next(MI.getReverseIterator()), E = MBB.instr_rend(); + I != E && I->isBundledWithSucc(); ++I) { + if (!I->isCopy()) + return; + } + + for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator(); + I->isBundledWithPred(); ) { + MachineInstr &MI = *I; + ++I; + + MI.unbundleFromPred(); + if (Indexes) + Indexes->insertMachineInstrInMaps(MI); + } + } +} + void VirtRegRewriter::rewrite() { bool NoSubRegLiveness = !MRI->subRegLivenessEnabled(); SmallVector<unsigned, 8> SuperDeads; @@ -431,12 +462,14 @@ void VirtRegRewriter::rewrite() { } } - // The <def,undef> flag only makes sense for sub-register defs, and - // we are substituting a full physreg. An <imp-use,kill> operand - // from the SuperKills list will represent the partial read of the - // super-register. 
- if (MO.isDef()) + // The <def,undef> and <def,internal> flags only make sense for + // sub-register defs, and we are substituting a full physreg. An + // <imp-use,kill> operand from the SuperKills list will represent the + // partial read of the super-register. + if (MO.isDef()) { MO.setIsUndef(false); + MO.setIsInternalRead(false); + } // PhysReg operands cannot have subregister indexes. PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -461,6 +494,8 @@ void VirtRegRewriter::rewrite() { DEBUG(dbgs() << "> " << *MI); + expandCopyBundle(*MI); + // We can remove identity copies right now. handleIdentityCopy(*MI); } diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index 568720c66e551..ae07e8b2fa032 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -86,6 +86,7 @@ private: // All fields are reset by runOnFunction. EHPersonality Personality = EHPersonality::Unknown; + const DataLayout *DL = nullptr; DenseMap<BasicBlock *, ColorVector> BlockColors; MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks; }; @@ -111,6 +112,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) { if (!isFuncletEHPersonality(Personality)) return false; + DL = &Fn.getParent()->getDataLayout(); return prepareExplicitEH(Fn); } @@ -1070,7 +1072,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) { if (!isa<TerminatorInst>(EHPad)) { // If the EHPad isn't a terminator, then we can insert a load in this block // that will dominate all uses. - SpillSlot = new AllocaInst(PN->getType(), nullptr, + SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(PN->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), @@ -1157,7 +1159,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot, Function &F) { // Lazilly create the spill slot. if (!SpillSlot) - SpillSlot = new AllocaInst(V->getType(), nullptr, + SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(V->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 63bd762eeb2b8..7d2848bdc13b1 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -81,7 +81,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc)) .addImm(T.getOpcode()); for (auto &MO : T.operands()) - MIB.addOperand(MO); + MIB.add(MO); Terminators.push_back(&T); } } @@ -157,6 +157,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::ppc64le: + case Triple::ArchType::mips: + case Triple::ArchType::mipsel: + case Triple::ArchType::mips64: + case Triple::ArchType::mips64el: // For the architectures which don't have a single return instruction prependRetWithPatchableExit(MF, TII); break; |
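Several hunks above (foldPatchpoint, emitPatchPoint, eliminateRegSequence, replaceRetWithPatchableRet) switch from MachineInstrBuilder::addOperand to the shorter MachineInstrBuilder::add, which forwards an existing MachineOperand, flags and all, onto the instruction being built. A minimal sketch of that pattern under the new name, assuming headers from the same era as this patch; cloneWithNewOpcode is a hypothetical helper used only to show the call, not part of the patch:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

// Build a new instruction with opcode NewOpc in front of Old, copying all of
// Old's operands (registers, immediates, and their flags) onto it.
static MachineInstr *cloneWithNewOpcode(MachineBasicBlock &MBB,
                                        MachineInstr &Old,
                                        const TargetInstrInfo &TII,
                                        unsigned NewOpc) {
  MachineInstrBuilder MIB =
      BuildMI(MBB, Old, Old.getDebugLoc(), TII.get(NewOpc));
  for (const MachineOperand &MO : Old.operands())
    MIB.add(MO); // was MIB.addOperand(MO) before the rename
  return MIB;
}

Because add() takes the operand by const reference and preserves its register flags, the loop above behaves the same as the old addOperand-based version; only the spelling changes.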