Diffstat (limited to 'contrib/llvm/lib/CodeGen')
192 files changed, 18432 insertions, 8849 deletions
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index bb908618b679..43b245c66400 100644 --- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -128,8 +128,7 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker( } DEBUG(dbgs() << "AntiDep Critical-Path Registers:"); - DEBUG(for (int r = CriticalPathSet.find_first(); r != -1; - r = CriticalPathSet.find_next(r)) + DEBUG(for (unsigned r : CriticalPathSet.set_bits()) dbgs() << " " << TRI->getName(r)); DEBUG(dbgs() << '\n'); } @@ -163,9 +162,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo &MFI = MF.getFrameInfo(); BitVector Pristine = MFI.getPristineRegs(MF); - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { + for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; + ++I) { unsigned Reg = *I; - if (!IsReturnBlock && !Pristine.test(Reg)) continue; + if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; State->UnionGroups(AliasReg, 0); @@ -569,7 +570,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( DEBUG({ dbgs() << " ::"; - for (int r = BV.find_first(); r != -1; r = BV.find_next(r)) + for (unsigned r : BV.set_bits()) dbgs() << " " << TRI->getName(r); dbgs() << "\n"; }); @@ -962,10 +963,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // sure to update that as well. const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()]; if (!SU) continue; - for (DbgValueVector::iterator DVI = DbgValues.begin(), - DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q.second.Operand->getParent()) - UpdateDbgValue(*DVI->first, AntiDepReg, NewReg); + UpdateDbgValues(DbgValues, Q.second.Operand->getParent(), + AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp index 79ecc4308fe7..09a37a77e9fb 100644 --- a/contrib/llvm/lib/CodeGen/Analysis.cpp +++ b/contrib/llvm/lib/CodeGen/Analysis.cpp @@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I, bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS; ADS = true; - AttrBuilder CallerAttrs(F->getAttributes(), - AttributeSet::ReturnIndex); + AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex); AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(), - AttributeSet::ReturnIndex); + AttributeList::ReturnIndex); // Noalias is completely benign as far as calling convention goes, it // shouldn't affect whether the call is a tail call. @@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, return true; } -bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) { - if (!GV->hasLinkOnceODRLinkage()) - return false; - - // We assume that anyone who sets global unnamed_addr on a non-constant knows - // what they're doing. - if (GV->hasGlobalUnnamedAddr()) - return true; - - // If it is a non constant variable, it needs to be uniqued across shared - // objects. 
- if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) { - if (!Var->isConstant()) - return false; - } - - return GV->hasAtLeastLocalUnnamedAddr(); -} - static void collectFuncletMembers( DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet, const MachineBasicBlock *MBB) { diff --git a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h index 04f7f419f5ea..d14d93100adb 100644 --- a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h +++ b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h @@ -60,6 +60,25 @@ public: if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg) MI.getOperand(0).setReg(NewReg); } + + /// Update all DBG_VALUE instructions that may be affected by the dependency + /// breaker's update of ParentMI to use NewReg. + void UpdateDbgValues(const DbgValueVector &DbgValues, MachineInstr *ParentMI, + unsigned OldReg, unsigned NewReg) { + // The following code is dependent on the order in which the DbgValues are + // constructed in ScheduleDAGInstrs::buildSchedGraph. + MachineInstr *PrevDbgMI = nullptr; + for (const auto &DV : make_range(DbgValues.crbegin(), DbgValues.crend())) { + MachineInstr *PrevMI = DV.second; + if ((PrevMI == ParentMI) || (PrevMI == PrevDbgMI)) { + MachineInstr *DbgMI = DV.first; + UpdateDbgValue(*DbgMI, OldReg, NewReg); + PrevDbgMI = DbgMI; + } else if (PrevDbgMI) { + break; // If no match and already found a DBG_VALUE, we're done. + } + } + } }; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 24fdbfc901fd..d72cf5922987 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,48 +11,102 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/AsmPrinter.h" +#include "AsmPrinterHandler.h" #include "CodeViewDebug.h" #include "DwarfDebug.h" #include "DwarfException.h" #include "WinException.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ObjectUtils.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include 
"llvm/IR/GlobalIFunc.h" +#include "llvm/IR/GlobalIndirectSymbol.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Value.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cinttypes> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription = STATISTIC(EmittedInsts, "Number of machine instrs printed"); +static cl::opt<bool> + PrintSchedule("print-schedule", cl::Hidden, cl::init(false), + cl::desc("Print 'sched: [latency:throughput]' in .s output")); + char AsmPrinter::ID = 0; typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type; @@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) { return *(gcp_map_type*)P; } - /// getGVAlignmentLog2 - Return the alignment to use for the specified global /// value in log2 form. This rounds up to the preferred alignment if possible /// and legal. 
@@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer) : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), - OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), - isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { - DD = nullptr; - MMI = nullptr; - LI = nullptr; - MF = nullptr; - CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr; - CurrentFnBegin = nullptr; - CurrentFnEnd = nullptr; - GCMetadataPrinters = nullptr; + OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) { VerboseAsm = OutStreamer->isVerboseAsm(); } @@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<MachineModuleInfo>(); + AU.addRequired<MachineOptimizationRemarkEmitterPass>(); AU.addRequired<GCModuleInfo>(); if (isVerbose()) AU.addRequired<MachineLoopInfo>(); @@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) { // don't, this at least helps the user find where a global came from. if (MAI->hasSingleParameterDotFile()) { // .file "foo.c" - OutStreamer->EmitFileDirective(M.getModuleIdentifier()); + OutStreamer->EmitFileDirective(M.getSourceFileName()); } GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); @@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { /// /// \p Value - The value to emit. /// \p Size - The size of the integer (in bytes) to emit. -void AsmPrinter::EmitDebugValue(const MCExpr *Value, +void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const { OutStreamer->EmitValue(Value, Size); } @@ -579,12 +628,15 @@ void AsmPrinter::EmitDebugValue(const MCExpr *Value, /// EmitFunctionHeader - This method emits the header for the current /// function. void AsmPrinter::EmitFunctionHeader() { + const Function *F = MF->getFunction(); + + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- Begin function " << F->getName() << '\n'; + // Print out constants referenced by the function EmitConstantPool(); // Print the 'header' of function. - const Function *F = MF->getFunction(); - OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(F, TM)); EmitVisibility(CurrentFnSym, F->getVisibility()); @@ -602,8 +654,23 @@ void AsmPrinter::EmitFunctionHeader() { } // Emit the prefix data. - if (F->hasPrefixData()) - EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + if (F->hasPrefixData()) { + if (MAI->hasSubsectionsViaSymbols()) { + // Preserving prefix data on platforms which use subsections-via-symbols + // is a bit tricky. Here we introduce a symbol for the prefix data + // and use the .alt_entry attribute to mark the function's real entry point + // as an alternative entry point to the prefix-data symbol. + MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol(); + OutStreamer->EmitLabel(PrefixSym); + + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + + // Emit an .alt_entry directive for the actual function symbol. + OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry); + } else { + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); + } + } // Emit the CurrentFnSym. This is a virtual function to allow targets to // do their wild and crazy things as required. 
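(Aside on the prefix-data hunk above: on targets that use subsections-via-symbols, such as Mach-O, every named symbol can begin a new atom that the linker may reorder or strip, so the prefix bytes cannot simply be placed before the function label. A condensed restatement of the emission sequence from the hunk, with the surrounding conditionals stripped; it assumes the enclosing AsmPrinter member context (OutContext, OutStreamer, CurrentFnSym, F):

// Give the prefix data its own linker-private label so it, rather than the
// function symbol, starts the atom.
MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
OutStreamer->EmitLabel(PrefixSym);

// Emit the prefix constant itself.
EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());

// Mark the real function symbol as an alternative entry point, tying it to
// the preceding atom so the linker keeps the prefix and the body together.
OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);

End of aside.)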
@@ -660,7 +727,8 @@ void AsmPrinter::EmitFunctionEntryLabel() { } /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS, + AsmPrinter *AP) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -668,6 +736,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { int FI; const MachineFrameInfo &MFI = MF->getFrameInfo(); + bool Commented = false; // We assume a single instruction only has a spill or reload, not // both. @@ -675,24 +744,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { if (TII->isLoadFromStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Reload\n"; + CommentOS << MMO->getSize() << "-byte Reload"; + Commented = true; } } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Reload\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Reload"; + Commented = true; + } } else if (TII->isStoreToStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Spill\n"; + CommentOS << MMO->getSize() << "-byte Spill"; + Commented = true; } } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Spill\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Spill"; + Commented = true; + } } // Check for spill-induced copies - if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) - CommentOS << " Reload Reuse\n"; + if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) { + Commented = true; + CommentOS << " Reload Reuse"; + } + + if (Commented && AP->EnablePrintSchedInfo) + // If any comment was added above and we need sched info comment then + // add this new comment just after the above comment w/o "\n" between them. + CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n"; + else if (Commented) + CommentOS << "\n"; } /// emitImplicitDef - This method emits the specified machine instruction @@ -739,46 +823,30 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { const DILocalVariable *V = MI->getDebugVariable(); if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) { - StringRef Name = SP->getDisplayName(); + StringRef Name = SP->getName(); if (!Name.empty()) OS << Name << ":"; } OS << V->getName(); - - const DIExpression *Expr = MI->getDebugExpression(); - auto Fragment = Expr->getFragmentInfo(); - if (Fragment) - OS << " [fragment offset=" << Fragment->OffsetInBits - << " size=" << Fragment->SizeInBits << "]"; OS << " <- "; // The second operand is only an offset if it's an immediate. - bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm(); - int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0; - - for (unsigned i = 0; i < Expr->getNumElements(); ++i) { - uint64_t Op = Expr->getElement(i); - if (Op == dwarf::DW_OP_LLVM_fragment) { - // There can't be any operands after this in a valid expression - break; - } else if (Deref) { - // We currently don't support extra Offsets or derefs after the first - // one. 
Bail out early instead of emitting an incorrect comment - OS << " [complex expression]"; - AP.OutStreamer->emitRawComment(OS.str()); - return true; - } else if (Op == dwarf::DW_OP_deref) { - Deref = true; - continue; - } - - uint64_t ExtraOffset = Expr->getElement(i++); - if (Op == dwarf::DW_OP_plus) - Offset += ExtraOffset; - else { - assert(Op == dwarf::DW_OP_minus); - Offset -= ExtraOffset; + bool MemLoc = MI->getOperand(0).isReg() && MI->getOperand(1).isImm(); + int64_t Offset = MemLoc ? MI->getOperand(1).getImm() : 0; + const DIExpression *Expr = MI->getDebugExpression(); + if (Expr->getNumElements()) { + OS << '['; + bool NeedSep = false; + for (auto Op : Expr->expr_ops()) { + if (NeedSep) + OS << ", "; + else + NeedSep = true; + OS << dwarf::OperationEncodingString(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + OS << ' ' << Op.getArg(I); } + OS << "] "; } // Register or immediate value. Register 0 means undef. @@ -809,7 +877,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering(); Offset += TFI->getFrameIndexReference(*AP.MF, MI->getOperand(0).getIndex(), Reg); - Deref = true; + MemLoc = true; } if (Reg == 0) { // Suppress offset, it is not meaningful here. @@ -818,12 +886,12 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { AP.OutStreamer->emitRawComment(OS.str()); return true; } - if (Deref) + if (MemLoc) OS << '['; OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo()); } - if (Deref) + if (MemLoc) OS << '+' << Offset << ']'; // NOTE: Want this comment at start of line, don't emit with AddComment. @@ -855,6 +923,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) { if (needsCFIMoves() == CFI_M_None) return; + // If there is no "real" instruction following this CFI instruction, skip + // emitting it; it would be beyond the end of the function's FDE range. + auto *MBB = MI.getParent(); + auto I = std::next(MI.getIterator()); + while (I != MBB->end() && I->isTransient()) + ++I; + if (I == MBB->instr_end() && + MBB->getReverseIterator() == MBB->getParent()->rbegin()) + return; + const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions(); unsigned CFIIndex = MI.getOperand(0).getCFIIndex(); const MCCFIInstruction &CFI = Instrs[CFIIndex]; @@ -883,6 +961,7 @@ void AsmPrinter::EmitFunctionBody() { // Print out code for the function. bool HasAnyRealCode = false; + int NumInstsInFunction = 0; for (auto &MBB : *MF) { // Print a label for the basic block. 
EmitBasicBlockStart(MBB); @@ -892,7 +971,7 @@ void AsmPrinter::EmitFunctionBody() { if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() && !MI.isDebugValue()) { HasAnyRealCode = true; - ++EmittedInsts; + ++NumInstsInFunction; } if (ShouldPrintDebugScopes) { @@ -905,7 +984,7 @@ void AsmPrinter::EmitFunctionBody() { } if (isVerbose()) - emitComments(MI, OutStreamer->GetCommentOS()); + emitComments(MI, OutStreamer->GetCommentOS(), this); switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: @@ -953,18 +1032,34 @@ void AsmPrinter::EmitFunctionBody() { EmitBasicBlockEnd(MBB); } + EmittedInsts += NumInstsInFunction; + MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount", + MF->getFunction()->getSubprogram(), + &MF->front()); + R << ore::NV("NumInstructions", NumInstsInFunction) + << " instructions in function"; + ORE->emit(R); + // If the function is empty and the object file uses .subsections_via_symbols, // then we need to emit *something* to the function body to prevent the // labels from collapsing together. Just emit a noop. - if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) { + // Similarly, don't emit empty functions on Windows either. It can lead to + // duplicate entries (two functions with the same RVA) in the Guard CF Table + // after linking, causing the kernel not to load the binary: + // https://developercommunity.visualstudio.com/content/problem/45366/vc-linker-creates-invalid-dll-with-clang-cl.html + // FIXME: Hide this behind some API in e.g. MCAsmInfo or MCTargetStreamer. + const Triple &TT = TM.getTargetTriple(); + if (!HasAnyRealCode && (MAI->hasSubsectionsViaSymbols() || + (TT.isOSWindows() && TT.isOSBinFormatCOFF()))) { MCInst Noop; - MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop); - OutStreamer->AddComment("avoids zero-length function"); + MF->getSubtarget().getInstrInfo()->getNoop(Noop); // Targets can opt-out of emitting the noop here by leaving the opcode // unspecified. - if (Noop.getOpcode()) + if (Noop.getOpcode()) { + OutStreamer->AddComment("avoids zero-length function"); OutStreamer->EmitInstruction(Noop, getSubtargetInfo()); + } } const Function *F = MF->getFunction(); @@ -1015,6 +1110,9 @@ void AsmPrinter::EmitFunctionBody() { HI.Handler->endFunction(MF); } + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- End function\n"; + OutStreamer->AddBlankLine(); } @@ -1238,7 +1336,7 @@ bool AsmPrinter::doFinalization(Module &M) { break; AliasStack.push_back(Cur); } - for (const GlobalAlias *AncestorAlias : reverse(AliasStack)) + for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack)) emitGlobalIndirectSymbol(M, *AncestorAlias); AliasStack.clear(); } @@ -1266,7 +1364,7 @@ bool AsmPrinter::doFinalization(Module &M) { OutContext.getOrCreateSymbol(StringRef("__morestack_addr")); OutStreamer->EmitLabel(AddrSymbol); - unsigned PtrSize = M.getDataLayout().getPointerSize(0); + unsigned PtrSize = MAI->getCodePointerSize(); OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"), PtrSize); } @@ -1311,19 +1409,28 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnSymForSize = CurrentFnBegin; } + ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); if (isVerbose()) LI = &getAnalysis<MachineLoopInfo>(); + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + EnablePrintSchedInfo = PrintSchedule.getNumOccurrences() + ? PrintSchedule + : STI.supportPrintSchedInfo(); } namespace { + // Keep track the alignment, constpool entries per Section. 
struct SectionCPs { MCSection *S; unsigned Alignment; SmallVector<unsigned, 4> CPEs; + SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {} }; -} + +} // end anonymous namespace /// EmitConstantPool - Print to the current output stream assembly /// representations of the constants in the constant pool MCP. This is @@ -1547,7 +1654,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, OutStreamer->EmitValue(Value, EntrySize); } - /// EmitSpecialLLVMGlobal - Check to see if the specified global is a /// special global used by LLVM. If so, emit it and return true, otherwise /// do nothing and return false. @@ -1598,13 +1704,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) { } namespace { + struct Structor { - Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {} - int Priority; - llvm::Constant *Func; - llvm::GlobalValue *ComdatKey; + int Priority = 0; + Constant *Func = nullptr; + GlobalValue *ComdatKey = nullptr; + + Structor() = default; }; -} // end namespace + +} // end anonymous namespace /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init /// priority. @@ -1653,8 +1762,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List, const TargetLoweringObjectFile &Obj = getObjFileLowering(); const MCSymbol *KeySym = nullptr; if (GlobalValue *GV = S.ComdatKey) { - if (GV->hasAvailableExternallyLinkage()) - // If the associated variable is available_externally, some other TU + if (GV->isDeclarationForLinker()) + // If the associated variable is not defined in this module + // (it might be available_externally, or have been an + // available_externally definition that was dropped by the + // EliminateAvailableExternally pass), some other TU // will provide its dynamic initializer. continue; @@ -1931,7 +2043,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) { return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1. } - /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. @@ -1972,7 +2083,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { static void emitGlobalConstantDataSequential(const DataLayout &DL, const ConstantDataSequential *CDS, AsmPrinter &AP) { - // See if we can aggregate this into a .fill, if so, emit it as such. int Value = isRepeatedByteSequence(CDS, DL); if (Value != -1) { @@ -2006,7 +2116,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, CDS->getNumElements(); if (unsigned Padding = Size - EmittedSize) AP.OutStreamer->EmitZeros(Padding); - } static void emitGlobalConstantArray(const DataLayout &DL, @@ -2145,7 +2254,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN] ExtraBits = Realigned.getRawData()[0] & (((uint64_t)-1) >> (64 - ExtraBitsSize)); - Realigned = Realigned.lshr(ExtraBitsSize); + Realigned.lshrInPlace(ExtraBitsSize); } else ExtraBits = Realigned.getRawData()[BitWidth / 64]; } @@ -2420,8 +2529,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const { return OutContext.getOrCreateSymbol(NameStr); } - - /// PrintParentLoopComment - Print comments about parent loops of this one. 
static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop, unsigned FunctionNumber) { @@ -2486,7 +2593,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, PrintChildLoopComment(OS, Loop, AP.getFunctionNumber()); } - /// EmitBasicBlockStart - This method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. @@ -2607,8 +2713,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { return true; } - - GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { if (!S.usesMetadata()) return nullptr; @@ -2639,7 +2743,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { } /// Pin vtable to this file. -AsmPrinterHandler::~AsmPrinterHandler() {} +AsmPrinterHandler::~AsmPrinterHandler() = default; void AsmPrinterHandler::markFunctionEnd() {} @@ -2663,37 +2767,63 @@ void AsmPrinter::emitXRayTable() { auto PrevSection = OutStreamer->getCurrentSectionOnly(); auto Fn = MF->getFunction(); - MCSection *Section = nullptr; + MCSection *InstMap = nullptr; + MCSection *FnSledIndex = nullptr; if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) { if (Fn->hasComdat()) { - Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS, + InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Fn->getComdat()->getName()); + FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, + Fn->getComdat()->getName()); } else { - Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS, + InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC); } } else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) { - Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0, + InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0, SectionKind::getReadOnlyWithRel()); + FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx", 0, + SectionKind::getReadOnlyWithRel()); } else { llvm_unreachable("Unsupported target"); } // Before we switch over, we force a reference to a label inside the - // xray_instr_map section. Since this function is always called just - // before the function's end, we assume that this is happening after - // the last return instruction. - - auto WordSizeBytes = TM.getPointerSize(); - MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true); + // xray_instr_map and xray_fn_idx sections. Since this function is always + // called just before the function's end, we assume that this is happening + // after the last return instruction. We also use the synthetic label in the + // xray_inster_map as a delimeter for the range of sleds for this function in + // the index. + auto WordSizeBytes = MAI->getCodePointerSize(); + MCSymbol *SledsStart = OutContext.createTempSymbol("xray_synthetic_", true); + MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true); OutStreamer->EmitCodeAlignment(16); - OutStreamer->EmitSymbolValue(Tmp, WordSizeBytes, false); - OutStreamer->SwitchSection(Section); - OutStreamer->EmitLabel(Tmp); + OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes, false); + OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false); + + // Now we switch to the instrumentation map section. 
Because this is done + // per-function, we are able to create an index entry that will represent the + // range of sleds associated with a function. + OutStreamer->SwitchSection(InstMap); + OutStreamer->EmitLabel(SledsStart); for (const auto &Sled : Sleds) Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym); - + MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_synthetic_end", true); + OutStreamer->EmitLabel(SledsEnd); + + // We then emit a single entry in the index per function. We use the symbols + // that bound the instrumentation map as the range for a specific function. + // Each entry here will be 2 * word size aligned, as we're writing down two + // pointers. This should work for both 32-bit and 64-bit platforms. + OutStreamer->SwitchSection(FnSledIndex); + OutStreamer->EmitCodeAlignment(2 * WordSizeBytes); + OutStreamer->EmitLabel(IdxRef); + OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes); + OutStreamer->EmitSymbolValue(SledsEnd, WordSizeBytes); OutStreamer->SwitchSection(PrevSection); Sleds.clear(); } @@ -2702,8 +2832,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind) { auto Fn = MI.getParent()->getParent()->getFunction(); auto Attr = Fn->getFnAttribute("function-instrument"); + bool LogArgs = Fn->hasFnAttribute("xray-log-args"); bool AlwaysInstrument = Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always"; + if (Kind == SledKind::FUNCTION_ENTER && LogArgs) + Kind = SledKind::LOG_ARGS_ENTER; Sleds.emplace_back( XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn }); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 165b8eea0943..a0bf1632dff3 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -48,10 +48,16 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) { static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo); assert(DiagInfo && "Diagnostic context not passed down?"); + // Look up a LocInfo for the buffer this diagnostic is coming from. + unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc()); + const MDNode *LocInfo = nullptr; + if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size()) + LocInfo = DiagInfo->LocInfos[BufNum-1]; + // If the inline asm had metadata associated with it, pull out a location // cookie corresponding to which line the error occurred on. unsigned LocCookie = 0; - if (const MDNode *LocInfo = DiagInfo->LocInfo) { + if (LocInfo) { unsigned ErrorLine = Diag.getLineNo()-1; if (ErrorLine >= LocInfo->getNumOperands()) ErrorLine = 0; @@ -108,7 +114,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, SourceMgr &SrcMgr = DiagInfo->SrcMgr; SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); - DiagInfo->LocInfo = LocMDNode; std::unique_ptr<MemoryBuffer> Buffer; // The inline asm source manager will outlive Str, so make a copy of the @@ -118,6 +123,12 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, // Tell SrcMgr about this buffer, it takes ownership of the buffer. unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); + // Store LocMDNode in DiagInfo, using BufNum as an identifier. 
+ if (LocMDNode) { + DiagInfo->LocInfos.resize(BufNum); + DiagInfo->LocInfos[BufNum-1] = LocMDNode; + } + std::unique_ptr<MCAsmParser> Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum)); @@ -133,6 +144,9 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, " we don't have an asm parser for this target\n"); Parser->setAssemblerDialect(Dialect); Parser->setTargetParser(*TAP.get()); + if (Dialect == InlineAsm::AD_Intel) + // We need this flag to be able to parse numbers like "0bH" + Parser->setParsingInlineAsm(true); if (MF) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); TAP->SetFrameRegister(TRI->getFrameRegister(*MF)); @@ -144,11 +158,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, /*NoFinalize*/ true); emitInlineAsmEnd(STI, &TAP->getSTI()); - // LocInfo cannot be used for error generation from the backend. - // FIXME: associate LocInfo with the SourceBuffer to improve backend - // messages. - DiagInfo->LocInfo = nullptr; - if (Res && !DiagInfo->DiagHandler) report_fatal_error("Error parsing inline asm\n"); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 83440513225c..114aea391a86 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -13,23 +13,24 @@ #include "CodeViewDebug.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" +#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeTableCollection.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/MSF/ByteStream.h" -#include "llvm/DebugInfo/MSF/StreamReader.h" #include "llvm/IR/Constants.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/BinaryByteStream.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/COFF.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetFrameLowering.h" @@ -38,7 +39,6 @@ using namespace llvm; using namespace llvm::codeview; -using namespace llvm::msf; CodeViewDebug::CodeViewDebug(AsmPrinter *AP) : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(), @@ -238,7 +238,7 @@ TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) { // The display name includes function template arguments. Drop them to match // MSVC. - StringRef DisplayName = SP->getDisplayName().split('<').first; + StringRef DisplayName = SP->getName().split('<').first; const DIScope *Scope = SP->getScope().resolve(); TypeIndex TI; @@ -393,7 +393,7 @@ void CodeViewDebug::endModule() { // subprograms. 
switchToDebugSectionForSymbol(nullptr); - MCSymbol *CompilerInfo = beginCVSubsection(ModuleSubstreamKind::Symbols); + MCSymbol *CompilerInfo = beginCVSubsection(ModuleDebugFragmentKind::Symbols); emitCompilerInformation(); endCVSubsection(CompilerInfo); @@ -417,7 +417,7 @@ void CodeViewDebug::endModule() { // Emit UDT records for any types used by global variables. if (!GlobalUDTs.empty()) { - MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols); + MCSymbol *SymbolsEnd = beginCVSubsection(ModuleDebugFragmentKind::Symbols); emitDebugInfoForUDTs(GlobalUDTs); endCVSubsection(SymbolsEnd); } @@ -469,17 +469,21 @@ void CodeViewDebug::emitTypeInformation() { CommentPrefix += ' '; } - TypeDatabase TypeDB; - CVTypeDumper CVTD(TypeDB); - TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) { + TypeTableCollection Table(TypeTable.records()); + Optional<TypeIndex> B = Table.getFirst(); + while (B) { + // This will fail if the record data is invalid. + CVType Record = Table.getType(*B); + if (OS.isVerboseAsm()) { // Emit a block comment describing the type record for readability. SmallString<512> CommentBlock; raw_svector_ostream CommentOS(CommentBlock); ScopedPrinter SP(CommentOS); SP.setPrefix(CommentPrefix); - TypeDumpVisitor TDV(TypeDB, &SP, false); - Error E = CVTD.dump(Record, TDV); + TypeDumpVisitor TDV(Table, &SP, false); + + Error E = codeview::visitTypeRecord(Record, *B, TDV); if (E) { logAllUnhandledErrors(std::move(E), errs(), "error: "); llvm_unreachable("produced malformed type record"); @@ -489,29 +493,10 @@ void CodeViewDebug::emitTypeInformation() { // newline. OS.emitRawComment( CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim()); - } else { -#ifndef NDEBUG - // Assert that the type data is valid even if we aren't dumping - // comments. The MSVC linker doesn't do much type record validation, - // so the first link of an invalid type record can succeed while - // subsequent links will fail with LNK1285. - ByteStream Stream(Record); - CVTypeArray Types; - StreamReader Reader(Stream); - Error E = Reader.readArray(Types, Reader.getLength()); - if (!E) { - TypeVisitorCallbacks C; - E = CVTypeVisitor(C).visitTypeStream(Types); - } - if (E) { - logAllUnhandledErrors(std::move(E), errs(), "error: "); - llvm_unreachable("produced malformed type record"); - } -#endif } - StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size()); - OS.EmitBinaryData(S); - }); + OS.EmitBinaryData(Record.str_data()); + B = Table.getNext(*B); + } } namespace { @@ -645,7 +630,8 @@ void CodeViewDebug::emitInlineeLinesSubsection() { return; OS.AddComment("Inlinee lines subsection"); - MCSymbol *InlineEnd = beginCVSubsection(ModuleSubstreamKind::InlineeLines); + MCSymbol *InlineEnd = + beginCVSubsection(ModuleDebugFragmentKind::InlineeLines); // We don't provide any extra file info. // FIXME: Find out if debuggers use this info. @@ -658,7 +644,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() { OS.AddBlankLine(); unsigned FileId = maybeRecordFile(SP->getFile()); - OS.AddComment("Inlined function " + SP->getDisplayName() + " starts at " + + OS.AddComment("Inlined function " + SP->getName() + " starts at " + SP->getFilename() + Twine(':') + Twine(SP->getLine())); OS.AddBlankLine(); // The filechecksum table uses 8 byte entries for now, and file ids start at @@ -760,17 +746,17 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, // If we have a display name, build the fully qualified name by walking the // chain of scopes. 
- if (!SP->getDisplayName().empty()) + if (!SP->getName().empty()) FuncName = - getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName()); + getFullyQualifiedName(SP->getScope().resolve(), SP->getName()); // If our DISubprogram name is empty, use the mangled name. if (FuncName.empty()) - FuncName = GlobalValue::getRealLinkageName(GV->getName()); + FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName()); // Emit a symbol subsection, required by VS2012+ to find function boundaries. OS.AddComment("Symbol subsection for " + Twine(FuncName)); - MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols); + MCSymbol *SymbolsEnd = beginCVSubsection(ModuleDebugFragmentKind::Symbols); { MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(), *ProcRecordEnd = MMI->getContext().createTempSymbol(); @@ -887,13 +873,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable( if (!Scope) continue; + // If the variable has an attached offset expression, extract it. + // FIXME: Try to handle DW_OP_deref as well. + int64_t ExprOffset = 0; + if (VI.Expr) + if (!VI.Expr->extractIfOffset(ExprOffset)) + continue; + // Get the frame register used and the offset. unsigned FrameReg = 0; int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg); uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg); // Calculate the label ranges. - LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset); + LocalVarDefRange DefRange = + createDefRangeMem(CVReg, FrameOffset + ExprOffset); for (const InsnRange &Range : Scope->getRanges()) { const MCSymbol *Begin = getLabelBeforeInsn(Range.first); const MCSymbol *End = getLabelAfterInsn(Range.second); @@ -948,10 +942,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { // Handle fragments. auto Fragment = DIExpr->getFragmentInfo(); - if (DIExpr && Fragment) { + if (Fragment) { IsSubfield = true; StructOffset = Fragment->OffsetInBits / 8; - } else if (DIExpr && DIExpr->getNumElements() > 0) { + } else if (DIExpr->getNumElements() > 0) { continue; // Ignore unrecognized exprs. } @@ -1014,14 +1008,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { } } -void CodeViewDebug::beginFunction(const MachineFunction *MF) { - assert(!CurFn && "Can't process two functions at once!"); - - if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) - return; - - DebugHandlerBase::beginFunction(MF); - +void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV) == false); CurFn = &FnDebugInfo[GV]; @@ -1038,11 +1025,11 @@ void CodeViewDebug::beginFunction(const MachineFunction *MF) { bool EmptyPrologue = true; for (const auto &MBB : *MF) { for (const auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) { PrologEndLoc = MI.getDebugLoc(); break; - } else if (!MI.isDebugValue()) { + } else if (!MI.isMetaInstruction()) { EmptyPrologue = false; } } @@ -1144,33 +1131,12 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { DITypeRef ElementTypeRef = Ty->getBaseType(); TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef); // IndexType is size_t, which depends on the bitness of the target. - TypeIndex IndexType = Asm->MAI->getPointerSize() == 8 + TypeIndex IndexType = Asm->TM.getPointerSize() == 8 ? 
TypeIndex(SimpleTypeKind::UInt64Quad) : TypeIndex(SimpleTypeKind::UInt32Long); uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8; - - // We want to assert that the element type multiplied by the array lengths - // match the size of the overall array. However, if we don't have complete - // type information for the base type, we can't make this assertion. This - // happens if limited debug info is enabled in this case: - // struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); }; - // VTableOptzn array[3]; - // The DICompositeType of VTableOptzn will have size zero, and the array will - // have size 3 * sizeof(void*), and we should avoid asserting. - // - // There is a related bug in the front-end where an array of a structure, - // which was declared as incomplete structure first, ends up not getting a - // size assigned to it. (PR28303) - // Example: - // struct A(*p)[3]; - // struct A { int f; } a[3]; - bool PartiallyIncomplete = false; - if (Ty->getSizeInBits() == 0 || ElementSize == 0) { - PartiallyIncomplete = true; - } - // Add subranges to array type. DINodeArray Elements = Ty->getElements(); for (int i = Elements.size() - 1; i >= 0; --i) { @@ -1185,16 +1151,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { // Variable Length Array (VLA) has Count equal to '-1'. // Replace with Count '1', assume it is the minimum VLA length. // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU. - if (Count == -1) { + if (Count == -1) Count = 1; - PartiallyIncomplete = true; - } // Update the element size and element type index for subsequent subranges. ElementSize *= Count; // If this is the outermost array, use the size from the array. It will be - // more accurate if PartiallyIncomplete is true. + // more accurate if we had a VLA or an incomplete element type size. uint64_t ArraySize = (i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize; @@ -1203,9 +1167,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) { ElementTypeIndex = TypeTable.writeKnownType(AR); } - (void)PartiallyIncomplete; - assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8)); - return ElementTypeIndex; } @@ -1376,8 +1337,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) { assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type); TypeIndex ClassTI = getTypeIndex(Ty->getClassType()); TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType()); - PointerKind PK = Asm->MAI->getPointerSize() == 8 ? PointerKind::Near64 - : PointerKind::Near32; + PointerKind PK = Asm->TM.getPointerSize() == 8 ? PointerKind::Near64 + : PointerKind::Near32; bool IsPMF = isa<DISubroutineType>(Ty->getBaseType()); PointerMode PM = IsPMF ? 
PointerMode::PointerToMemberFunction : PointerMode::PointerToDataMember; @@ -1492,7 +1453,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty, } TypeIndex CodeViewDebug::lowerTypeVFTableShape(const DIDerivedType *Ty) { - unsigned VSlotCount = Ty->getSizeInBits() / (8 * Asm->MAI->getPointerSize()); + unsigned VSlotCount = + Ty->getSizeInBits() / (8 * Asm->MAI->getCodePointerSize()); SmallVector<VFTableSlotKind, 4> Slots(VSlotCount, VFTableSlotKind::Near); VFTableShapeRecord VFTSR(Slots); @@ -1600,7 +1562,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) { EnumeratorCount++; } } - FTI = FLRB.end(); + FTI = FLRB.end(true); } std::string FullName = getFullyQualifiedName(Ty); @@ -1736,10 +1698,12 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) { SizeInBytes, FullName, Ty->getIdentifier()); TypeIndex ClassTI = TypeTable.writeKnownType(CR); - StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(Ty->getFile())); - TypeIndex SIDI = TypeTable.writeKnownType(SIDR); - UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine()); - TypeTable.writeKnownType(USLR); + if (const auto *File = Ty->getFile()) { + StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File)); + TypeIndex SIDI = TypeTable.writeKnownType(SIDR); + UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine()); + TypeTable.writeKnownType(USLR); + } addToUDTs(Ty, ClassTI); @@ -1905,7 +1869,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { MemberCount++; } - TypeIndex FieldTI = FLBR.end(); + TypeIndex FieldTI = FLBR.end(true); return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount, !Info.NestedClasses.empty()); } @@ -2115,18 +2079,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) { } } -void CodeViewDebug::endFunction(const MachineFunction *MF) { - if (!Asm || !CurFn) // We haven't created any debug info for this function. - return; - +void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) { const Function *GV = MF->getFunction(); assert(FnDebugInfo.count(GV)); assert(CurFn == &FnDebugInfo[GV]); collectVariableInfo(GV->getSubprogram()); - DebugHandlerBase::endFunction(MF); - // Don't emit anything if we don't have any line tables. if (!CurFn->HaveLineInfo) { FnDebugInfo.erase(GV); @@ -2152,7 +2111,7 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { maybeRecordLocation(DL, Asm->MF); } -MCSymbol *CodeViewDebug::beginCVSubsection(ModuleSubstreamKind Kind) { +MCSymbol *CodeViewDebug::beginCVSubsection(ModuleDebugFragmentKind Kind) { MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(), *EndLabel = MMI->getContext().createTempSymbol(); OS.EmitIntValue(unsigned(Kind), 4); @@ -2212,7 +2171,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (!GV->hasComdat() && !GV->isDeclarationForLinker()) { if (!EndLabel) { OS.AddComment("Symbol subsection for globals"); - EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols); + EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols); } // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. 
emitDebugInfoForGlobal(GVE->getVariable(), GV, Asm->getSymbol(GV)); @@ -2228,9 +2187,9 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (GV->hasComdat()) { MCSymbol *GVSym = Asm->getSymbol(GV); OS.AddComment("Symbol subsection for " + - Twine(GlobalValue::getRealLinkageName(GV->getName()))); + Twine(GlobalValue::dropLLVMManglingEscape(GV->getName()))); switchToDebugSectionForSymbol(GVSym); - EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols); + EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols); // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. emitDebugInfoForGlobal(GVE->getVariable(), GV, GVSym); endCVSubsection(EndLabel); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 3dd4315e4c2f..46b2daa1e007 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -216,7 +216,7 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { /// Opens a subsection of the given kind in a .debug$S codeview section. /// Returns an end label for use with endCVSubsection when the subsection is /// finished. - MCSymbol *beginCVSubsection(codeview::ModuleSubstreamKind Kind); + MCSymbol *beginCVSubsection(codeview::ModuleDebugFragmentKind Kind); void endCVSubsection(MCSymbol *EndLabel); @@ -299,6 +299,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { unsigned getPointerSizeInBytes(); +protected: + /// \brief Gather pre-function debug information. + void beginFunctionImpl(const MachineFunction *MF) override; + + /// \brief Gather post-function debug information. + void endFunctionImpl(const MachineFunction *) override; + public: CodeViewDebug(AsmPrinter *Asm); @@ -307,12 +314,6 @@ public: /// \brief Emit the COFF section that holds the line table information. void endModule() override; - /// \brief Gather pre-function debug information. - void beginFunction(const MachineFunction *MF) override; - - /// \brief Gather post-function debug information. - void endFunction(const MachineFunction *) override; - /// \brief Process beginning of an instruction. void beginInstruction(const MachineInstr *MI) override; }; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 879918995472..30bfd7c94e68 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -31,6 +31,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "dwarfdebug" + //===----------------------------------------------------------------------===// // DIEAbbrevData Implementation //===----------------------------------------------------------------------===// @@ -42,6 +44,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const { // overloads. Otherwise MSVC 2010 thinks this call is ambiguous. ID.AddInteger(unsigned(Attribute)); ID.AddInteger(unsigned(Form)); + if (Form == dwarf::DW_FORM_implicit_const) + ID.AddInteger(Value); } //===----------------------------------------------------------------------===// @@ -77,15 +81,22 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { dwarf::AttributeString(AttrData.getAttribute()).data()); // Emit form type. +#ifndef NDEBUG + // Could be an assertion, but this way we can see the failing form code + // easily, which helps track down where it came from. 
+ if (!dwarf::isValidFormForVersion(AttrData.getForm(), + AP->getDwarfVersion())) { + DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm()) + << " for DWARF version " << AP->getDwarfVersion() << "\n"); + llvm_unreachable("Invalid form for specified DWARF version"); + } +#endif AP->EmitULEB128(AttrData.getForm(), dwarf::FormEncodingString(AttrData.getForm()).data()); // Emit value for DW_FORM_implicit_const. - if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) { - assert(AP->getDwarfVersion() >= 5 && - "DW_FORM_implicit_const is supported starting from DWARFv5"); + if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) AP->EmitSLEB128(AttrData.getValue()); - } } // Mark end of abbreviation. @@ -107,13 +118,20 @@ void DIEAbbrev::print(raw_ostream &O) { O << " " << dwarf::AttributeString(Data[i].getAttribute()) << " " - << dwarf::FormEncodingString(Data[i].getForm()) - << '\n'; + << dwarf::FormEncodingString(Data[i].getForm()); + + if (Data[i].getForm() == dwarf::DW_FORM_implicit_const) + O << " " << Data[i].getValue(); + + O << '\n'; } } -LLVM_DUMP_METHOD -void DIEAbbrev::dump() { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEAbbrev::dump() { + print(dbgs()); +} +#endif //===----------------------------------------------------------------------===// // DIEAbbrevSet Implementation @@ -249,10 +267,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { O << "\n"; } -LLVM_DUMP_METHOD -void DIE::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIE::dump() { print(dbgs()); } +#endif unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP, DIEAbbrevSet &AbbrevSet, @@ -340,10 +359,11 @@ void DIEValue::print(raw_ostream &O) const { } } -LLVM_DUMP_METHOD -void DIEValue::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DIEValue::dump() const { print(dbgs()); } +#endif //===----------------------------------------------------------------------===// // DIEInteger Implementation @@ -354,57 +374,42 @@ void DIEValue::dump() const { void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { switch (Form) { case dwarf::DW_FORM_implicit_const: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this? 
Asm->OutStreamer->AddBlankLine(); return; case dwarf::DW_FORM_flag: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref1: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data1: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: case dwarf::DW_FORM_ref2: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data2: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref4: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data4: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: case dwarf::DW_FORM_ref8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_sig8: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_data8: - LLVM_FALLTHROUGH; + case dwarf::DW_FORM_ref_sup8: case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_addr: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_addr: Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form)); return; case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; @@ -419,35 +424,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { - case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_flag_present: return 0; - case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data1: return sizeof(int8_t); - case dwarf::DW_FORM_ref2: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data2: return sizeof(int16_t); - case dwarf::DW_FORM_ref4: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data4: return sizeof(int32_t); - case dwarf::DW_FORM_ref8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sig8: LLVM_FALLTHROUGH; - case dwarf::DW_FORM_data8: return sizeof(int64_t); + case dwarf::DW_FORM_implicit_const: + case dwarf::DW_FORM_flag_present: + return 0; + case dwarf::DW_FORM_flag: + case dwarf::DW_FORM_ref1: + case dwarf::DW_FORM_data1: + case dwarf::DW_FORM_strx1: + case dwarf::DW_FORM_addrx1: + return sizeof(int8_t); + case dwarf::DW_FORM_ref2: + case dwarf::DW_FORM_data2: + case dwarf::DW_FORM_strx2: + case dwarf::DW_FORM_addrx2: + return sizeof(int16_t); + case dwarf::DW_FORM_ref4: + case dwarf::DW_FORM_data4: + case dwarf::DW_FORM_ref_sup4: + case dwarf::DW_FORM_strx4: + case dwarf::DW_FORM_addrx4: + return sizeof(int32_t); + case dwarf::DW_FORM_ref8: + case dwarf::DW_FORM_ref_sig8: + case dwarf::DW_FORM_data8: + case dwarf::DW_FORM_ref_sup8: + return sizeof(int64_t); case dwarf::DW_FORM_ref_addr: if (AP->getDwarfVersion() == 2) return AP->getPointerSize(); LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_ref_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_strp_alt: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_line_strp: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_sec_offset: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_strp_sup: - LLVM_FALLTHROUGH; - case dwarf::DW_FORM_ref_sup: switch (AP->OutStreamer->getContext().getDwarfFormat()) { case dwarf::DWARF32: return 4; @@ -456,11 +467,8 @@ unsigned 
DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } llvm_unreachable("Invalid DWARF format"); case dwarf::DW_FORM_GNU_str_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_GNU_addr_index: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref_udata: - LLVM_FALLTHROUGH; case dwarf::DW_FORM_udata: return getULEB128Size(Integer); case dwarf::DW_FORM_sdata: @@ -484,7 +492,7 @@ void DIEInteger::print(raw_ostream &O) const { /// EmitValue - Emit expression value. /// void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->EmitDebugValue(Expr, SizeOf(AP, Form)); + AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form)); } /// SizeOf - Determine size of expression value in bytes. @@ -519,7 +527,7 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } LLVM_DUMP_METHOD @@ -541,7 +549,7 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } LLVM_DUMP_METHOD @@ -647,20 +655,12 @@ void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref_addr: { // Get the absolute offset for this DIE within the debug info/types section. unsigned Addr = Entry->getDebugSectionOffset(); - if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) { - const DwarfDebug *DD = AP->getDwarfDebug(); - if (DD) - assert(!DD->useSplitDwarf() && - "TODO: dwo files can't have relocations."); - const DIEUnit *Unit = Entry->getUnit(); - assert(Unit && "CUDie should belong to a CU."); - MCSection *Section = Unit->getSection(); - if (Section) { - const MCSymbol *SectionSym = Section->getBeginSymbol(); - AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); - return; - } + if (const MCSymbol *SectionSym = + Entry->getUnit()->getCrossSectionRelativeBaseAddress()) { + AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); + return; } + AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form)); return; } @@ -683,7 +683,7 @@ unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return getULEB128Size(Entry->getOffset()); case dwarf::DW_FORM_ref_addr: if (AP->getDwarfVersion() == 2) - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); switch (AP->OutStreamer->getContext().getDwarfFormat()) { case dwarf::DWARF32: return 4; @@ -809,7 +809,7 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; - return AP->getPointerSize(); + return AP->MAI->getCodePointerSize(); } /// EmitValue - Emit label value. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index d8ecc7ccfb9b..201030f0ac5c 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -116,65 +116,17 @@ void DIEHash::addParentContext(const DIE &Parent) { // Collect all of the attributes for a particular DIE in single structure. 
void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) { -#define COLLECT_ATTR(NAME) \ - case dwarf::NAME: \ - Attrs.NAME = V; \ - break for (const auto &V : Die.values()) { DEBUG(dbgs() << "Attribute: " << dwarf::AttributeString(V.getAttribute()) << " added.\n"); switch (V.getAttribute()) { - COLLECT_ATTR(DW_AT_name); - COLLECT_ATTR(DW_AT_accessibility); - COLLECT_ATTR(DW_AT_address_class); - COLLECT_ATTR(DW_AT_allocated); - COLLECT_ATTR(DW_AT_artificial); - COLLECT_ATTR(DW_AT_associated); - COLLECT_ATTR(DW_AT_binary_scale); - COLLECT_ATTR(DW_AT_bit_offset); - COLLECT_ATTR(DW_AT_bit_size); - COLLECT_ATTR(DW_AT_bit_stride); - COLLECT_ATTR(DW_AT_byte_size); - COLLECT_ATTR(DW_AT_byte_stride); - COLLECT_ATTR(DW_AT_const_expr); - COLLECT_ATTR(DW_AT_const_value); - COLLECT_ATTR(DW_AT_containing_type); - COLLECT_ATTR(DW_AT_count); - COLLECT_ATTR(DW_AT_data_bit_offset); - COLLECT_ATTR(DW_AT_data_location); - COLLECT_ATTR(DW_AT_data_member_location); - COLLECT_ATTR(DW_AT_decimal_scale); - COLLECT_ATTR(DW_AT_decimal_sign); - COLLECT_ATTR(DW_AT_default_value); - COLLECT_ATTR(DW_AT_digit_count); - COLLECT_ATTR(DW_AT_discr); - COLLECT_ATTR(DW_AT_discr_list); - COLLECT_ATTR(DW_AT_discr_value); - COLLECT_ATTR(DW_AT_encoding); - COLLECT_ATTR(DW_AT_enum_class); - COLLECT_ATTR(DW_AT_endianity); - COLLECT_ATTR(DW_AT_explicit); - COLLECT_ATTR(DW_AT_is_optional); - COLLECT_ATTR(DW_AT_location); - COLLECT_ATTR(DW_AT_lower_bound); - COLLECT_ATTR(DW_AT_mutable); - COLLECT_ATTR(DW_AT_ordering); - COLLECT_ATTR(DW_AT_picture_string); - COLLECT_ATTR(DW_AT_prototyped); - COLLECT_ATTR(DW_AT_small); - COLLECT_ATTR(DW_AT_segment); - COLLECT_ATTR(DW_AT_string_length); - COLLECT_ATTR(DW_AT_threads_scaled); - COLLECT_ATTR(DW_AT_upper_bound); - COLLECT_ATTR(DW_AT_use_location); - COLLECT_ATTR(DW_AT_use_UTF8); - COLLECT_ATTR(DW_AT_variable_parameter); - COLLECT_ATTR(DW_AT_virtuality); - COLLECT_ATTR(DW_AT_visibility); - COLLECT_ATTR(DW_AT_vtable_elem_location); - COLLECT_ATTR(DW_AT_type); +#define HANDLE_DIE_HASH_ATTR(NAME) \ + case dwarf::NAME: \ + Attrs.NAME = V; \ + break; +#include "DIEHashAttributes.def" default: break; } @@ -366,62 +318,12 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) { // Go through the attributes from \param Attrs in the order specified in 7.27.4 // and hash them. 
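The rewrite above, its twin in hashAttributes just below, the struct in DIEHash.h, and the new DIEHashAttributes.def are one application of the classic "x-macro" .def-file technique: the attribute list is written once, and each consumer re-expands it with its own definition of the macro. A toy version of the same pattern, assuming a Colors.def file sitting next to the source:

// Colors.def (contents shown inline for the example):
//   #ifndef HANDLE_COLOR
//   #error "Missing macro definition of HANDLE_COLOR"
//   #endif
//   HANDLE_COLOR(Red)
//   HANDLE_COLOR(Green)
//   HANDLE_COLOR(Blue)
//   #undef HANDLE_COLOR

enum class Color {
#define HANDLE_COLOR(NAME) NAME,
#include "Colors.def"
};

const char *colorName(Color C) {
  switch (C) {
#define HANDLE_COLOR(NAME)                                                     \
  case Color::NAME:                                                            \
    return #NAME;
#include "Colors.def"
  }
  return "unknown";
}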
void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) { -#define ADD_ATTR(ATTR) \ +#define HANDLE_DIE_HASH_ATTR(NAME) \ { \ - if (ATTR) \ - hashAttribute(ATTR, Tag); \ + if (Attrs.NAME) \ + hashAttribute(Attrs.NAME, Tag); \ } - - ADD_ATTR(Attrs.DW_AT_name); - ADD_ATTR(Attrs.DW_AT_accessibility); - ADD_ATTR(Attrs.DW_AT_address_class); - ADD_ATTR(Attrs.DW_AT_allocated); - ADD_ATTR(Attrs.DW_AT_artificial); - ADD_ATTR(Attrs.DW_AT_associated); - ADD_ATTR(Attrs.DW_AT_binary_scale); - ADD_ATTR(Attrs.DW_AT_bit_offset); - ADD_ATTR(Attrs.DW_AT_bit_size); - ADD_ATTR(Attrs.DW_AT_bit_stride); - ADD_ATTR(Attrs.DW_AT_byte_size); - ADD_ATTR(Attrs.DW_AT_byte_stride); - ADD_ATTR(Attrs.DW_AT_const_expr); - ADD_ATTR(Attrs.DW_AT_const_value); - ADD_ATTR(Attrs.DW_AT_containing_type); - ADD_ATTR(Attrs.DW_AT_count); - ADD_ATTR(Attrs.DW_AT_data_bit_offset); - ADD_ATTR(Attrs.DW_AT_data_location); - ADD_ATTR(Attrs.DW_AT_data_member_location); - ADD_ATTR(Attrs.DW_AT_decimal_scale); - ADD_ATTR(Attrs.DW_AT_decimal_sign); - ADD_ATTR(Attrs.DW_AT_default_value); - ADD_ATTR(Attrs.DW_AT_digit_count); - ADD_ATTR(Attrs.DW_AT_discr); - ADD_ATTR(Attrs.DW_AT_discr_list); - ADD_ATTR(Attrs.DW_AT_discr_value); - ADD_ATTR(Attrs.DW_AT_encoding); - ADD_ATTR(Attrs.DW_AT_enum_class); - ADD_ATTR(Attrs.DW_AT_endianity); - ADD_ATTR(Attrs.DW_AT_explicit); - ADD_ATTR(Attrs.DW_AT_is_optional); - ADD_ATTR(Attrs.DW_AT_location); - ADD_ATTR(Attrs.DW_AT_lower_bound); - ADD_ATTR(Attrs.DW_AT_mutable); - ADD_ATTR(Attrs.DW_AT_ordering); - ADD_ATTR(Attrs.DW_AT_picture_string); - ADD_ATTR(Attrs.DW_AT_prototyped); - ADD_ATTR(Attrs.DW_AT_small); - ADD_ATTR(Attrs.DW_AT_segment); - ADD_ATTR(Attrs.DW_AT_string_length); - ADD_ATTR(Attrs.DW_AT_threads_scaled); - ADD_ATTR(Attrs.DW_AT_upper_bound); - ADD_ATTR(Attrs.DW_AT_use_location); - ADD_ATTR(Attrs.DW_AT_use_UTF8); - ADD_ATTR(Attrs.DW_AT_variable_parameter); - ADD_ATTR(Attrs.DW_AT_virtuality); - ADD_ATTR(Attrs.DW_AT_visibility); - ADD_ATTR(Attrs.DW_AT_vtable_elem_location); - ADD_ATTR(Attrs.DW_AT_type); - +#include "DIEHashAttributes.def" // FIXME: Add the extended attributes. } @@ -478,10 +380,12 @@ void DIEHash::computeHash(const DIE &Die) { /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. -uint64_t DIEHash::computeCUSignature(const DIE &Die) { +uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) { Numbering.clear(); Numbering[&Die] = 1; + if (!DWOName.empty()) + Hash.update(DWOName); // Hash the DIE. computeHash(Die); @@ -490,9 +394,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } /// This is based on the type signature computation given in section 7.27 of the @@ -514,7 +418,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) { Hash.final(Result); // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. 
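On the computeCUSignature hunk above and the identical computeTypeSignature change that follows: the old support::endian::read64le(Result + 8) and the new Result.high() name the same quantity, digest bytes 8 through 15 read little-endian, which is what "the least significant 8 bytes" of a little-endian MD5 result works out to. Spelled out over a raw 16-byte digest (a sketch, not the MD5Result API):

#include <cstdint>

uint64_t signatureFromDigest(const uint8_t Digest[16]) {
  uint64_t Sig = 0;
  for (unsigned I = 0; I != 8; ++I)
    Sig |= static_cast<uint64_t>(Digest[8 + I]) << (8 * I); // bytes 8..15, LE
  return Sig;
}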
- return support::endian::read64le(Result + 8); + // implementation always returns its results in little endian, so we actually + // need the "high" word. + return Result.high(); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h index 996cd7ef3d2e..29337ae38a99 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h @@ -28,64 +28,15 @@ class CompileUnit; class DIEHash { // Collection of all attributes used in hashing a particular DIE. struct DIEAttrs { - DIEValue DW_AT_name; - DIEValue DW_AT_accessibility; - DIEValue DW_AT_address_class; - DIEValue DW_AT_allocated; - DIEValue DW_AT_artificial; - DIEValue DW_AT_associated; - DIEValue DW_AT_binary_scale; - DIEValue DW_AT_bit_offset; - DIEValue DW_AT_bit_size; - DIEValue DW_AT_bit_stride; - DIEValue DW_AT_byte_size; - DIEValue DW_AT_byte_stride; - DIEValue DW_AT_const_expr; - DIEValue DW_AT_const_value; - DIEValue DW_AT_containing_type; - DIEValue DW_AT_count; - DIEValue DW_AT_data_bit_offset; - DIEValue DW_AT_data_location; - DIEValue DW_AT_data_member_location; - DIEValue DW_AT_decimal_scale; - DIEValue DW_AT_decimal_sign; - DIEValue DW_AT_default_value; - DIEValue DW_AT_digit_count; - DIEValue DW_AT_discr; - DIEValue DW_AT_discr_list; - DIEValue DW_AT_discr_value; - DIEValue DW_AT_encoding; - DIEValue DW_AT_enum_class; - DIEValue DW_AT_endianity; - DIEValue DW_AT_explicit; - DIEValue DW_AT_is_optional; - DIEValue DW_AT_location; - DIEValue DW_AT_lower_bound; - DIEValue DW_AT_mutable; - DIEValue DW_AT_ordering; - DIEValue DW_AT_picture_string; - DIEValue DW_AT_prototyped; - DIEValue DW_AT_small; - DIEValue DW_AT_segment; - DIEValue DW_AT_string_length; - DIEValue DW_AT_threads_scaled; - DIEValue DW_AT_upper_bound; - DIEValue DW_AT_use_location; - DIEValue DW_AT_use_UTF8; - DIEValue DW_AT_variable_parameter; - DIEValue DW_AT_virtuality; - DIEValue DW_AT_visibility; - DIEValue DW_AT_vtable_elem_location; - DIEValue DW_AT_type; - - // Insert any additional ones here... +#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME; +#include "DIEHashAttributes.def" }; public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} /// \brief Computes the CU signature. - uint64_t computeCUSignature(const DIE &Die); + uint64_t computeCUSignature(StringRef DWOName, const DIE &Die); /// \brief Computes the type signature. 
uint64_t computeTypeSignature(const DIE &Die);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
new file mode 100644
index 000000000000..28a02390fccb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
@@ -0,0 +1,55 @@
+#ifndef HANDLE_DIE_HASH_ATTR
+#error "Missing macro definition of HANDLE_DIE_HASH_ATTR"
+#endif
+
+HANDLE_DIE_HASH_ATTR(DW_AT_name)
+HANDLE_DIE_HASH_ATTR(DW_AT_accessibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_address_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_allocated)
+HANDLE_DIE_HASH_ATTR(DW_AT_artificial)
+HANDLE_DIE_HASH_ATTR(DW_AT_associated)
+HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_expr)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_containing_type)
+HANDLE_DIE_HASH_ATTR(DW_AT_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign)
+HANDLE_DIE_HASH_ATTR(DW_AT_default_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_digit_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_list)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_encoding)
+HANDLE_DIE_HASH_ATTR(DW_AT_enum_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_endianity)
+HANDLE_DIE_HASH_ATTR(DW_AT_explicit)
+HANDLE_DIE_HASH_ATTR(DW_AT_is_optional)
+HANDLE_DIE_HASH_ATTR(DW_AT_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_mutable)
+HANDLE_DIE_HASH_ATTR(DW_AT_ordering)
+HANDLE_DIE_HASH_ATTR(DW_AT_picture_string)
+HANDLE_DIE_HASH_ATTR(DW_AT_prototyped)
+HANDLE_DIE_HASH_ATTR(DW_AT_small)
+HANDLE_DIE_HASH_ATTR(DW_AT_segment)
+HANDLE_DIE_HASH_ATTR(DW_AT_string_length)
+HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled)
+HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8)
+HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter)
+HANDLE_DIE_HASH_ATTR(DW_AT_virtuality)
+HANDLE_DIE_HASH_ATTR(DW_AT_visibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_type)
+
+#undef HANDLE_DIE_HASH_ATTR
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 22fd7bb46056..20e1467b30c3
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -209,8 +209,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
     } else if (MO.isRegMask()) {
       // If this is a register mask operand, clobber all debug values in
       // non-CSRs.
-      for (int I = ChangingRegs.find_first(); I != -1;
-           I = ChangingRegs.find_next(I)) {
+      for (unsigned I : ChangingRegs.set_bits()) {
        // Don't consider SP to be clobbered by register masks.
if (unsigned(I) != SP && TRI->isPhysicalRegister(I) && MO.clobbersPhysReg(I)) { diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 94190981e88e..0971c5942203 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { return getBaseTypeSize(BaseType); } +static bool hasDebugInfo(const MachineModuleInfo *MMI, + const MachineFunction *MF) { + if (!MMI->hasDebugInfo()) + return false; + auto *SP = MF->getFunction()->getSubprogram(); + if (!SP) + return false; + assert(SP->getUnit()); + auto EK = SP->getUnit()->getEmissionKind(); + if (EK == DICompileUnit::NoDebug) + return false; + return true; +} + void DebugHandlerBase::beginFunction(const MachineFunction *MF) { + PrevInstBB = nullptr; + + if (!Asm || !hasDebugInfo(MMI, MF)) { + skippedNonDebugFunction(); + return; + } + // Grab the lexical scopes for the function, if we don't have any of those // then we're not going to be able to do anything. LScopes.initialize(*MF); - if (LScopes.empty()) + if (LScopes.empty()) { + beginFunctionImpl(MF); return; + } // Make sure that each lexical scope will have a begin/end label. identifyScopeMarkers(); @@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) { PrevInstLoc = DebugLoc(); PrevLabel = Asm->getFunctionBegin(); + beginFunctionImpl(MF); } void DebugHandlerBase::beginInstruction(const MachineInstr *MI) { @@ -200,9 +224,9 @@ void DebugHandlerBase::endInstruction() { return; assert(CurMI != nullptr); - // Don't create a new label after DBG_VALUE instructions. - // They don't generate code. - if (!CurMI->isDebugValue()) { + // Don't create a new label after DBG_VALUE and other instructions that don't + // generate code. + if (!CurMI->isMetaInstruction()) { PrevLabel = nullptr; PrevInstBB = CurMI->getParent(); } @@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() { } void DebugHandlerBase::endFunction(const MachineFunction *MF) { + if (hasDebugInfo(MMI, MF)) + endFunctionImpl(MF); DbgValues.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h index c00fa189d94a..659a921e1fc5 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h @@ -80,6 +80,10 @@ protected: LabelsAfterInsn.insert(std::make_pair(MI, nullptr)); } + virtual void beginFunctionImpl(const MachineFunction *MF) = 0; + virtual void endFunctionImpl(const MachineFunction *MF) = 0; + virtual void skippedNonDebugFunction() {} + // AsmPrinterHandler overrides. 
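The DebugHandlerBase changes above are a template-method refactor: beginFunction and endFunction keep the bookkeeping shared by the DWARF and CodeView emitters, and the per-format work moves behind the new beginFunctionImpl, endFunctionImpl and skippedNonDebugFunction hooks. The shape, reduced to a standalone skeleton with placeholder types:

struct Function {
  bool HasDebugInfo = true; // stand-in for the subprogram/emission-kind checks
};

class HandlerBase {
public:
  virtual ~HandlerBase() = default;

  void beginFunction(const Function *F) {
    if (!F->HasDebugInfo) {
      skippedNonDebugFunction(); // shared early-out lives in the base class
      return;
    }
    // ...setup common to all debug formats (scopes, labels) goes here...
    beginFunctionImpl(F); // format-specific tail
  }

protected:
  virtual void beginFunctionImpl(const Function *F) = 0;
  virtual void skippedNonDebugFunction() {} // optional hook, default no-op
};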
public: void beginInstruction(const MachineInstr *MI) override; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h index 36fb1507ddc6..a68e8cc6b4b3 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -76,7 +76,8 @@ public: const DIExpression *getExpression() const { return Expression; } friend bool operator==(const Value &, const Value &); friend bool operator<(const Value &, const Value &); - void dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { if (isLocation()) { llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " "; if (Loc.isIndirect()) @@ -90,6 +91,7 @@ public: if (Expression) Expression->dump(); } +#endif }; private: diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d904372af589..04073b3aed68 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1,3 +1,16 @@ +//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for constructing a dwarf compile unit. +// +//===----------------------------------------------------------------------===// + #include "DwarfCompileUnit.h" #include "DwarfExpression.h" #include "llvm/CodeGen/MachineFunction.h" @@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( bool addToAccelTable = false; DIELoc *Loc = nullptr; std::unique_ptr<DIEDwarfExpression> DwarfExpr; - bool AllConstant = std::all_of( - GlobalExprs.begin(), GlobalExprs.end(), - [&](const GlobalExpr GE) { - return GE.Expr && GE.Expr->isConstant(); - }); - for (const auto &GE : GlobalExprs) { const GlobalVariable *Global = GE.Var; const DIExpression *Expr = GE.Expr; + // For compatibility with DWARF 3 and earlier, // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes // DW_AT_const_value(X). if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) { + addToAccelTable = true; addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1)); - // We cannot describe the location of dllimport'd variables: the - // computation of their address requires loads from the IAT. - } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) { - if (!Loc) { - Loc = new (DIEValueAllocator) DIELoc; - DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); - } + break; + } + + // We cannot describe the location of dllimport'd variables: the + // computation of their address requires loads from the IAT. + if (Global && Global->hasDLLImportStorageClass()) + continue; + + // Nothing to describe without address or constant. + if (!Global && (!Expr || !Expr->isConstant())) + continue; + + if (!Loc) { addToAccelTable = true; - if (Global) { - const MCSymbol *Sym = Asm->getSymbol(Global); - if (Global->isThreadLocal()) { - if (Asm->TM.Options.EmulatedTLS) { - // TODO: add debug info for emulated thread local mode. - } else { - // FIXME: Make this work with -gsplit-dwarf. 
- unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u - : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); - } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); - } - // 3) followed by an OP to make the debugger do a TLS lookup. + Loc = new (DIEValueAllocator) DIELoc; + DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc); + } + + if (Global) { + const MCSymbol *Sym = Asm->getSymbol(Global); + if (Global->isThreadLocal()) { + if (Asm->TM.Options.EmulatedTLS) { + // TODO: add debug info for emulated thread local mode. + } else { + // FIXME: Make this work with -gsplit-dwarf. + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size addUInt(*Loc, dwarf::DW_FORM_data1, - DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address - : dwarf::DW_OP_form_tls_address); + PointerSize == 4 ? dwarf::DW_OP_const4u + : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. + addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); } - } else { - DD->addArangeLabel(SymbolCU(this, Sym)); - addOpAddress(*Loc, Sym); + // 3) followed by an OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, + DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address + : dwarf::DW_OP_form_tls_address); } + } else { + DD->addArangeLabel(SymbolCU(this, Sym)); + addOpAddress(*Loc, Sym); } - if (Expr) { - DwarfExpr->addFragmentOffset(Expr); - DwarfExpr->AddExpression(Expr); - } + } + if (Expr) { + DwarfExpr->addFragmentOffset(Expr); + DwarfExpr->addExpression(Expr); } } if (Loc) @@ -422,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) { auto *InlinedSP = getDISubprogram(DS); // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP]; + DIE *OriginDIE = getAbstractSPDies()[InlinedSP]; assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine); @@ -507,8 +525,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); // If there is an expression, emit raw unsigned bytes. 
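To make the thread-local branch above easier to follow: outside split DWARF it builds the expression "push the variable's offset within the TLS block as a pointer-sized constant, then let the consumer translate that into an address". Reduced to the opcode stream it produces, with opcode values taken from the DWARF spec; in the real code the offset is a relocated symbol expression rather than an integer literal, and the split-DWARF path swaps in DW_OP_GNU_const_index against the address pool instead:

#include <cstdint>
#include <vector>

enum : uint8_t {
  kDW_OP_const4u = 0x0c,
  kDW_OP_const8u = 0x0e,
  kDW_OP_form_tls_address = 0x9b,
};

std::vector<uint64_t> tlsLocation(uint64_t OffsetInTLSBlock, unsigned PtrSize) {
  std::vector<uint64_t> Ops;
  // 1) A constant of the appropriate pointer size...
  Ops.push_back(PtrSize == 4 ? kDW_OP_const4u : kDW_OP_const8u);
  // 2) ...holding the offset of the variable within the TLS block...
  Ops.push_back(OffsetInTLSBlock);
  // 3) ...followed by the op that makes the debugger do the TLS lookup.
  Ops.push_back(kDW_OP_form_tls_address);
  return Ops;
}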
DwarfExpr.addFragmentOffset(Expr); - DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm()); - DwarfExpr.AddExpression(Expr); + DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm()); + DwarfExpr.addExpression(Expr); addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); } else addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); @@ -529,12 +547,19 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); for (auto &Fragment : DV.getFrameIndexExprs()) { unsigned FrameReg = 0; + const DIExpression *Expr = Fragment.Expr; const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg); - DwarfExpr.addFragmentOffset(Fragment.Expr); - DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - FrameReg, Offset); - DwarfExpr.AddExpression(Fragment.Expr); + DwarfExpr.addFragmentOffset(Expr); + SmallVector<uint64_t, 8> Ops; + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Offset); + Ops.append(Expr->elements_begin(), Expr->elements_end()); + DIExpressionCursor Cursor(Ops); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addMachineRegExpression( + *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg); + DwarfExpr.addExpression(std::move(Cursor)); } addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); @@ -609,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( LexicalScope *Scope) { - DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()]; + DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()]; if (AbsDef) return; @@ -671,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) { DIE *D = getDIE(SP); - if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) { + if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) { if (D) // If this subprogram has an abstract definition, reference that addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); @@ -683,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) { } } +void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) { + DbgVariable *AbsVar = getExistingAbstractVariable( + InlinedVariable(Var.getVariable(), Var.getInlinedAt())); + auto *VariableDie = Var.getDIE(); + if (AbsVar && AbsVar->getDIE()) { + addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, + *AbsVar->getDIE()); + } else + applyVariableAttributes(Var, *VariableDie); +} + +DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) { + const DILocalVariable *Cleansed; + return getExistingAbstractVariable(IV, Cleansed); +} + +// Find abstract variable, if any, associated with Var. +DbgVariable *DwarfCompileUnit::getExistingAbstractVariable( + InlinedVariable IV, const DILocalVariable *&Cleansed) { + // More then one inlined variable corresponds to one abstract variable. 
+ Cleansed = IV.first; + auto &AbstractVariables = getAbstractVariables(); + auto I = AbstractVariables.find(Cleansed); + if (I != AbstractVariables.end()) + return I->second.get(); + return nullptr; +} + +void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var, + LexicalScope *Scope) { + assert(Scope && Scope->isAbstractScope()); + auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr); + DU->addScopeVariable(Scope, AbsDbgVariable.get()); + getAbstractVariables()[Var] = std::move(AbsDbgVariable); +} + void DwarfCompileUnit::emitHeader(bool UseOffsets) { // Don't bother labeling the .dwo unit, as its offset isn't used. if (!Skeleton) { @@ -690,27 +751,54 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) { Asm->OutStreamer->EmitLabel(LabelBegin); } - DwarfUnit::emitHeader(UseOffsets); + dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile + : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton + : dwarf::DW_UT_compile; + DwarfUnit::emitCommonHeader(UseOffsets, UT); } /// addGlobalName - Add a new global name to the compile unit. -void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die, +void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); GlobalNames[FullName] = &Die; } +void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, + const DIScope *Context) { + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) + return; + std::string FullName = getParentContextString(Context) + Name.str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// Add a new global type to the unit. void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); GlobalTypes[FullName] = &Die; } +void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, + const DIScope *Context) { + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) + return; + std::string FullName = getParentContextString(Context) + Ty->getName().str(); + // Insert, allowing the entry to remain as-is if it's already present + // This way the CU-level type DIE is preferred over the "can't describe this + // type as a unit offset because it's not really in the CU at all, it's only + // in a type unit" + GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie())); +} + /// addVariableAddress - Add DW_AT_location attribute for a /// DbgVariable based on provided MachineLocation. 
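The GlobalNames.insert and GlobalTypes.insert calls in the two type-unit helpers above lean on map-insert semantics: insert() leaves an existing entry untouched, so a concrete CU-level DIE registered earlier wins over the later type-unit fallback that can only point at the CU die itself. The same behavior demonstrated with std::map, whose insert() LLVM's StringMap matches on this point:

#include <cassert>
#include <map>
#include <string>

void preferExistingEntry() {
  std::map<std::string, int> Index;
  Index["Foo"] = 1;         // concrete CU-level entry registered first
  Index.insert({"Foo", 2}); // no-op: the existing entry is kept
  assert(Index["Foo"] == 1);
}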
void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, @@ -727,22 +815,23 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); - if (!validReg) + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + } + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Expr.finalize()); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Start with the address based on the location provided, and generate the @@ -754,23 +843,25 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, const MachineLocation &Location) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); - const DIExpression *Expr = DV.getSingleExpression(); - DIExpressionCursor ExprCursor(Expr); + const DIExpression *DIExpr = DV.getSingleExpression(); + DwarfExpr.addFragmentOffset(DIExpr); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); + + SmallVector<uint64_t, 8> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + } + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + DIExpressionCursor Cursor(Ops); const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); - auto Reg = Location.getReg(); - DwarfExpr.addFragmentOffset(Expr); - bool ValidReg = - Location.getOffset() - ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset()) - : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg); - - if (!ValidReg) + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) return; - - DwarfExpr.AddExpression(std::move(ExprCursor)); + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Add a Dwarf loclistptr attribute data and value. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index a8025f1d1521..b8f57472f17c 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -28,7 +28,7 @@ class DwarfFile; class MCSymbol; class LexicalScope; -class DwarfCompileUnit : public DwarfUnit { +class DwarfCompileUnit final : public DwarfUnit { /// A numeric ID unique among all CUs in the module unsigned UniqueID; @@ -68,13 +68,26 @@ class DwarfCompileUnit : public DwarfUnit { // ranges/locs. 
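addAddress and addComplexAddress above, like the frame-index loop earlier in this file, now share one idiom: an indirect location's offset is folded into the expression stream by prepending DW_OP_plus and the offset in front of the variable's own DIExpression elements, after which a single DIExpressionCursor walks the combined stream. Note that in this internal element encoding DW_OP_plus carries an inline operand; it is not the two-operand stack form of the emitted DWARF. The combination step in isolation:

#include <cstdint>
#include <vector>

enum : uint64_t { kDW_OP_plus = 0x22 };

std::vector<uint64_t> prependOffset(int64_t Offset, bool Indirect,
                                    const std::vector<uint64_t> &ExprElts) {
  std::vector<uint64_t> Ops;
  if (Indirect && Offset) {
    Ops.push_back(kDW_OP_plus);                   // opcode...
    Ops.push_back(static_cast<uint64_t>(Offset)); // ...with inline operand
  }
  Ops.insert(Ops.end(), ExprElts.begin(), ExprElts.end());
  return Ops; // handed to one cursor once the register prefix is emitted
}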
const MCSymbol *BaseAddress; + DenseMap<const MDNode *, DIE *> AbstractSPDies; + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; + /// \brief Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract); bool isDwoUnit() const override; - bool includeMinimalInlineScopes() const; + DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractSPDies; + return DU->getAbstractSPDies(); + } + + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractVariables; + return DU->getAbstractVariables(); + } public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, @@ -86,6 +99,8 @@ public: return Skeleton; } + bool includeMinimalInlineScopes() const; + void initStmtList(); /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. @@ -189,6 +204,13 @@ public: DIE *constructImportedEntityDIE(const DIImportedEntity *Module); void finishSubprogramDefinition(const DISubprogram *SP); + void finishVariableDefinition(const DbgVariable &Var); + /// Find abstract variable associated with Var. + typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; + DbgVariable *getExistingAbstractVariable(InlinedVariable IV, + const DILocalVariable *&Cleansed); + DbgVariable *getExistingAbstractVariable(InlinedVariable IV); + void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); /// Set the skeleton unit associated with this unit. void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; } @@ -210,12 +232,19 @@ public: } /// Add a new global name to the compile unit. - void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override; + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + + /// Add a new global name present in a type unit to this compile unit. + void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context); /// Add a new global type to the compile unit. void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) override; + /// Add a new global type present in a type unit to this compile unit. 
+ void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context); + const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; } const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 91a3d0989cc5..bf27516e1ccd 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" @@ -72,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section", cl::desc("Generate dwarf aranges"), cl::init(false)); +static cl::opt<bool> SplitDwarfCrossCuReferences( + "split-dwarf-cross-cu-references", cl::Hidden, + cl::desc("Enable cross-cu references in DWO files"), cl::init(false)); + namespace { enum DefaultOnOff { Default, Enable, Disable }; } @@ -92,14 +95,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden, cl::init(Default)); static cl::opt<DefaultOnOff> -SplitDwarf("split-dwarf", cl::Hidden, - cl::desc("Output DWARF5 split debug info."), - cl::values(clEnumVal(Default, "Default for platform"), - clEnumVal(Enable, "Enabled"), - clEnumVal(Disable, "Disabled")), - cl::init(Default)); - -static cl::opt<DefaultOnOff> DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, cl::desc("Generate DWARF pubnames and pubtypes sections"), cl::values(clEnumVal(Default, "Default for platform"), @@ -127,17 +122,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission"; static const char *const DbgTimerName = "writer"; static const char *const DbgTimerDescription = "DWARF Debug Writer"; -void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) { +void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) { BS.EmitInt8( Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op) : dwarf::OperationEncodingString(Op)); } -void DebugLocDwarfExpression::EmitSigned(int64_t Value) { +void DebugLocDwarfExpression::emitSigned(int64_t Value) { BS.EmitSLEB128(Value, Twine(Value)); } -void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) { +void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) { BS.EmitULEB128(Value, Twine(Value)); } @@ -200,6 +195,12 @@ const DIType *DbgVariable::getType() const { } ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const { + if (FrameIndexExprs.size() == 1) + return FrameIndexExprs; + + assert(all_of(FrameIndexExprs, + [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) && + "multiple FI expressions without DW_OP_LLVM_fragment"); std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(), [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool { return A.Expr->getFragmentInfo()->OffsetInBits < @@ -248,17 +249,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) HasAppleExtensionAttributes = tuneForLLDB(); - // Handle split DWARF. Off by default for now. - if (SplitDwarf == Default) - HasSplitDwarf = false; - else - HasSplitDwarf = SplitDwarf == Enable; - - // Pubnames/pubtypes on by default for GDB. - if (DwarfPubSections == Default) - HasDwarfPubSections = tuneForGDB(); - else - HasDwarfPubSections = DwarfPubSections == Enable; + // Handle split DWARF. 
+ HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty(); // SCE defaults to linkage names only for abstract subprograms. if (DwarfLinkageNames == DefaultLinkageNames) @@ -368,25 +360,49 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) { F(*SkelCU); } -void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +bool DwarfDebug::shareAcrossDWOCUs() const { + return SplitDwarfCrossCuReferences; +} + +void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, + LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); auto *SP = cast<DISubprogram>(Scope->getScopeNode()); - ProcessedSPNodes.insert(SP); - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - auto &CU = *CUMap.lookup(SP->getUnit()); - forBothCUs(CU, [&](DwarfCompileUnit &CU) { - CU.constructAbstractSubprogramScopeDIE(Scope); - }); + if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining()) + // Avoid building the original CU if it won't be used + SrcCU.constructAbstractSubprogramScopeDIE(Scope); + else { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else + CU.constructAbstractSubprogramScopeDIE(Scope); + } } -void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { - if (!GenerateGnuPubSections) +bool DwarfDebug::hasDwarfPubSections(bool includeMinimalInlineScopes) const { + // Opting in to GNU Pubnames/types overrides the default to ensure these are + // generated for things like Gold's gdb_index generation. + if (GenerateGnuPubSections) + return true; + + if (DwarfPubSections == Default) + return tuneForGDB() && !includeMinimalInlineScopes; + + return DwarfPubSections == Enable; +} + +void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const { + if (!hasDwarfPubSections(U.includeMinimalInlineScopes())) return; U.addFlag(D, dwarf::DW_AT_GNU_pubnames); @@ -395,7 +411,9 @@ void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { // Create new DwarfCompileUnit for the given metadata node with tag // DW_TAG_compile_unit. DwarfCompileUnit & -DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { +DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { + if (auto *CU = CUMap.lookup(DIUnit)) + return *CU; StringRef FN = DIUnit->getFilename(); CompilationDir = DIUnit->getDirectory(); @@ -407,7 +425,7 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { if (useSplitDwarf()) { NewCU.setSkeleton(constructSkeletonCU(NewCU)); NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, - DIUnit->getSplitDebugFilename()); + Asm->TM.Options.MCOptions.SplitDwarfFile); } // LTO with assembly output shares a single line table amongst multiple CUs. 
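constructDwarfCompileUnit becoming getOrCreateDwarfCompileUnit above is a memoization change: any call site, including the new cross-CU paths in this patch, can now ask for a CU without knowing whether it has been built yet. The lookup-then-build pattern, reduced to a standalone sketch:

#include <map>
#include <memory>

struct Unit {
  explicit Unit(int Id) : Id(Id) {}
  int Id;
};

std::map<int, std::unique_ptr<Unit>> Units;

Unit &getOrCreateUnit(int Key) {
  if (auto It = Units.find(Key); It != Units.end())
    return *It->second; // already constructed: return the cached unit
  auto Owned = std::make_unique<Unit>(Key);
  Unit &Ref = *Owned;
  Units.emplace(Key, std::move(Owned));
  return Ref;
}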
@@ -418,7 +436,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { Asm->OutStreamer->getContext().setMCLineTableCompilationDir( NewCU.getUniqueID(), CompilationDir); - NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer()); + StringRef Producer = DIUnit->getProducer(); + StringRef Flags = DIUnit->getFlags(); + if (!Flags.empty()) { + std::string ProducerWithFlags = Producer.str() + " " + Flags.str(); + NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags); + } else + NewCU.addString(Die, dwarf::DW_AT_producer, Producer); + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, DIUnit->getSourceLanguage()); NewCU.addString(Die, dwarf::DW_AT_name, FN); @@ -521,7 +546,12 @@ void DwarfDebug::beginModule() { } for (DICompileUnit *CUNode : M->debug_compile_units()) { - DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); + if (CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() && + CUNode->getGlobalVariables().empty() && + CUNode->getImportedEntities().empty() && CUNode->getMacros().empty()) + continue; + + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) CU.addImportedEntity(IE); @@ -544,7 +574,6 @@ void DwarfDebug::beginModule() { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. if (DIType *RT = dyn_cast<DIType>(Ty)) - if (!RT->isExternalTypeRef()) // There is no point in force-emitting a forward declaration. CU.getOrCreateTypeDIE(RT); } @@ -564,22 +593,17 @@ void DwarfDebug::finishVariableDefinitions() { // DIE::getUnit isn't simple - it walks parent pointers, etc. DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie()); assert(Unit); - DbgVariable *AbsVar = getExistingAbstractVariable( - InlinedVariable(Var->getVariable(), Var->getInlinedAt())); - if (AbsVar && AbsVar->getDIE()) { - Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, - *AbsVar->getDIE()); - } else - Unit->applyVariableAttributes(*Var, *VariableDie); + Unit->finishVariableDefinition(*Var); } } void DwarfDebug::finishSubprogramDefinitions() { - for (const DISubprogram *SP : ProcessedSPNodes) - if (SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug) - forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) { - CU.finishSubprogramDefinition(SP); - }); + for (const DISubprogram *SP : ProcessedSPNodes) { + assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug); + forBothCUs( + getOrCreateDwarfCompileUnit(SP->getUnit()), + [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); }); + } } void DwarfDebug::finalizeModuleInfo() { @@ -589,6 +613,13 @@ void DwarfDebug::finalizeModuleInfo() { finishVariableDefinitions(); + // Include the DWO file name in the hash if there's more than one CU. + // This handles ThinLTO's situation where imported CUs may very easily be + // duplicate with the same CU partially imported into another ThinLTO unit. + StringRef DWOName; + if (CUMap.size() > 1) + DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile; + // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -603,7 +634,8 @@ void DwarfDebug::finalizeModuleInfo() { auto *SkCU = TheCU.getSkeleton(); if (useSplitDwarf()) { // Emit a unique identifier for this CU. 
- uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie()); + uint64_t ID = + DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie()); TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, ID); SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, @@ -712,63 +744,40 @@ void DwarfDebug::endModule() { } // Emit the pubnames and pubtypes sections if requested. - if (HasDwarfPubSections) { + // The condition is optimistically correct - any CU not using GMLT (& + // implicit/default pubnames state) might still have pubnames. + if (hasDwarfPubSections(/* gmlt */ false)) { emitDebugPubNames(GenerateGnuPubSections); emitDebugPubTypes(GenerateGnuPubSections); } // clean up. - AbstractVariables.clear(); -} - -// Find abstract variable, if any, associated with Var. -DbgVariable * -DwarfDebug::getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed) { - // More then one inlined variable corresponds to one abstract variable. - Cleansed = IV.first; - auto I = AbstractVariables.find(Cleansed); - if (I != AbstractVariables.end()) - return I->second.get(); - return nullptr; -} - -DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) { - const DILocalVariable *Cleansed; - return getExistingAbstractVariable(IV, Cleansed); + // FIXME: AbstractVariables.clear(); } -void DwarfDebug::createAbstractVariable(const DILocalVariable *Var, - LexicalScope *Scope) { - auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr); - InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); - AbstractVariables[Var] = std::move(AbsDbgVariable); -} - -void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV, +void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; - createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( + CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( cast<DILocalScope>(ScopeNode))); } -void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped( +void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; if (LexicalScope *Scope = LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode))) - createAbstractVariable(Cleansed, Scope); + CU.createAbstractVariable(Cleansed, Scope); } - // Collect variable information from side table maintained by MF. 
void DwarfDebug::collectVariableInfoFromMFTable( - DenseSet<InlinedVariable> &Processed) { + DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) { for (const auto &VI : Asm->MF->getVariableDbgInfo()) { if (!VI.Var) continue; @@ -783,7 +792,7 @@ void DwarfDebug::collectVariableInfoFromMFTable( if (!Scope) continue; - ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode()); auto RegVar = make_unique<DbgVariable>(Var.first, Var.second); RegVar->initializeMMI(VI.Expr, VI.Slot); if (InfoHolder.addScopeVariable(Scope, RegVar.get())) @@ -954,9 +963,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc, } } -DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope, +DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV) { - ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode()); ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second)); InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get()); return ConcreteVariables.back().get(); @@ -979,7 +989,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP, DenseSet<InlinedVariable> &Processed) { // Grab the variable info that was squirreled away in the MMI side-table. - collectVariableInfoFromMFTable(Processed); + collectVariableInfoFromMFTable(TheCU, Processed); for (const auto &I : DbgValues) { InlinedVariable IV = I.first; @@ -1001,7 +1011,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, continue; Processed.insert(IV); - DbgVariable *RegVar = createConcreteVariable(*Scope, IV); + DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV); const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); @@ -1037,7 +1047,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, for (const DILocalVariable *DV : SP->getVariables()) { if (Processed.insert(InlinedVariable(DV, nullptr)).second) if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope())) - createConcreteVariable(*Scope, InlinedVariable(DV, nullptr)); + createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr)); } } @@ -1046,8 +1056,12 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); assert(CurMI); + const auto *SP = MI->getParent()->getParent()->getFunction()->getSubprogram(); + if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) + return; + // Check if source location changes, but ignore DBG_VALUE and CFI locations. - if (MI->isDebugValue() || MI->isCFIInstruction()) + if (MI->isMetaInstruction()) return; const DebugLoc &DL = MI->getDebugLoc(); // When we emit a line-0 record, we don't update PrevInstLoc; so look at @@ -1129,7 +1143,7 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // the beginning of the function body. for (const auto &MBB : *MF) for (const auto &MI : MBB) - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) return MI.getDebugLoc(); return DebugLoc(); @@ -1137,75 +1151,50 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // Gather pre-function debug information. 
Assumes being called immediately // after the function entry point has been emitted. -void DwarfDebug::beginFunction(const MachineFunction *MF) { +void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { CurFn = MF; - // If there's no debug info for the function we're not going to do anything. - if (!MMI->hasDebugInfo()) - return; - - auto DI = MF->getFunction()->getSubprogram(); - if (!DI) + auto *SP = MF->getFunction()->getSubprogram(); + assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode()); + if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) return; - // Grab the lexical scopes for the function, if we don't have any of those - // then we're not going to be able to do anything. - DebugHandlerBase::beginFunction(MF); - if (LScopes.empty()) - return; + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function // belongs to so that we add to the correct per-cu line table in the // non-asm case. - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - // FnScope->getScopeNode() and DI->second should represent the same function, - // though they may not be the same MDNode due to inline functions merged in - // LTO where the debug info metadata still differs (either due to distinct - // written differences - two versions of a linkonce_odr function - // written/copied into two separate files, or some sub-optimal metadata that - // isn't structurally identical (see: file path/name info from clang, which - // includes the directory of the cpp file being built, even when the file name - // is absolute (such as an <> lookup header))) - auto *SP = cast<DISubprogram>(FnScope->getScopeNode()); - DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit()); - if (!TheCU) { - assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug && - "DICompileUnit missing from llvm.dbg.cu?"); - return; - } if (Asm->OutStreamer->hasRawTextSupport()) // Use a single line table if we are generating assembly. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); else - Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID()); // Record beginning of function. PrologEndLoc = findPrologueEndLoc(MF); - if (DILocation *L = PrologEndLoc) { + if (PrologEndLoc) { // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - auto *SP = L->getInlinedAtScope()->getSubprogram(); + auto *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT); } } +void DwarfDebug::skippedNonDebugFunction() { + // If we don't have a subprogram for this function then there will be a hole + // in the range information. Keep note of this by setting the previously used + // section to nullptr. + PrevCU = nullptr; + CurFn = nullptr; +} + // Gather and emit post-function debug information. 
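Related to the beginFunctionImpl hunk above: findPrologueEndLoc, quoted earlier, now skips isMetaInstruction() rather than only DBG_VALUE, so the prologue_end line still lands on the first instruction that both emits code and carries a source location. Its decision logic, reduced to a sketch with placeholder types:

#include <vector>

struct Instr {
  bool IsMeta = false;       // DBG_VALUE, CFI and similar: emits no code
  bool IsFrameSetup = false; // prologue bookkeeping
  int Line = 0;              // 0 means "no debug location"
};

int prologueEndLine(const std::vector<Instr> &Body) {
  for (const Instr &I : Body)
    if (!I.IsMeta && !I.IsFrameSetup && I.Line)
      return I.Line; // first real instruction with a location
  return 0; // nothing suitable: no prologue_end is recorded
}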
 // Gather and emit post-function debug information.
-void DwarfDebug::endFunction(const MachineFunction *MF) {
+void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
+  const DISubprogram *SP = MF->getFunction()->getSubprogram();
+
   assert(CurFn == MF &&
       "endFunction should be called with the same function as beginFunction");
 
-  const DISubprogram *SP = MF->getFunction()->getSubprogram();
-  if (!MMI->hasDebugInfo() || !SP ||
-      SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
-    // If we don't have a subprogram for this function then there will be a hole
-    // in the range information. Keep note of this by setting the previously
-    // used section to nullptr.
-    PrevCU = nullptr;
-    CurFn = nullptr;
-    DebugHandlerBase::endFunction(MF);
-    return;
-  }
-
   // Set DwarfDwarfCompileUnitID in MCContext to default value.
   Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
@@ -1220,17 +1209,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
 
   // Under -gmlt, skip building the subprogram if there are no inlined
-  // subroutines inside it.
-  if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
+  // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
+  // is still needed as we need its source location.
+  if (!TheCU.getCUNode()->getDebugInfoForProfiling() &&
+      TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
       LScopes.getAbstractScopesList().empty() && !IsDarwin) {
     assert(InfoHolder.getScopeVariables().empty());
-    assert(DbgValues.empty());
-    // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
-    // by a -gmlt CU. Add a test and remove this assertion.
-    assert(AbstractVariables.empty());
     PrevLabel = nullptr;
     CurFn = nullptr;
-    DebugHandlerBase::endFunction(MF);
     return;
   }
 
@@ -1244,12 +1230,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
     for (const DILocalVariable *DV : SP->getVariables()) {
       if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
         continue;
-      ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+      ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
                                       DV->getScope());
       assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes &&
              "ensureAbstractVariableIsCreated inserted abstract scopes");
     }
-    constructAbstractSubprogramScopeDIE(AScope);
+    constructAbstractSubprogramScopeDIE(TheCU, AScope);
   }
 
   ProcessedSPNodes.insert(SP);
@@ -1266,7 +1252,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   InfoHolder.getScopeVariables().clear();
   PrevLabel = nullptr;
   CurFn = nullptr;
-  DebugHandlerBase::endFunction(MF);
 }
 
 // Register a source line with debug info. Returns the unique label that was
@@ -1361,6 +1346,18 @@ void DwarfDebug::emitAccelTypes() {
 
 /// computeIndexValue - Compute the gdb index value for the DIE and CU.
 static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
                                                         const DIE *Die) {
+  // Entities that ended up only in a Type Unit reference the CU instead (since
+  // the pub entry has offsets within the CU there's no real offset that can be
+  // provided anyway). As it happens all such entities (namespaces and types,
+  // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out
+  // not to be true it would be necessary to persist this information from the
+  // point at which the entry is added to the index data structure - since by
+  // the time the index is built from that, the original type/namespace DIE in a
+  // type unit has already been destroyed so it can't be queried for properties
+  // like tag, etc.
+  if (Die->getTag() == dwarf::DW_TAG_compile_unit)
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE,
+                                          dwarf::GIEL_EXTERNAL);
   dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
 
   // We could have a specification DIE that has our most of our knowledge,
@@ -1418,7 +1415,7 @@ void DwarfDebug::emitDebugPubSection(
     const auto &Globals = (TheU->*Accessor)();
 
-    if (Globals.empty())
+    if (!hasDwarfPubSections(TheU->includeMinimalInlineScopes()))
       continue;
 
     if (auto *Skeleton = TheU->getSkeleton())
@@ -1498,27 +1495,36 @@ static void emitDebugLocValue(const AsmPrinter &AP,
                               const DIBasicType *BT, ByteStreamer &Streamer,
                               const DebugLocEntry::Value &Value,
                               DwarfExpression &DwarfExpr) {
-  DIExpressionCursor ExprCursor(Value.getExpression());
-  DwarfExpr.addFragmentOffset(Value.getExpression());
+  auto *DIExpr = Value.getExpression();
+  DIExpressionCursor ExprCursor(DIExpr);
+  DwarfExpr.addFragmentOffset(DIExpr);
   // Regular entry.
   if (Value.isInt()) {
     if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
                BT->getEncoding() == dwarf::DW_ATE_signed_char))
-      DwarfExpr.AddSignedConstant(Value.getInt());
+      DwarfExpr.addSignedConstant(Value.getInt());
     else
-      DwarfExpr.AddUnsignedConstant(Value.getInt());
+      DwarfExpr.addUnsignedConstant(Value.getInt());
   } else if (Value.isLocation()) {
-    MachineLocation Loc = Value.getLoc();
+    MachineLocation Location = Value.getLoc();
+    if (Location.isIndirect())
+      DwarfExpr.setMemoryLocationKind();
+    SmallVector<uint64_t, 8> Ops;
+    if (Location.isIndirect() && Location.getOffset()) {
+      Ops.push_back(dwarf::DW_OP_plus);
+      Ops.push_back(Location.getOffset());
+    }
+    Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+    DIExpressionCursor Cursor(Ops);
     const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
-    if (Loc.getOffset())
-      DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
-    else
-      DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg());
+    if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+      return;
+    return DwarfExpr.addExpression(std::move(Cursor));
   } else if (Value.isConstantFP()) {
     APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
-    DwarfExpr.AddUnsignedConstant(RawBytes);
+    DwarfExpr.addUnsignedConstant(RawBytes);
   }
-  DwarfExpr.AddExpression(std::move(ExprCursor));
+  DwarfExpr.addExpression(std::move(ExprCursor));
 }
 
 void DebugLocEntry::finalize(const AsmPrinter &AP,
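[Editor's note] In the emitDebugLocValue rewrite above, an indirect MachineLocation no longer goes through the removed AddMachineRegIndirect; its constant offset is folded into the expression operand stream before the cursor reaches addMachineRegExpression. A sketch with an invented offset of 16:

// Variable stored at [Reg + 16]: at this revision a DW_OP_plus in a
// DIExpression carries its constant operand inline.
SmallVector<uint64_t, 8> Ops;
Ops.push_back(dwarf::DW_OP_plus);
Ops.push_back(16); // hypothetical offset
Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
DIExpressionCursor Cursor(Ops);
// addMachineRegExpression can then fold the leading DW_OP_plus into a single
// DW_OP_breg<n> +16 (or DW_OP_fbreg +16) instead of two separate operations.

The early-return shape also means that when addMachineRegExpression fails (no DWARF number exists for the register), the entry is dropped rather than a partial location being emitted.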
@@ -1558,10 +1564,13 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
 
 // Emit locations into the debug loc section.
 void DwarfDebug::emitDebugLoc() {
+  if (DebugLocs.getLists().empty())
+    return;
+
   // Start the dwarf loc section.
   Asm->OutStreamer->SwitchSection(
       Asm->getObjFileLowering().getDwarfLocSection());
-  unsigned char Size = Asm->getDataLayout().getPointerSize();
+  unsigned char Size = Asm->MAI->getCodePointerSize();
   for (const auto &List : DebugLocs.getLists()) {
     Asm->OutStreamer->EmitLabel(List.Label);
     const DwarfCompileUnit *CU = List.CU;
@@ -1691,7 +1700,7 @@ void DwarfDebug::emitDebugARanges() {
   Asm->OutStreamer->SwitchSection(
       Asm->getObjFileLowering().getDwarfARangesSection());
 
-  unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+  unsigned PtrSize = Asm->MAI->getCodePointerSize();
 
   // Build a list of CUs used.
   std::vector<DwarfCompileUnit *> CUs;
@@ -1769,12 +1778,15 @@ void DwarfDebug::emitDebugARanges() {
 
 /// Emit address ranges into a debug ranges section.
 void DwarfDebug::emitDebugRanges() {
+  if (CUMap.empty())
+    return;
+
   // Start the dwarf ranges section.
   Asm->OutStreamer->SwitchSection(
       Asm->getObjFileLowering().getDwarfRangesSection());
 
   // Size for our labels.
-  unsigned char Size = Asm->getDataLayout().getPointerSize();
+  unsigned char Size = Asm->MAI->getCodePointerSize();
 
   // Grab the specific ranges for the compile units in the module.
   for (const auto &I : CUMap) {
@@ -1848,6 +1860,9 @@ void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
 
 /// Emit macros into a debug macinfo section.
 void DwarfDebug::emitDebugMacinfo() {
+  if (CUMap.empty())
+    return;
+
   // Start the dwarf macinfo section.
   Asm->OutStreamer->SwitchSection(
       Asm->getObjFileLowering().getDwarfMacinfoSection());
@@ -1869,7 +1884,7 @@ void DwarfDebug::emitDebugMacinfo() {
 void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
                                   std::unique_ptr<DwarfCompileUnit> NewU) {
   NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
-                  U.getCUNode()->getSplitDebugFilename());
+                  Asm->TM.Options.MCOptions.SplitDwarfFile);
 
   if (!CompilationDir.empty())
     NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
@@ -1940,11 +1955,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
   MD5 Hash;
   Hash.update(Identifier);
   // ... take the least significant 8 bytes and return those. Our MD5
-  // implementation always returns its results in little endian, swap bytes
-  // appropriately.
+  // implementation always returns its results in little endian, so we actually
+  // need the "high" word.
   MD5::MD5Result Result;
   Hash.final(Result);
-  return support::endian::read64le(Result + 8);
+  return Result.high();
 }
 
 void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
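[Editor's note] The makeTypeSignature change above is behavior-preserving: the DWARF type signature is still the low eight bytes of the MD5 digest read little-endian, and MD5Result::high() (added alongside this change) just names that word. A self-contained check, assuming the MD5Result layout in this revision's MD5.h (a public std::array<uint8_t, 16> member named Bytes):

#include <cassert>
#include <cstdint>
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MD5.h"

// Both spellings compute the same 64-bit signature for a type identifier.
uint64_t typeSignatureSketch(llvm::StringRef Identifier) {
  llvm::MD5 Hash;
  Hash.update(Identifier);
  llvm::MD5::MD5Result Result;
  Hash.final(Result);
  // Old spelling: bytes 8..15 of the digest, read little-endian.
  uint64_t Old = llvm::support::endian::read64le(Result.Bytes.data() + 8);
  assert(Old == Result.high() && "named accessor must match the raw read");
  return Result.high();
}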
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 253e3f06200e..ebfba4cfc275 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -89,7 +89,7 @@ public:
     assert(!MInsn && "Already initialized?");
     assert((!E || E->isValid()) && "Expected valid expression");
-    assert(~FI && "Expected valid index");
+    assert(FI != INT_MAX && "Expected valid index");
     FrameIndexExprs.push_back({FI, E});
   }
 
@@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase {
   DenseMap<const MCSymbol *, uint64_t> SymSize;
 
   /// Collection of abstract variables.
-  DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
   SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
 
   /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
@@ -247,9 +246,6 @@ class DwarfDebug : public DebugHandlerBase {
       std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
       TypeUnitsUnderConstruction;
 
-  /// Whether to emit the pubnames/pubtypes sections.
-  bool HasDwarfPubSections;
-
   /// Whether to use the GNU TLS opcode (instead of the standard opcode).
   bool UseGNUTLSOpcode;
 
@@ -313,20 +309,16 @@ class DwarfDebug : public DebugHandlerBase {
 
   typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
 
-  /// Find abstract variable associated with Var.
-  DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
-                                           const DILocalVariable *&Cleansed);
-  DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
-  void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
-  void ensureAbstractVariableIsCreated(InlinedVariable Var,
+  void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
                                        const MDNode *Scope);
-  void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+  void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
                                                const MDNode *Scope);
 
-  DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+  DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
+                                      LexicalScope &Scope, InlinedVariable IV);
 
   /// Construct a DIE for this abstract scope.
-  void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+  void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
 
   void finishVariableDefinitions();
 
@@ -420,11 +412,11 @@ class DwarfDebug : public DebugHandlerBase {
 
   /// Flags to let the linker know we have emitted new style pubnames. Only
   /// emit it here if we don't have a skeleton CU for split dwarf.
-  void addGnuPubAttributes(DwarfUnit &U, DIE &D) const;
+  void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const;
 
   /// Create new DwarfCompileUnit for the given metadata node with tag
   /// DW_TAG_compile_unit.
-  DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit);
+  DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit);
 
   /// Construct imported_module or imported_declaration DIE.
   void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
@@ -446,7 +438,17 @@ class DwarfDebug : public DebugHandlerBase {
                               const DbgValueHistoryMap::InstrRanges &Ranges);
 
   /// Collect variable information from the side table maintained by MF.
-  void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+  void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+                                      DenseSet<InlinedVariable> &P);
+
+protected:
+  /// Gather pre-function debug information.
+  void beginFunctionImpl(const MachineFunction *MF) override;
+
+  /// Gather and emit post-function debug information.
+  void endFunctionImpl(const MachineFunction *MF) override;
+
+  void skippedNonDebugFunction() override;
 
 public:
   //===--------------------------------------------------------------------===//
@@ -463,12 +465,6 @@ public:
   /// Emit all Dwarf sections that should come after the content.
   void endModule() override;
 
-  /// Gather pre-function debug information.
-  void beginFunction(const MachineFunction *MF) override;
-
-  /// Gather and emit post-function debug information.
-  void endFunction(const MachineFunction *MF) override;
-
   /// Process beginning of an instruction.
   void beginInstruction(const MachineInstr *MI) override;
 
@@ -515,6 +511,8 @@ public:
   /// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; } + bool shareAcrossDWOCUs() const; + /// Returns the Dwarf Version. uint16_t getDwarfVersion() const; @@ -555,6 +553,8 @@ public: /// A helper function to check whether the DIE for a given Scope is /// going to be null. bool isLexicalScopeDIENull(LexicalScope *Scope); + + bool hasDwarfPubSections(bool includeMinimalInlineScopes) const; }; } // End of namespace llvm diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 61b2c7e65842..ccd326917bfd 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -22,79 +22,80 @@ using namespace llvm; -void DwarfExpression::AddReg(int DwarfReg, const char *Comment) { - assert(DwarfReg >= 0 && "invalid negative dwarf register number"); - if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); +void DwarfExpression::addReg(int DwarfReg, const char *Comment) { + assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + assert((LocationKind == Unknown || LocationKind == Register) && + "location description already locked down"); + LocationKind = Register; + if (DwarfReg < 32) { + emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); } else { - EmitOp(dwarf::DW_OP_regx, Comment); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_regx, Comment); + emitUnsigned(DwarfReg); } } -void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) { +void DwarfExpression::addBReg(int DwarfReg, int Offset) { assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + assert(LocationKind != Register && "location description already locked down"); if (DwarfReg < 32) { - EmitOp(dwarf::DW_OP_breg0 + DwarfReg); + emitOp(dwarf::DW_OP_breg0 + DwarfReg); } else { - EmitOp(dwarf::DW_OP_bregx); - EmitUnsigned(DwarfReg); + emitOp(dwarf::DW_OP_bregx); + emitUnsigned(DwarfReg); } - EmitSigned(Offset); - if (Deref) - EmitOp(dwarf::DW_OP_deref); + emitSigned(Offset); +} + +void DwarfExpression::addFBReg(int Offset) { + emitOp(dwarf::DW_OP_fbreg); + emitSigned(Offset); } -void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { +void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { if (!SizeInBits) return; const unsigned SizeOfByte = 8; if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { - EmitOp(dwarf::DW_OP_bit_piece); - EmitUnsigned(SizeInBits); - EmitUnsigned(OffsetInBits); + emitOp(dwarf::DW_OP_bit_piece); + emitUnsigned(SizeInBits); + emitUnsigned(OffsetInBits); } else { - EmitOp(dwarf::DW_OP_piece); + emitOp(dwarf::DW_OP_piece); unsigned ByteSize = SizeInBits / SizeOfByte; - EmitUnsigned(ByteSize); + emitUnsigned(ByteSize); } this->OffsetInBits += SizeInBits; } -void DwarfExpression::AddShr(unsigned ShiftBy) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(ShiftBy); - EmitOp(dwarf::DW_OP_shr); +void DwarfExpression::addShr(unsigned ShiftBy) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(ShiftBy); + emitOp(dwarf::DW_OP_shr); } -bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI, - unsigned MachineReg, int Offset) { - if (isFrameRegister(TRI, MachineReg)) { - // If variable offset is based in frame register then use fbreg. 
- EmitOp(dwarf::DW_OP_fbreg); - EmitSigned(Offset); - return true; - } - - int DwarfReg = TRI.getDwarfRegNum(MachineReg, false); - if (DwarfReg < 0) - return false; - - AddRegIndirect(DwarfReg, Offset); - return true; +void DwarfExpression::addAnd(unsigned Mask) { + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Mask); + emitOp(dwarf::DW_OP_and); } -bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, unsigned MaxSize) { - if (!TRI.isPhysicalRegister(MachineReg)) + if (!TRI.isPhysicalRegister(MachineReg)) { + if (isFrameRegister(TRI, MachineReg)) { + DwarfRegs.push_back({-1, 0, nullptr}); + return true; + } return false; + } int Reg = TRI.getDwarfRegNum(MachineReg, false); // If this is a valid register number, emit it. if (Reg >= 0) { - AddReg(Reg); + DwarfRegs.push_back({Reg, 0, nullptr}); return true; } @@ -106,7 +107,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg); unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned RegOffset = TRI.getSubRegIdxOffset(Idx); - AddReg(Reg, "super-register"); + DwarfRegs.push_back({Reg, 0, "super-register"}); // Use a DW_OP_bit_piece to describe the sub-register. setSubRegisterPiece(Size, RegOffset); return true; @@ -116,8 +117,9 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, // Otherwise, attempt to find a covering set of sub-register numbers. // For example, Q0 on ARM is a composition of D0+D1. unsigned CurPos = 0; - // The size of the register in bits, assuming 8 bits per byte. - unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8; + // The size of the register in bits. + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg); + unsigned RegSize = TRI.getRegSizeInBits(*RC); // Keep track of the bits in the register we already emitted, so we // can avoid emitting redundant aliasing subregs. SmallBitVector Coverage(RegSize, false); @@ -136,100 +138,156 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI, // If this sub-register has a DWARF number and we haven't covered // its range, emit a DWARF piece for it. if (Reg >= 0 && Intersection.any()) { - AddReg(Reg, "sub-register"); + // Emit a piece for any gap in the coverage. + if (Offset > CurPos) + DwarfRegs.push_back({-1, Offset - CurPos, nullptr}); + DwarfRegs.push_back( + {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"}); if (Offset >= MaxSize) break; - // Emit a piece for the any gap in the coverage. - if (Offset > CurPos) - AddOpPiece(Offset - CurPos); - AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset)); - CurPos = Offset + Size; // Mark it as emitted. 
Coverage.set(Offset, Offset + Size); + CurPos = Offset + Size; } } return CurPos; } -void DwarfExpression::AddStackValue() { +void DwarfExpression::addStackValue() { if (DwarfVersion >= 4) - EmitOp(dwarf::DW_OP_stack_value); + emitOp(dwarf::DW_OP_stack_value); } -void DwarfExpression::AddSignedConstant(int64_t Value) { - EmitOp(dwarf::DW_OP_consts); - EmitSigned(Value); - AddStackValue(); +void DwarfExpression::addSignedConstant(int64_t Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + emitOp(dwarf::DW_OP_consts); + emitSigned(Value); } -void DwarfExpression::AddUnsignedConstant(uint64_t Value) { - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Value); - AddStackValue(); +void DwarfExpression::addUnsignedConstant(uint64_t Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Value); } -void DwarfExpression::AddUnsignedConstant(const APInt &Value) { +void DwarfExpression::addUnsignedConstant(const APInt &Value) { + assert(LocationKind == Implicit || LocationKind == Unknown); + LocationKind = Implicit; + unsigned Size = Value.getBitWidth(); const uint64_t *Data = Value.getRawData(); // Chop it up into 64-bit pieces, because that's the maximum that - // AddUnsignedConstant takes. + // addUnsignedConstant takes. unsigned Offset = 0; while (Offset < Size) { - AddUnsignedConstant(*Data++); + addUnsignedConstant(*Data++); if (Offset == 0 && Size <= 64) break; - AddOpPiece(std::min(Size-Offset, 64u), Offset); + addStackValue(); + addOpPiece(std::min(Size - Offset, 64u), Offset); Offset += 64; } } -bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI, +bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &ExprCursor, unsigned MachineReg, unsigned FragmentOffsetInBits) { - if (!ExprCursor) - return AddMachineReg(TRI, MachineReg); + auto Fragment = ExprCursor.getFragmentInfo(); + if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) { + LocationKind = Unknown; + return false; + } - // Pattern-match combinations for which more efficient representations exist - // first. - bool ValidReg = false; + bool HasComplexExpression = false; auto Op = ExprCursor.peek(); - switch (Op->getOp()) { - default: { - auto Fragment = ExprCursor.getFragmentInfo(); - ValidReg = AddMachineReg(TRI, MachineReg, - Fragment ? Fragment->SizeInBits : ~1U); - break; + if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment) + HasComplexExpression = true; + + // If the register can only be described by a complex expression (i.e., + // multiple subregisters) it doesn't safely compose with another complex + // expression. For example, it is not possible to apply a DW_OP_deref + // operation to multiple DW_OP_pieces. + if (HasComplexExpression && DwarfRegs.size() > 1) { + DwarfRegs.clear(); + LocationKind = Unknown; + return false; } - case dwarf::DW_OP_plus: - case dwarf::DW_OP_minus: { - // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset]. - // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset]. - auto N = ExprCursor.peekNext(); - if (N && N->getOp() == dwarf::DW_OP_deref) { - unsigned Offset = Op->getArg(0); - ValidReg = AddMachineRegIndirect( - TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); - ExprCursor.consume(2); - } else - ValidReg = AddMachineReg(TRI, MachineReg); - break; + + // Handle simple register locations. 
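// [Editor's sketch] The reworked addMachineReg above no longer emits ops
// directly; it records {DwarfRegNo, SizeInBits, Comment} entries in DwarfRegs
// and leaves rendering to the caller. For a hypothetical 128-bit register
// covered by two 64-bit halves with DWARF numbers 256 and 257 (numbers
// invented for illustration), the recorded and rendered forms would be:
//   DwarfRegs = {{256, 64, "sub-register"}, {257, 64, "sub-register"}}
//   => DW_OP_regx 256, DW_OP_piece 8, DW_OP_regx 257, DW_OP_piece 8
// A hole in sub-register coverage is recorded as {-1, GapSizeInBits, nullptr}
// and rendered as a bare DW_OP_piece. The simple-register path just below is
// exactly that rendering loop:
//   for (auto &Reg : DwarfRegs) {
//     if (Reg.DwarfRegNo >= 0)
//       addReg(Reg.DwarfRegNo, Reg.Comment);
//     addOpPiece(Reg.Size);
//   }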
+ if (LocationKind != Memory && !HasComplexExpression) { + for (auto &Reg : DwarfRegs) { + if (Reg.DwarfRegNo >= 0) + addReg(Reg.DwarfRegNo, Reg.Comment); + addOpPiece(Reg.Size); + } + DwarfRegs.clear(); + return true; } - case dwarf::DW_OP_deref: - // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg]. - ValidReg = AddMachineRegIndirect(TRI, MachineReg); + + // Don't emit locations that cannot be expressed without DW_OP_stack_value. + if (DwarfVersion < 4) + if (std::any_of(ExprCursor.begin(), ExprCursor.end(), + [](DIExpression::ExprOperand Op) -> bool { + return Op.getOp() == dwarf::DW_OP_stack_value; + })) { + DwarfRegs.clear(); + LocationKind = Unknown; + return false; + } + + assert(DwarfRegs.size() == 1); + auto Reg = DwarfRegs[0]; + bool FBReg = isFrameRegister(TRI, MachineReg); + int SignedOffset = 0; + assert(Reg.Size == 0 && "subregister has same size as superregister"); + + // Pattern-match combinations for which more efficient representations exist. + // [Reg, Offset, DW_OP_plus] --> [DW_OP_breg, Offset]. + // [Reg, Offset, DW_OP_minus] --> [DW_OP_breg, -Offset]. + // If Reg is a subregister we need to mask it out before subtracting. + if (Op && ((Op->getOp() == dwarf::DW_OP_plus) || + (Op->getOp() == dwarf::DW_OP_minus && !SubRegisterSizeInBits))) { + int Offset = Op->getArg(0); + SignedOffset = (Op->getOp() == dwarf::DW_OP_plus) ? Offset : -Offset; ExprCursor.take(); - break; } + if (FBReg) + addFBReg(SignedOffset); + else + addBReg(Reg.DwarfRegNo, SignedOffset); + DwarfRegs.clear(); + return true; +} - return ValidReg; +/// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?". +static bool isMemoryLocation(DIExpressionCursor ExprCursor) { + while (ExprCursor) { + auto Op = ExprCursor.take(); + switch (Op->getOp()) { + case dwarf::DW_OP_deref: + case dwarf::DW_OP_LLVM_fragment: + break; + default: + return false; + } + } + return true; } -void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, unsigned FragmentOffsetInBits) { + // If we need to mask out a subregister, do it now, unless the next + // operation would emit an OpPiece anyway. + auto N = ExprCursor.peek(); + if (SubRegisterSizeInBits && N && (N->getOp() != dwarf::DW_OP_LLVM_fragment)) + maskSubRegister(); + while (ExprCursor) { auto Op = ExprCursor.take(); switch (Op->getOp()) { @@ -241,49 +299,94 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor, // location. assert(OffsetInBits >= FragmentOffset && "fragment offset not added?"); - // If \a AddMachineReg already emitted DW_OP_piece operations to represent + // If addMachineReg already emitted DW_OP_piece operations to represent // a super-register by splicing together sub-registers, subtract the size // of the pieces that was already emitted. SizeInBits -= OffsetInBits - FragmentOffset; - // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a + // If addMachineReg requested a DW_OP_bit_piece to stencil out a // sub-register that is smaller than the current fragment's size, use it. if (SubRegisterSizeInBits) SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits); - - AddOpPiece(SizeInBits, SubRegisterOffsetInBits); + + // Emit a DW_OP_stack_value for implicit location descriptions. + if (LocationKind == Implicit) + addStackValue(); + + // Emit the DW_OP_piece. + addOpPiece(SizeInBits, SubRegisterOffsetInBits); setSubRegisterPiece(0, 0); - break; + // Reset the location description kind. 
+ LocationKind = Unknown; + return; } case dwarf::DW_OP_plus: - EmitOp(dwarf::DW_OP_plus_uconst); - EmitUnsigned(Op->getArg(0)); + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_plus_uconst); + emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_minus: - // There is no OP_minus_uconst. - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); - EmitOp(dwarf::DW_OP_minus); + assert(LocationKind != Register); + // There is no DW_OP_minus_uconst. + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Op->getArg(0)); + emitOp(dwarf::DW_OP_minus); break; - case dwarf::DW_OP_deref: - EmitOp(dwarf::DW_OP_deref); + case dwarf::DW_OP_deref: { + assert(LocationKind != Register); + if (LocationKind != Memory && isMemoryLocation(ExprCursor)) + // Turning this into a memory location description makes the deref + // implicit. + LocationKind = Memory; + else + emitOp(dwarf::DW_OP_deref); break; + } case dwarf::DW_OP_constu: - EmitOp(dwarf::DW_OP_constu); - EmitUnsigned(Op->getArg(0)); + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_constu); + emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_stack_value: - AddStackValue(); + LocationKind = Implicit; + break; + case dwarf::DW_OP_swap: + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_swap); + break; + case dwarf::DW_OP_xderef: + assert(LocationKind != Register); + emitOp(dwarf::DW_OP_xderef); break; default: llvm_unreachable("unhandled opcode found in expression"); } } + + if (LocationKind == Implicit) + // Turn this into an implicit location description. + addStackValue(); +} + +/// add masking operations to stencil out a subregister. +void DwarfExpression::maskSubRegister() { + assert(SubRegisterSizeInBits && "no subregister was registered"); + if (SubRegisterOffsetInBits > 0) + addShr(SubRegisterOffsetInBits); + uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL; + addAnd(Mask); } + void DwarfExpression::finalize() { - if (SubRegisterSizeInBits) - AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); + assert(DwarfRegs.size() == 0 && "dwarf registers not emitted"); + // Emit any outstanding DW_OP_piece operations to mask out subregisters. + if (SubRegisterSizeInBits == 0) + return; + // Don't emit a DW_OP_piece for a subregister at offset 0. + if (SubRegisterOffsetInBits == 0) + return; + addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits); } void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { @@ -294,6 +397,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) { assert(FragmentOffset >= OffsetInBits && "overlapping or duplicate fragments"); if (FragmentOffset > OffsetInBits) - AddOpPiece(FragmentOffset - OffsetInBits); + addOpPiece(FragmentOffset - OffsetInBits); OffsetInBits = FragmentOffset; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index fd90fa05bc32..de8613200067 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -72,6 +72,8 @@ public: } /// Determine whether there are any operations left in this expression. operator bool() const { return Start != End; } + DIExpression::expr_op_iterator begin() const { return Start; } + DIExpression::expr_op_iterator end() const { return End; } /// Retrieve the fragment information, if any. Optional<DIExpression::FragmentInfo> getFragmentInfo() const { @@ -84,14 +86,27 @@ public: /// entry. 
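// [Editor's note] The LocationKind state introduced above distinguishes the
// three DWARF location-description flavors. For a value in a register with
// hypothetical DWARF number 5, targeting DWARF 4 or later:
//   Register: DW_OP_reg5                         -- value lives in the register
//   Memory:   DW_OP_breg5 0                      -- value lives at [reg5]
//   Implicit: DW_OP_constu 42, DW_OP_stack_value -- value is recomputed
// addExpression folds a trailing run of DW_OP_derefs into the Memory kind
// instead of emitting them (see isMemoryLocation), terminates Implicit
// descriptions with DW_OP_stack_value, and addReg asserts that it is never
// called once a Memory or Implicit description has been locked down.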
class DwarfExpression { protected: - unsigned DwarfVersion; + /// Holds information about all subregisters comprising a register location. + struct Register { + int DwarfRegNo; + unsigned Size; + const char *Comment; + }; + + /// The register location, if any. + SmallVector<Register, 2> DwarfRegs; + /// Current Fragment Offset in Bits. uint64_t OffsetInBits = 0; + unsigned DwarfVersion; /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister. unsigned SubRegisterSizeInBits = 0; unsigned SubRegisterOffsetInBits = 0; + /// The kind of location description being produced. + enum { Unknown = 0, Register, Memory, Implicit } LocationKind = Unknown; + /// Push a DW_OP_piece / DW_OP_bit_piece for emitting later, if one is needed /// to represent a subregister. void setSubRegisterPiece(unsigned SizeInBits, unsigned OffsetInBits) { @@ -99,35 +114,55 @@ protected: SubRegisterOffsetInBits = OffsetInBits; } -public: - DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - virtual ~DwarfExpression() {}; - - /// This needs to be called last to commit any pending changes. - void finalize(); + /// Add masking operations to stencil out a subregister. + void maskSubRegister(); /// Output a dwarf operand and an optional assembler comment. - virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0; + virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0; /// Emit a raw signed value. - virtual void EmitSigned(int64_t Value) = 0; + virtual void emitSigned(int64_t Value) = 0; /// Emit a raw unsigned value. - virtual void EmitUnsigned(uint64_t Value) = 0; + virtual void emitUnsigned(uint64_t Value) = 0; /// Return whether the given machine register is the frame register in the /// current function. virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0; - /// Emit a dwarf register operation. - void AddReg(int DwarfReg, const char *Comment = nullptr); - /// Emit an (double-)indirect dwarf register operation. - void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false); + /// Emit a DW_OP_reg operation. Note that this is only legal inside a DWARF + /// register location description. + void addReg(int DwarfReg, const char *Comment = nullptr); + /// Emit a DW_OP_breg operation. + void addBReg(int DwarfReg, int Offset); + /// Emit DW_OP_fbreg <Offset>. + void addFBReg(int Offset); + + /// Emit a partial DWARF register operation. + /// + /// \param MachineReg The register number. + /// \param MaxSize If the register must be composed from + /// sub-registers this is an upper bound + /// for how many bits the emitted DW_OP_piece + /// may cover. + /// + /// If size and offset is zero an operation for the entire register is + /// emitted: Some targets do not provide a DWARF register number for every + /// register. If this is the case, this function will attempt to emit a DWARF + /// register by emitting a fragment of a super-register or by piecing together + /// multiple subregisters that alias the register. + /// + /// \return false if no DWARF register exists for MachineReg. + bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, + unsigned MaxSize = ~1U); + /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment. /// \param OffsetInBits This is an optional offset into the location that /// is at the top of the DWARF stack. 
- void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); + void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0); - /// Emit a shift-right dwarf expression. - void AddShr(unsigned ShiftBy); + /// Emit a shift-right dwarf operation. + void addShr(unsigned ShiftBy); + /// Emit a bitwise and dwarf operation. + void addAnd(unsigned Mask); /// Emit a DW_OP_stack_value, if supported. /// @@ -140,48 +175,39 @@ public: /// constant value, so the producers and consumers started to rely on /// heuristics to disambiguate the value vs. location status of the /// expression. See PR21176 for more details. - void AddStackValue(); + void addStackValue(); - /// Emit an indirect dwarf register operation for the given machine register. - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg, - int Offset = 0); + ~DwarfExpression() = default; +public: + DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {} - /// Emit a partial DWARF register operation. - /// - /// \param MachineReg The register number. - /// \param MaxSize If the register must be composed from - /// sub-registers this is an upper bound - /// for how many bits the emitted DW_OP_piece - /// may cover. - /// - /// If size and offset is zero an operation for the entire register is - /// emitted: Some targets do not provide a DWARF register number for every - /// register. If this is the case, this function will attempt to emit a DWARF - /// register by emitting a fragment of a super-register or by piecing together - /// multiple subregisters that alias the register. - /// - /// \return false if no DWARF register exists for MachineReg. - bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg, - unsigned MaxSize = ~1U); + /// This needs to be called last to commit any pending changes. + void finalize(); /// Emit a signed constant. - void AddSignedConstant(int64_t Value); + void addSignedConstant(int64_t Value); /// Emit an unsigned constant. - void AddUnsignedConstant(uint64_t Value); + void addUnsignedConstant(uint64_t Value); /// Emit an unsigned constant. - void AddUnsignedConstant(const APInt &Value); + void addUnsignedConstant(const APInt &Value); + + /// Lock this down to become a memory location description. + void setMemoryLocationKind() { + assert(LocationKind == Unknown); + LocationKind = Memory; + } /// Emit a machine register location. As an optimization this may also consume /// the prefix of a DwarfExpression if a more efficient representation for /// combining the register location and the first operation exists. /// - /// \param FragmentOffsetInBits If this is one fragment out of a fragmented + /// \param FragmentOffsetInBits If this is one fragment out of a + /// fragmented /// location, this is the offset of the /// fragment inside the entire variable. /// \return false if no DWARF register exists /// for MachineReg. - bool AddMachineRegExpression(const TargetRegisterInfo &TRI, + bool addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &Expr, unsigned MachineReg, unsigned FragmentOffsetInBits = 0); /// Emit all remaining operations in the DIExpressionCursor. @@ -189,7 +215,7 @@ public: /// \param FragmentOffsetInBits If this is one fragment out of multiple /// locations, this is the offset of the /// fragment inside the entire variable. 
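// [Editor's sketch] The lowercase emitOp/emitSigned/emitUnsigned hooks plus
// isFrameRegister are all a sink has to provide; the op-building logic stays
// in DwarfExpression. A hypothetical debugging sink (name and output format
// invented for illustration):
//
//   class DumpingDwarfExpression final : public DwarfExpression {
//     void emitOp(uint8_t Op, const char *Comment = nullptr) override {
//       llvm::errs() << "op " << unsigned(Op) << ' '
//                    << (Comment ? Comment : "") << '\n';
//     }
//     void emitSigned(int64_t V) override { llvm::errs() << " s " << V << '\n'; }
//     void emitUnsigned(uint64_t V) override { llvm::errs() << " u " << V << '\n'; }
//     bool isFrameRegister(const TargetRegisterInfo &, unsigned) override {
//       return false; // assume no frame register for the illustration
//     }
//   public:
//     explicit DumpingDwarfExpression(unsigned V) : DwarfExpression(V) {}
//   };
//
// The destructor becoming protected and non-virtual (see below) signals that
// concrete sinks are final and used by value, as DebugLocDwarfExpression and
// DIEDwarfExpression now are.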
- void AddExpression(DIExpressionCursor &&Expr, + void addExpression(DIExpressionCursor &&Expr, unsigned FragmentOffsetInBits = 0); /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to @@ -198,33 +224,32 @@ public: }; /// DwarfExpression implementation for .debug_loc entries. -class DebugLocDwarfExpression : public DwarfExpression { +class DebugLocDwarfExpression final : public DwarfExpression { ByteStreamer &BS; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; + bool isFrameRegister(const TargetRegisterInfo &TRI, + unsigned MachineReg) override; public: DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS) : DwarfExpression(DwarfVersion), BS(BS) {} - - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; - bool isFrameRegister(const TargetRegisterInfo &TRI, - unsigned MachineReg) override; }; /// DwarfExpression implementation for singular DW_AT_location. -class DIEDwarfExpression : public DwarfExpression { +class DIEDwarfExpression final : public DwarfExpression { const AsmPrinter &AP; DwarfUnit &DU; DIELoc &DIE; -public: - DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); - void EmitOp(uint8_t Op, const char *Comment = nullptr) override; - void EmitSigned(int64_t Value) override; - void EmitUnsigned(uint64_t Value) override; + void emitOp(uint8_t Op, const char *Comment = nullptr) override; + void emitSigned(int64_t Value) override; + void emitUnsigned(uint64_t Value) override; bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) override; +public: + DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE); DIELoc *finalize() { DwarfExpression::finalize(); return &DIE; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index d4d2ed277274..54924e9806ed 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -53,6 +53,7 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap<const MDNode *, DIE *> AbstractSPDies; + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; /// Maps MDNodes for type system with the corresponding DIEs. 
These DIEs can /// be shared across CUs, that is why we keep the map here instead @@ -105,6 +106,9 @@ public: DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { return AbstractSPDies; } + DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() { + return AbstractVariables; + } void insertDIE(const MDNode *TypeMD, DIE *Die) { DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2a866c071f59..667afbb450bd 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Metadata.h" #include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" @@ -54,15 +55,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, : DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU), DIE(DIE) {} -void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) { +void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) { DU.addUInt(DIE, dwarf::DW_FORM_data1, Op); } -void DIEDwarfExpression::EmitSigned(int64_t Value) { +void DIEDwarfExpression::emitSigned(int64_t Value) { DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value); } -void DIEDwarfExpression::EmitUnsigned(uint64_t Value) { +void DIEDwarfExpression::emitUnsigned(uint64_t Value) { DU.addUInt(DIE, dwarf::DW_FORM_udata, Value); } @@ -73,8 +74,8 @@ bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI, DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) - : DIEUnit(A->getDwarfVersion(), A->getPointerSize(), UnitTag), CUNode(Node), - Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) { + : DIEUnit(A->getDwarfVersion(), A->MAI->getCodePointerSize(), UnitTag), + CUNode(Node), Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) { } DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A, @@ -98,25 +99,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const { default: break; - case dwarf::DW_LANG_C89: - case dwarf::DW_LANG_C99: + // The languages below have valid values in all DWARF versions. case dwarf::DW_LANG_C: + case dwarf::DW_LANG_C89: case dwarf::DW_LANG_C_plus_plus: - case dwarf::DW_LANG_ObjC: - case dwarf::DW_LANG_ObjC_plus_plus: return 0; case dwarf::DW_LANG_Fortran77: case dwarf::DW_LANG_Fortran90: - case dwarf::DW_LANG_Fortran95: return 1; - // The languages below have valid values only if the DWARF version >= 4. + // The languages below have valid values only if the DWARF version >= 3. + case dwarf::DW_LANG_C99: + case dwarf::DW_LANG_ObjC: + case dwarf::DW_LANG_ObjC_plus_plus: + if (DD->getDwarfVersion() >= 3) + return 0; + break; + + case dwarf::DW_LANG_Fortran95: + if (DD->getDwarfVersion() >= 3) + return 1; + break; + + // Starting with DWARF v4, all defined languages have valid values. 
+ case dwarf::DW_LANG_D: case dwarf::DW_LANG_Java: case dwarf::DW_LANG_Python: case dwarf::DW_LANG_UPC: - case dwarf::DW_LANG_D: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 0; break; @@ -127,31 +138,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const { case dwarf::DW_LANG_Modula2: case dwarf::DW_LANG_Pascal83: case dwarf::DW_LANG_PLI: - if (dwarf::DWARF_VERSION >= 4) + if (DD->getDwarfVersion() >= 4) return 1; break; - // The languages below have valid values only if the DWARF version >= 5. - case dwarf::DW_LANG_OpenCL: - case dwarf::DW_LANG_Go: - case dwarf::DW_LANG_Haskell: + // The languages below are new in DWARF v5. + case dwarf::DW_LANG_BLISS: + case dwarf::DW_LANG_C11: case dwarf::DW_LANG_C_plus_plus_03: case dwarf::DW_LANG_C_plus_plus_11: + case dwarf::DW_LANG_C_plus_plus_14: + case dwarf::DW_LANG_Dylan: + case dwarf::DW_LANG_Go: + case dwarf::DW_LANG_Haskell: case dwarf::DW_LANG_OCaml: + case dwarf::DW_LANG_OpenCL: + case dwarf::DW_LANG_RenderScript: case dwarf::DW_LANG_Rust: - case dwarf::DW_LANG_C11: case dwarf::DW_LANG_Swift: - case dwarf::DW_LANG_Dylan: - case dwarf::DW_LANG_C_plus_plus_14: - if (dwarf::DWARF_VERSION >= 5) + if (DD->getDwarfVersion() >= 5) return 0; break; - case dwarf::DW_LANG_Modula3: - case dwarf::DW_LANG_Julia: case dwarf::DW_LANG_Fortran03: case dwarf::DW_LANG_Fortran08: - if (dwarf::DWARF_VERSION >= 5) + case dwarf::DW_LANG_Julia: + case dwarf::DW_LANG_Modula3: + if (DD->getDwarfVersion() >= 5) return 1; break; } @@ -160,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const { } /// Check whether the DIE for this MDNode can be shared across CUs. -static bool isShareableAcrossCUs(const DINode *D) { +bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const { // When the MDNode can be part of the type system, the DIE can be shared // across CUs. // Combining type units and cross-CU DIE sharing is lower value (since @@ -168,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) { // level already) but may be implementable for some value in projects // building multiple independent libraries with LTO and then linking those // together. 
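// [Editor's note on the getDefaultLowerBound rewrite above] The returned
// default matters when emitting DW_TAG_subrange_type: the lower-bound
// attribute is omitted when it matches the language default (0 for the C
// family, 1 for the Fortran/Pascal family). Simplified consumer sketch, not
// the verbatim subrange-construction code:
//   int64_t DefaultLB = getDefaultLowerBound();
//   if (LowerBound != DefaultLB)
//     addSInt(SubrangeDie, dwarf::DW_AT_lower_bound, None, LowerBound);
// Keying the guards on DD->getDwarfVersion() instead of the compile-time
// dwarf::DWARF_VERSION constant makes the default track the DWARF version
// actually being emitted for this module.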
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return false; return (isa<DIType>(D) || (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) && !GenerateDwarfTypeUnits; @@ -285,13 +300,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) { dwarf::DW_FORM_ref_sig8, DIEInteger(Signature)); } -void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier) { - uint64_t Signature = DD->makeTypeSignature(Identifier); - Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, - DIEInteger(Signature)); -} - void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry) { const DIEUnit *CU = Die.getUnit(); @@ -369,10 +377,6 @@ void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) { addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory()); } -void DwarfUnit::addSourceLine(DIE &Die, const DINamespace *NS) { - addSourceLine(Die, NS->getLine(), NS->getFilename(), NS->getDirectory()); -} - /* Byref variables, in Blocks, are declared by the programmer as "SomeType VarName;", but the compiler creates a __Block_byref_x_VarName struct, and gives the variable VarName either the struct, or a pointer to the struct, as @@ -465,50 +469,48 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Decode the original location, and use that as the start of the byref // variable's location. DIELoc *Loc = new (DIEValueAllocator) DIELoc; - SmallVector<uint64_t, 6> DIExpr; - DIEDwarfExpression Expr(*Asm, *this, *Loc); - - bool validReg; - if (Location.isReg()) - validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg()); - else - validReg = - Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(), - Location.getReg(), Location.getOffset()); - - if (!validReg) - return; - + DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc); + if (Location.isIndirect()) + DwarfExpr.setMemoryLocationKind(); + + SmallVector<uint64_t, 9> Ops; + if (Location.isIndirect() && Location.getOffset()) { + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(Location.getOffset()); + } // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). if (isPointer) - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Next add the offset for the '__forwarding' field: // DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in // adding the offset if it's 0. if (forwardingFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(forwardingFieldOffset); + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(forwardingFieldOffset); } // Now dereference the __forwarding field to get to the real __Block_byref // struct: DW_OP_deref. - DIExpr.push_back(dwarf::DW_OP_deref); + Ops.push_back(dwarf::DW_OP_deref); // Now that we've got the real __Block_byref... struct, add the offset // for the variable's field to get to the location of the actual variable: // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0. 
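// [Editor's illustration of the byref rework above] Assume the byref struct
// pointer sits at [FrameReg + 16] (so isPointer is true) and the hypothetical
// field offsets are forwardingFieldOffset = 8 and varFieldOffset = 24. The
// Ops vector built here becomes
//   DW_OP_plus 16, DW_OP_deref, DW_OP_plus 8, DW_OP_deref, DW_OP_plus 24
// and once addMachineRegExpression folds the leading constant into the base
// register, the emitted description is roughly
//   DW_OP_fbreg +16, DW_OP_deref, DW_OP_plus_uconst 8,
//   DW_OP_deref, DW_OP_plus_uconst 24
// i.e. load the pointer, follow __forwarding to the live copy, then address
// the variable's field -- the same navigation as before, now routed through
// the shared DIExpressionCursor path instead of ad-hoc emission.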
if (varFieldOffset > 0) { - DIExpr.push_back(dwarf::DW_OP_plus); - DIExpr.push_back(varFieldOffset); + Ops.push_back(dwarf::DW_OP_plus); + Ops.push_back(varFieldOffset); } - Expr.AddExpression(makeArrayRef(DIExpr)); - Expr.finalize(); + + DIExpressionCursor Cursor(Ops); + const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); + if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg())) + return; + DwarfExpr.addExpression(std::move(Cursor)); // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); + addBlock(Die, Attribute, DwarfExpr.finalize()); } /// Return true if type encoding is unsigned. @@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, - GlobalValue::getRealLinkageName(LinkageName)); + GlobalValue::dropLLVMManglingEscape(LinkageName)); } void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { @@ -658,6 +660,14 @@ void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { } } +/// Add thrown types. +void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) { + for (const auto *Ty : ThrownTypes) { + DIE &TT = createAndAddDIE(dwarf::DW_TAG_thrown_type, Die); + addType(TT, cast<DIType>(Ty)); + } +} + DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { if (!Context || isa<DIFile>(Context)) return &getUnitDie(); @@ -672,7 +682,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { return getDIE(Context); } -DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { +DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) { auto *Context = resolve(Ty->getScope()); DIE *ContextDIE = getOrCreateContextDIE(Context); @@ -684,8 +694,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); - if (!Ty->isExternalTypeRef()) - updateAcceleratorTables(Context, Ty, TyDIE); + updateAcceleratorTables(Context, Ty, TyDIE); return &TyDIE; } @@ -841,6 +850,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // Add source line info if available and TyDesc is not a forward declaration. if (!DTy->isForwardDecl()) addSourceLine(Buffer, DTy); + + // If DWARF address space value is other than None, add it for pointer and + // reference types as DW_AT_address_class. + if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type || + Tag == dwarf::DW_TAG_reference_type)) + addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4, + DTy->getDWARFAddressSpace().getValue()); } void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { @@ -892,13 +908,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { - if (CTy->isExternalTypeRef()) { - StringRef Identifier = CTy->getIdentifier(); - assert(!Identifier.empty() && "external type ref without identifier"); - addFlag(Buffer, dwarf::DW_AT_declaration); - return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); - } - // Add name if not anonymous or intermediate type. 
StringRef Name = CTy->getName(); @@ -1074,7 +1083,6 @@ DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) { Name = "(anonymous namespace)"; DD->addAccelNamespace(Name, NDie); addGlobalName(Name, NDie, NS->getScope()); - addSourceLine(NDie, NS); if (NS->getExportSymbols()) addFlag(NDie, dwarf::DW_AT_export_symbols); return &NDie; @@ -1180,8 +1188,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, } void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal) { - if (!Minimal) + bool SkipSPAttributes) { + // If -fdebug-info-for-profiling is enabled, need to emit the subprogram + // and its source location. + bool SkipSPSourceLocation = SkipSPAttributes && + !CUNode->getDebugInfoForProfiling(); + if (!SkipSPSourceLocation) if (applySubprogramDefinitionAttributes(SP, SPDie)) return; @@ -1189,12 +1201,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (!SP->getName().empty()) addString(SPDie, dwarf::DW_AT_name, SP->getName()); + if (!SkipSPSourceLocation) + addSourceLine(SPDie, SP); + // Skip the rest of the attributes under -gmlt to save space. - if (Minimal) + if (SkipSPAttributes) return; - addSourceLine(SPDie, SP); - // Add the prototype if we have a prototype and we have a C like // language. uint16_t Language = getLanguage(); @@ -1241,6 +1254,8 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, constructSubprogramArguments(SPDie, Args); } + addThrownTypes(SPDie, SP->getThrownTypes()); + if (SP->isArtificial()) addFlag(SPDie, dwarf::DW_AT_artificial); @@ -1526,18 +1541,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { return &StaticMemberDIE; } -void DwarfUnit::emitHeader(bool UseOffsets) { +void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) { // Emit size of content not including length itself Asm->OutStreamer->AddComment("Length of Unit"); Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize()); Asm->OutStreamer->AddComment("DWARF version number"); - Asm->EmitInt16(DD->getDwarfVersion()); - Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); + unsigned Version = DD->getDwarfVersion(); + Asm->EmitInt16(Version); + + // DWARF v5 reorders the address size and adds a unit type. + if (Version >= 5) { + Asm->OutStreamer->AddComment("DWARF Unit Type"); + Asm->EmitInt8(UT); + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->MAI->getCodePointerSize()); + } // We share one abbreviations table across all units so it's always at the // start of the section. Use a relocatable offset where needed to ensure // linking doesn't invalidate that offset. + Asm->OutStreamer->AddComment("Offset Into Abbrev. Section"); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); if (UseOffsets) Asm->EmitInt32(0); @@ -1545,12 +1569,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) { Asm->emitDwarfSymbolReference( TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false); - Asm->OutStreamer->AddComment("Address Size (in bytes)"); - Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); + if (Version <= 4) { + Asm->OutStreamer->AddComment("Address Size (in bytes)"); + Asm->EmitInt8(Asm->MAI->getCodePointerSize()); + } } void DwarfTypeUnit::emitHeader(bool UseOffsets) { - DwarfUnit::emitHeader(UseOffsets); + DwarfUnit::emitCommonHeader(UseOffsets, + DD->useSplitDwarf() ? 
dwarf::DW_UT_split_type + : dwarf::DW_UT_type); Asm->OutStreamer->AddComment("Type Signature"); Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature)); Asm->OutStreamer->AddComment("Type DIE Offset"); @@ -1564,3 +1592,21 @@ bool DwarfTypeUnit::isDwoUnit() const { // when split DWARF is being used. return DD->useSplitDwarf(); } + +void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalNameForTypeUnit(Name, Context); +} + +void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) { + getCU().addGlobalTypeUnitType(Ty, Context); +} + +const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const { + if (!Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return nullptr; + if (isDwoUnit()) + return nullptr; + return getSection()->getBeginSymbol(); +} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8654d6f0caf4..7acad2cbd89f 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -65,7 +65,7 @@ public: //===----------------------------------------------------------------------===// /// This dwarf writer support class manages information associated with a /// source file. - class DwarfUnit : public DIEUnit { +class DwarfUnit : public DIEUnit { protected: /// MDNode for the compile unit. const DICompileUnit *CUNode; @@ -103,9 +103,10 @@ protected: bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); -public: - virtual ~DwarfUnit(); + bool shareAcrossDWOCUs() const; + bool isShareableAcrossCUs(const DINode *D) const; +public: // Accessors. AsmPrinter* getAsmPrinter() const { return Asm; } uint16_t getLanguage() const { return CUNode->getSourceLanguage(); } @@ -124,12 +125,12 @@ public: std::string getParentContextString(const DIScope *Context) const; /// Add a new global name to the compile unit. - virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) { - } + virtual void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) = 0; /// Add a new global type to the compile unit. virtual void addGlobalType(const DIType *Ty, const DIE &Die, - const DIScope *Context) {} + const DIScope *Context) = 0; /// Returns the DIE map slot for the specified debug variable. /// @@ -198,9 +199,6 @@ public: /// Add a type's DW_AT_signature and set the declaration flag. void addDIETypeSignature(DIE &Die, uint64_t Signature); - /// Add an attribute containing the type signature for a unique identifier. - void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, - StringRef Identifier); /// Add block data. void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); @@ -215,7 +213,6 @@ public: void addSourceLine(DIE &Die, const DIGlobalVariable *G); void addSourceLine(DIE &Die, const DISubprogram *SP); void addSourceLine(DIE &Die, const DIType *Ty); - void addSourceLine(DIE &Die, const DINamespace *NS); void addSourceLine(DIE &Die, const DIObjCProperty *Ty); /// Add constant value entry in variable DIE. @@ -235,6 +232,9 @@ public: /// Add template parameters in buffer. void addTemplateParams(DIE &Buffer, DINodeArray TParams); + /// Add thrown types. + void addThrownTypes(DIE &Die, DINodeArray ThrownTypes); + // FIXME: Should be reformulated in terms of addComplexAddress. 
/// Start with the address based on the location provided, and generate the /// DWARF information necessary to find the actual Block variable (navigating @@ -256,15 +256,12 @@ public: DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false); void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, - bool Minimal = false); + bool SkipSPAttributes = false); /// Find existing DIE or create new DIE for the given type. DIE *getOrCreateTypeDIE(const MDNode *N); /// Get context owner's DIE. - DIE *createTypeDIE(const DICompositeType *Ty); - - /// Get context owner's DIE. DIE *getOrCreateContextDIE(const DIScope *Context); /// Construct DIEs for types that contain vtables. @@ -282,17 +279,21 @@ public: virtual unsigned getHeaderSize() const { return sizeof(int16_t) + // DWARF version number sizeof(int32_t) + // Offset Into Abbrev. Section - sizeof(int8_t); // Pointer Size (in bytes) + sizeof(int8_t) + // Pointer Size (in bytes) + (DD->getDwarfVersion() >= 5 ? sizeof(int8_t) + : 0); // DWARF v5 unit type } /// Emit the header for this unit, not including the initial length field. - virtual void emitHeader(bool UseOffsets); + virtual void emitHeader(bool UseOffsets) = 0; virtual DwarfCompileUnit &getCU() = 0; void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); protected: + ~DwarfUnit(); + /// Create new static data member DIE. DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT); @@ -306,6 +307,14 @@ protected: return Ref.resolve(); } + /// If this is a named finished type then include it in the list of types for + /// the accelerator tables. + void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, + const DIE &TyDIE); + + /// Emit the common part of the header for this unit. + void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT); + private: void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy); void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy); @@ -330,15 +339,11 @@ private: /// Set D as anonymous type for index which can be reused later. void setIndexTyDie(DIE *D) { IndexTyDie = D; } - /// If this is a named finished type then include it in the list of types for - /// the accelerator tables. - void updateAcceleratorTables(const DIScope *Context, const DIType *Ty, - const DIE &TyDIE); - virtual bool isDwoUnit() const = 0; + const MCSymbol *getCrossSectionRelativeBaseAddress() const override; }; -class DwarfTypeUnit : public DwarfUnit { +class DwarfTypeUnit final : public DwarfUnit { uint64_t TypeSignature; const DIE *Ty; DwarfCompileUnit &CU; @@ -354,12 +359,19 @@ public: void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; } void setType(const DIE *Ty) { this->Ty = Ty; } + /// Get context owner's DIE. + DIE *createTypeDIE(const DICompositeType *Ty); + /// Emit the header for this unit, not including the initial length field. 
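// [Editor's note on emitCommonHeader/getHeaderSize above] DWARF v5 adds a
// unit-type octet and moves the address size ahead of the abbrev offset, so
// the fixed header following the initial length field changes from
//   v2-v4: version:2, abbrev_offset:4, address_size:1              (7 bytes)
//   v5:    version:2, unit_type:1, address_size:1, abbrev_offset:4 (8 bytes)
// which is exactly the extra sizeof(int8_t) that getHeaderSize adds for
// versions >= 5. Type units then append signature:8 and type_offset:4 on
// top, and split-DWARF type units are now tagged DW_UT_split_type rather
// than DW_UT_type.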
void emitHeader(bool UseOffsets) override; unsigned getHeaderSize() const override { return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature sizeof(uint32_t); // Type DIE Offset } + void addGlobalName(StringRef Name, const DIE &Die, + const DIScope *Context) override; + void addGlobalType(const DIType *Ty, const DIE &Die, + const DIScope *Context) override; DwarfCompileUnit &getCU() override { return CU; } }; } // end llvm namespace diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index 6a023b998b32..342efc3611c7 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -1,4 +1,4 @@ -//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===// +//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===// // // The LLVM Compiler Infrastructure // @@ -14,21 +14,19 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCMetadataPrinter.h" +#include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCs.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Metadata.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/ELF.h" using namespace llvm; @@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter { public: void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override; }; -} + +} // end anonymous namespace static GCMetadataPrinterRegistry::Add<ErlangGCPrinter> X("erlang", "erlang-compatible garbage collector"); -void llvm::linkErlangGCPrinter() {} - void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { MCStreamer &OS = *AP.OutStreamer; @@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, } } } + +void llvm::linkErlangGCPrinter() {} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 9d7c96a1b8ef..815658bfb637 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) { const Function *F = MF->getFunction(); - shouldEmitMoves = Asm->needsSEHMoves(); + shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI(); const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); unsigned PerEncoding = TLOF.getPersonalityEncoding(); @@ -94,14 +94,14 @@ void WinException::beginFunction(const MachineFunction *MF) { // If we're not using CFI, we don't want the CFI or the personality, but we // might want EH tables if we had EH pads. - if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) { + if (!Asm->MAI->usesWindowsCFI()) { if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) { // If this is 32-bit SEH and we don't have any funclets (really invokes), // make sure we emit the parent offset label. 
Some unreferenced filter // functions may still refer to it. const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); } shouldEmitLSDA = hasEHFunclets; @@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm, // their funclet entry block's number. const MachineFunction *MF = MBB->getParent(); const Function *F = MF->getFunction(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCContext &Ctx = MF->getContext(); StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch"; return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" + @@ -252,7 +252,7 @@ void WinException::endFunclet() { !CurrentFuncletEntry->isCleanupFuncletEntry()) { // If this is a C++ catch funclet (or the parent function), // emit a reference to the LSDA for the parent function. - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( Twine("$cppxdata$", FuncLinkageName)); Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); @@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { // Emit a label assignment with the SEH frame offset so we can use it for // llvm.x86.seh.recoverfp. StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); MCSymbol *ParentFrameOffset = Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); const MCExpr *MCOffset = @@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { auto &OS = *Asm->OutStreamer; const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; @@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, void WinException::emitExceptHandlerTable(const MachineFunction *MF) { MCStreamer &OS = *Asm->OutStreamer; const Function *F = MF->getFunction(); - StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); bool VerboseAsm = OS.isVerboseAsm(); auto AddComment = [&](const Twine &Comment) { diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp index bf5cf105a8f8..344136b1f195 100644 --- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -35,20 +36,17 @@ using namespace llvm; namespace { class AtomicExpand: public FunctionPass { - const TargetMachine *TM; const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid - explicit AtomicExpand(const 
TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + AtomicExpand() : FunctionPass(ID), TLI(nullptr) { initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad); + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); bool tryExpandAtomicLoad(LoadInst *LI); @@ -98,12 +96,10 @@ namespace { char AtomicExpand::ID = 0; char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions", - false, false) +INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", + false, false) -FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) { - return new AtomicExpand(TM); -} +FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } namespace { // Helper functions to retrieve the size of atomic instructions. @@ -173,9 +169,14 @@ bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { } // end anonymous namespace bool AtomicExpand::runOnFunction(Function &F) { - if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand()) + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<TargetMachine>(); + if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + TLI = TM.getSubtargetImpl(F)->getTargetLowering(); SmallVector<Instruction *, 1> AtomicInsts; @@ -224,22 +225,16 @@ bool AtomicExpand::runOnFunction(Function &F) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - bool IsStore, IsLoad; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); - IsStore = false; - IsLoad = true; } else if (SI && isReleaseOrStronger(SI->getOrdering())) { FenceOrdering = SI->getOrdering(); SI->setOrdering(AtomicOrdering::Monotonic); - IsStore = true; - IsLoad = false; } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || isAcquireOrStronger(RMWI->getOrdering()))) { FenceOrdering = RMWI->getOrdering(); RMWI->setOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) && (isReleaseOrStronger(CASI->getSuccessOrdering()) || isAcquireOrStronger(CASI->getSuccessOrdering()))) { @@ -250,11 +245,10 @@ bool AtomicExpand::runOnFunction(Function &F) { FenceOrdering = CASI->getSuccessOrdering(); CASI->setSuccessOrdering(AtomicOrdering::Monotonic); CASI->setFailureOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } @@ -320,13 +314,12 @@ bool AtomicExpand::runOnFunction(Function &F) { return MadeChange; } -bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad) { +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { IRBuilder<> Builder(I); - auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, Order, 
IsStore, IsLoad); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // The trailing fence is emitted before the instruction instead of after // because there is no easy way of setting Builder insertion point after // an instruction. So we must erase it from the BB, and insert it back @@ -1048,8 +1041,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(StartBB); // Start the main loop block now that we've taken care of the preliminaries. @@ -1064,8 +1056,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(TryStoreBB); Builder.SetInsertPoint(TryStoreBB); @@ -1094,8 +1085,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // necessary. Builder.SetInsertPoint(SuccessBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, SuccessOrder); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(NoStoreBB); @@ -1107,8 +1097,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(FailureBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, FailureOrder); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg @@ -1532,7 +1521,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( Type *ResultTy; SmallVector<Value *, 6> Args; - AttributeSet Attr; + AttributeList Attr; // 'size' argument. if (!UseSizedLibcall) { @@ -1593,7 +1582,7 @@ bool AtomicExpand::expandAtomicOpToLibcall( // Now, the return type. if (CASExpected) { ResultTy = Type::getInt1Ty(Ctx); - Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt); + Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt); } else if (HasResult && UseSizedLibcall) ResultTy = SizedIntTy; else diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp index a67e194356d8..d3fced436b68 100644 --- a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -24,8 +24,6 @@ #include <utility> using namespace llvm; -#define DEBUG_TYPE "basictti" - // This flag is used by the template base class for BasicTTIImpl, and here to // provide a definition. cl::opt<unsigned> diff --git a/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp new file mode 100644 index 000000000000..2c41b597843c --- /dev/null +++ b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp @@ -0,0 +1,758 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "branch-coalescing" + +static cl::opt<cl::boolOrDefault> + EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, + cl::desc("enable coalescing of duplicate branches")); + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +//===----------------------------------------------------------------------===// +// BranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. +/// +/// For example, consider the following LLVM IR: +/// +/// %test = icmp eq i32 %x, 0 +/// %tmp1 = select i1 %test, double %a, double 2.000000e-03 +/// %tmp2 = select i1 %test, double %b, double 5.000000e-03 +/// +/// This IR expands to the following machine code on PowerPC: +/// +/// BB#0: derived from LLVM BB %entry +/// Live Ins: %F1 %F3 %X6 +/// <SNIP1> +/// %vreg0<def> = COPY %F1; F8RC:%vreg0 +/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +/// BCC 76, %vreg5, <BB#2>; CRRC:%vreg5 +/// Successors according to CFG: BB#1(?%) BB#2(?%) +/// +/// BB#1: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 +/// Successors according to CFG: BB#2(?%) +/// +/// BB#2: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 BB#1 +/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +/// F8RC:%vreg9,%vreg8,%vreg0 +/// <SNIP2> +/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +/// Successors according to CFG: BB#3(?%) BB#4(?%) +/// +/// BB#3: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#2 +/// Successors according to CFG: BB#4(?%) +/// +/// BB#4: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#2 BB#3 +/// %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>; +/// F8RC:%vreg13,%vreg12,%vreg2 +/// <SNIP3> +/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// When this pattern is detected, branch coalescing will try to collapse +/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3.
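+///
+/// As a rough sketch (hypothetical source; any target whose select lowering
+/// emits conditional branches behaves similarly), C++ code along these lines
+/// gives rise to the two selects above:
+///
+///   double f(int x, double a, double b) {
+///     double tmp1 = (x == 0) ? a : 2.0e-3; // first select on %test
+///     double tmp2 = (x == 0) ? b : 5.0e-3; // second select, same condition
+///     return tmp1 + tmp2;
+///   }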
+/// +/// If all conditions are met, IR should collapse to: +/// +/// BB#0: derived from LLVM BB %entry +/// Live Ins: %F1 %F3 %X6 +/// <SNIP1> +/// %vreg0<def> = COPY %F1; F8RC:%vreg0 +/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4 +/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>; +/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7 +/// <SNIP2> +/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5 +/// Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%) +/// BB#4(0x55555554 / 0x80000000 = 66.67%) +/// +/// BB#1: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 +/// Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%) +/// +/// BB#4: derived from LLVM BB %entry +/// Predecessors according to CFG: BB#0 BB#1 +/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>; +/// F8RC:%vreg9,%vreg8,%vreg0 +/// %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>; +/// F8RC:%vreg13,%vreg12,%vreg2 +/// <SNIP3> +/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use> +/// +/// Branch Coalescing does not split blocks; it moves everything in the same +/// direction, ensuring it does not break use/definition semantics. +/// +/// PHI nodes and their corresponding use instructions are moved to the +/// successor block if their values are not used within the successor block's +/// PHI nodes. PHI node ordering cannot be assumed. +/// +/// Non-PHI instructions can be moved up to the predecessor basic block or +/// down to the successor basic block, following any PHI instructions. Whether +/// an instruction moves up or down depends on whether the register(s) it +/// defines are used in the current block or in any PHI instructions at the +/// beginning of the successor block. + +namespace { + +class BranchCoalescing : public MachineFunctionPass { + struct CoalescingCandidateInfo { + MachineBasicBlock *BranchBlock; // Block containing the branch + MachineBasicBlock *BranchTargetBlock; // Block branched to + MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken + SmallVector<MachineOperand, 4> Cond; + bool MustMoveDown; + bool MustMoveUp; + + CoalescingCandidateInfo(); + void clear(); + }; + + MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + void initialize(MachineFunction &F); + bool canCoalesceBranch(CoalescingCandidateInfo &Cand); + bool identicalOperands(ArrayRef<MachineOperand> OperandList1, + ArrayRef<MachineOperand> OperandList2) const; + bool validateCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + + static bool isBranchCoalescingEnabled() { + return EnableBranchCoalescing == cl::BOU_TRUE; + } + +public: + static char ID; + + BranchCoalescing() : MachineFunctionPass(ID) { + initializeBranchCoalescingPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Branch Coalescing"; } + + bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion); + bool canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + void moveAndUpdatePHIs(MachineBasicBlock 
*SourceRegionMBB, + MachineBasicBlock *TargetRegionMBB); + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char BranchCoalescing::ID = 0; +char &llvm::BranchCoalescingID = BranchCoalescing::ID; + +INITIALIZE_PASS_BEGIN(BranchCoalescing, DEBUG_TYPE, + "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", + false, false) + +BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() + : BranchBlock(nullptr), BranchTargetBlock(nullptr), + FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void BranchCoalescing::CoalescingCandidateInfo::clear() { + BranchBlock = nullptr; + BranchTargetBlock = nullptr; + FallThroughBlock = nullptr; + Cond.clear(); + MustMoveDown = false; + MustMoveUp = false; +} + +void BranchCoalescing::initialize(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement of the given candidate to determine if it +/// can be coalesced. If the branch can be coalesced, the BranchTargetBlock +/// and the FallThroughBlock are recorded in the specified candidate. +/// +/// \param[in,out] Cand The coalescing candidate to analyze +/// \return true if and only if the branch can be coalesced, false otherwise +/// +bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { + DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() + << " can be coalesced:"); + MachineBasicBlock *FalseMBB = nullptr; + + if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, + Cand.Cond)) { + DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); + return false; + } + + for (auto &I : Cand.BranchBlock->terminators()) { + DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); + if (!I.isBranch()) + continue; + + if (I.getNumOperands() != I.getNumExplicitOperands()) { + DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I + << "\n"); + return false; + } + } + + if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { + DEBUG(dbgs() << "EH Pad - skip\n"); + return false; + } + + // For now only consider triangles (i.e., BranchTargetBlock is set, + // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) + if (!Cand.BranchTargetBlock || FalseMBB || + !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() << "Does not form a triangle - skip\n"); + return false; + } + + // Ensure there are only two successors + if (Cand.BranchBlock->succ_size() != 2) { + DEBUG(dbgs() << "Does not have 2 successors - skip\n"); + return false; + } + + // Sanity check - the block must be able to fall through + assert(Cand.BranchBlock->canFallThrough() && + "Expecting the block to fall through!"); + + // We have already ensured there are exactly two successors to + // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. + // Ensure the single fall-through block is empty. + MachineBasicBlock *Succ = + (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) + ? 
*Cand.BranchBlock->succ_rbegin() + : *Cand.BranchBlock->succ_begin(); + + assert(Succ && "Expecting a valid fall-through block\n"); + + if (!Succ->empty()) { + DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); + return false; + } + + if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() + << "Successor of fall through block is not branch taken block\n"); + return false; + } + + Cand.FallThroughBlock = Succ; + DEBUG(dbgs() << "Valid Candidate\n"); + return true; +} + +/// +/// Determine if the two operand lists are identical. +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operand lists are identical +/// +bool BranchCoalescing::identicalOperands( + ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + + if (OpList1.size() != OpList2.size()) { + DEBUG(dbgs() << "Operand list is different size\n"); + return false; + } + + for (unsigned i = 0; i < OpList1.size(); ++i) { + const MachineOperand &Op1 = OpList1[i]; + const MachineOperand &Op2 = OpList2[i]; + + DEBUG(dbgs() << "Op1: " << Op1 << "\n" + << "Op2: " << Op2 << "\n"); + + if (Op1.isIdenticalTo(Op2)) { + DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); + continue; + } + + // If the operands are not identical, but are registers, check to see if the + // definition of the register produces the same value. If they produce the + // same value, consider them to be identical. + if (Op1.isReg() && Op2.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && + TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); + MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); + if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { + DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def + << " produce the same value!\n"); + } else { + DEBUG(dbgs() << "Operands produce different values\n"); + return false; + } + } else { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + } + return true; +} + +/// +/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB +/// and updates them to refer to the new block. PHI node ordering +/// cannot be assumed, so it does not matter where the PHI instructions +/// are moved to in TargetMBB. +/// +/// \param[in] SourceMBB block to move PHI instructions from +/// \param[in] TargetMBB block to move PHI instructions to +/// +void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB, + MachineBasicBlock *TargetMBB) { + + MachineBasicBlock::iterator MI = SourceMBB->begin(); + MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI(); + + if (MI == ME) { + DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n"); + return; + } + + // Update all PHI instructions in SourceMBB and move to top of TargetMBB + for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) { + MachineInstr &PHIInst = *Iter; + for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) { + MachineOperand &MO = PHIInst.getOperand(i); + if (MO.getMBB() == SourceMBB) + MO.setMBB(TargetMBB); + } + } + TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME); +} + +/// +/// This function checks if MI can be moved to the beginning of TargetMBB, +/// following its PHI instructions. An instruction can be moved to the +/// beginning of TargetMBB if none of its definitions are used by PHI nodes +/// in TargetMBB. +/// +/// \param[in] MI the machine instruction to move. 
+/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to beginning of TargetMBB, +/// false otherwise. +/// +bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Def : MI.defs()) { // Looking at Def + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == &TargetMBB) { + DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n"); + return false; + } + } + } + + DEBUG(dbgs() << " Safe to move to the beginning.\n"); + return true; +} + +/// +/// This function checks if MI can be moved to the end of the TargetMBB, +/// immediately before the first terminator. An instruction can be moved +/// to the end of TargetMBB if no PHI node in its own MBB defines a value +/// that it uses. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to end of TargetMBB, +/// false otherwise. +/// +bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to end of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Use : MI.uses()) { + if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { + MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); + if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { + DEBUG(dbgs() << " *** Cannot move this instruction ***\n"); + return false; + } else { + DEBUG(dbgs() << " *** def is in another block -- safe to move!\n"); + } + } + } + + DEBUG(dbgs() << " Safe to move to the end.\n"); + return true; +} + +/// +/// This method checks to ensure the two coalescing candidates follow the +/// expected pattern required for coalescing. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion; false otherwise. +/// +bool BranchCoalescing::validateCandidates( + CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + + if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock) + llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion"); + else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock)) + llvm_unreachable("Expecting TargetRegion to dominate SourceRegion"); + else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock)) + llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion"); + else if (!TargetRegion.FallThroughBlock->empty() || + !SourceRegion.FallThroughBlock->empty()) + llvm_unreachable("Expecting fall-through blocks to be empty"); + + return true; +} + +/// +/// This method determines whether the two coalescing candidates can be merged. +/// In order to be merged, all instructions must be able to +/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock; +/// 2. Move to the end of the TargetRegion.BranchBlock. +/// Merging involves moving the instructions in the +/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock). 
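+///
+/// For example: an instruction in SourceRegion.BranchBlock whose def is used
+/// by a PHI node in SourceRegion.BranchTargetBlock fails canMoveToBeginning()
+/// and forces MustMoveUp, while an instruction that uses a value defined by a
+/// PHI node in its own block fails canMoveToEnd() and forces MustMoveDown; if
+/// both flags end up set, the regions cannot be merged.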
+/// +/// This function first tries to move instructions from the +/// TargetRegion.BranchTargetBlock down, to the beginning of the +/// SourceRegion.BranchTargetBlock. This is not possible if any register defined +/// in TargetRegion.BranchTargetBlock is used in a PHI node in the +/// SourceRegion.BranchTargetBlock. In this case, check whether the statement +/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately +/// before the branch statement). If it cannot move, then these blocks cannot +/// be merged. +/// +/// Note that there is no analysis for moving instructions past the fall-through +/// blocks because they are confirmed to be empty. An assert is thrown if they +/// are not. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion, false otherwise. +/// +bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Walk through PHI nodes first and see if they force the merge into the + // SourceRegion.BranchTargetBlock. + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->instr_begin(), + E = SourceRegion.BranchBlock->getFirstNonPHI(); + I != E; ++I) { + for (auto &Def : I->defs()) + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) { + DEBUG(dbgs() << "PHI " << *I << " defines register used in another " + "PHI within branch target block -- can't merge\n"); + NumPHINotMoved++; + return false; + } + if (Use.getParent() == SourceRegion.BranchBlock) { + DEBUG(dbgs() << "PHI " << *I + << " defines register used in this " + "block -- all must move down\n"); + SourceRegion.MustMoveDown = true; + } + } + } + + // Walk through the remaining instructions to see if they should be merged + // into TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down) + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->getFirstNonPHI(), + E = SourceRegion.BranchBlock->end(); + I != E; ++I) { + if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move down - must move up!\n"); + SourceRegion.MustMoveUp = true; + } + if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move up - must move down!\n"); + SourceRegion.MustMoveDown = true; + } + } + + return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true; +} + +/// Merge the instructions from SourceRegion.BranchBlock, +/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into +/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and +/// TargetRegion.FallThroughBlock respectively. +/// +/// The successors for blocks in TargetRegion will be updated to use the +/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion +/// will be removed from the function. +/// +/// A region consists of a BranchBlock, a FallThroughBlock, and a +/// BranchTargetBlock. Branch coalescing works on patterns where the +/// TargetRegion's BranchTargetBlock must also be the SourceRegion's +/// BranchBlock. 
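+///
+/// Whether the non-PHI instructions of SourceRegion.BranchBlock are spliced
+/// up into TargetRegion.BranchBlock or down into
+/// SourceRegion.BranchTargetBlock is decided by the MustMoveUp/MustMoveDown
+/// flags that canMerge() computed for the source region.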
+/// +/// Before mergeCandidates: +/// +/// +---------------------------+ +/// | TargetRegion.BranchBlock | +/// +---------------------------+ +/// / | +/// / +--------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | TargetRegion.BranchTargetBlock | +/// | SourceRegion.BranchBlock | +/// +----------------------------------+ +/// / | +/// / +--------------------------------+ +/// | | SourceRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// After mergeCandidates: +/// +/// +-----------------------------+ +/// | TargetRegion.BranchBlock | +/// | SourceRegion.BranchBlock | +/// +-----------------------------+ +/// / | +/// / +---------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// | | SourceRegion.FallThroughBlock | +/// \ +---------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) { + + if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { + llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); + return false; + } + + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Start the merging process by first handling the BranchBlock. + // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block + moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + + // Move remaining instructions in SourceRegion.BranchBlock into + // TargetRegion.BranchBlock + MachineBasicBlock::iterator firstInstr = + SourceRegion.BranchBlock->getFirstNonPHI(); + MachineBasicBlock::iterator lastInstr = + SourceRegion.BranchBlock->getFirstTerminator(); + + MachineBasicBlock *Source = SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock + : TargetRegion.BranchBlock; + + MachineBasicBlock::iterator Target = + SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock->getFirstNonPHI() + : TargetRegion.BranchBlock->getFirstTerminator(); + + Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + + // Once PHI and instructions have been moved we need to clean up the + // control flow. + + // Remove SourceRegion.FallThroughBlock before transferring successors of + // SourceRegion.BranchBlock to TargetRegion.BranchBlock. + SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); + TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.BranchBlock); + // Update branch in TargetRegion.BranchBlock to jump to + // SourceRegion.BranchTargetBlock + // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
+ TargetRegion.BranchBlock->ReplaceUsesOfBlockWith( + SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + // Remove the branch statement(s) in SourceRegion.BranchBlock + MachineBasicBlock::iterator I = + SourceRegion.BranchBlock->terminators().begin(); + while (I != SourceRegion.BranchBlock->terminators().end()) { + MachineInstr &CurrInst = *I; + ++I; + if (CurrInst.isBranch()) + CurrInst.eraseFromParent(); + } + + // Fall-through block should be empty since this is part of the condition + // to coalesce the branches. + assert(TargetRegion.FallThroughBlock->empty() && + "FallThroughBlocks should be empty!"); + + // Transfer successor information and move PHIs down to the + // branch-taken block. + TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.FallThroughBlock); + TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock); + + // Remove the blocks from the function. + assert(SourceRegion.BranchBlock->empty() && + "Expecting branch block to be empty!"); + SourceRegion.BranchBlock->eraseFromParent(); + + assert(SourceRegion.FallThroughBlock->empty() && + "Expecting fall-through block to be empty!\n"); + SourceRegion.FallThroughBlock->eraseFromParent(); + + NumBlocksCoalesced++; + return true; +} + +bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) { + + if (skipFunction(*MF.getFunction()) || MF.empty() || + !isBranchCoalescingEnabled()) + return false; + + bool didSomething = false; + + DEBUG(dbgs() << "******** Branch Coalescing ********\n"); + initialize(MF); + + DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); + + CoalescingCandidateInfo Cand1, Cand2; + // Walk over blocks and find candidates to merge. + // Continue trying to merge with the first candidate found, as long as merging + // is successful. 
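+  // For example, with the BB#0 -> BB#2 -> BB#4 chain from the header comment,
+  // Cand1 is first the region headed by BB#0 and Cand2 the region headed by
+  // BB#2; after a successful merge, the do-while retries from the same block
+  // so that longer chains of coalescable branches collapse one pair at a time.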
+ for (MachineBasicBlock &MBB : MF) { + bool MergedCandidates = false; + do { + MergedCandidates = false; + Cand1.clear(); + Cand2.clear(); + + Cand1.BranchBlock = &MBB; + + // If unable to coalesce the branch, then continue to next block + if (!canCoalesceBranch(Cand1)) + break; + + Cand2.BranchBlock = Cand1.BranchTargetBlock; + if (!canCoalesceBranch(Cand2)) + break; + + // Sanity check + // The branch-taken block of the second candidate should post-dominate the + // first candidate + assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && + "Branch-taken block should post-dominate first candidate"); + + if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { + DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " + << Cand2.BranchBlock->getNumber() + << " have different branches\n"); + break; + } + if (!canMerge(Cand2, Cand1)) { + DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand2.BranchBlock->getNumber() << "\n"); + NumBlocksNotCoalesced++; + continue; + } + DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); + MergedCandidates = mergeCandidates(Cand2, Cand1); + if (MergedCandidates) + didSomething = true; + + DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); + } while (MergedCandidates); + } + +#ifndef NDEBUG + // Verify MF is still valid after branch coalescing + if (didSomething) + MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + + DEBUG(dbgs() << "Finished Branch Coalescing\n"); + return didSomething; +} diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index 6fba161033b0..03ceac10beec 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -43,12 +44,13 @@ #include <algorithm> using namespace llvm; -#define DEBUG_TYPE "branchfolding" +#define DEBUG_TYPE "branch-folder" STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); STATISTIC(NumTailMerge , "Number of block tails merged"); STATISTIC(NumHoist , "Number of times common instructions are hoisted"); +STATISTIC(NumTailCalls, "Number of tail calls optimized"); static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge", cl::init(cl::BOU_UNSET), cl::Hidden); @@ -87,7 +89,7 @@ namespace { char BranchFolderPass::ID = 0; char &llvm::BranchFolderPassID = BranchFolderPass::ID; -INITIALIZE_PASS(BranchFolderPass, "branch-folder", +INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE, "Control Flow Optimizer", false, false) bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { @@ -123,8 +125,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, } } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG. 
void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); @@ -144,9 +144,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { MLI->removeBlock(MBB); } -/// OptimizeFunction - Perhaps branch folding, tail merging and other -/// CFG optimizations on the given function. Block placement changes the layout -/// and may create new tail merging opportunities. bool BranchFolder::OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, @@ -156,13 +153,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TriedMerging.clear(); + MachineRegisterInfo &MRI = MF.getRegInfo(); AfterBlockPlacement = AfterPlacement; TII = tii; TRI = tri; MMI = mmi; MLI = mli; + this->MRI = &MRI; - MachineRegisterInfo &MRI = MF.getRegInfo(); UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF); if (!UpdateLiveIns) MRI.invalidateLiveness(); @@ -348,23 +346,18 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, return TailLen; } -/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest. void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest) { TII->ReplaceTailWithBranchTo(OldInst, NewDest); if (UpdateLiveIns) { NewDest->clearLiveIns(); - computeLiveIns(LiveRegs, *TRI, *NewDest); + computeLiveIns(LiveRegs, *MRI, *NewDest); } ++NumTailMerge; } -/// SplitMBBAt - Given a machine basic block and an iterator into it, split the -/// MBB so that the part before the iterator falls into the part starting at the -/// iterator. This returns the new MBB. MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB) { @@ -388,7 +381,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end()); // NewMBB belongs to the same loop as CurMBB. - if (MLI) + if (MLI) if (MachineLoop *ML = MLI->getLoopFor(&CurMBB)) ML->addBasicBlockToLoop(NewMBB, MLI->getBase()); @@ -396,7 +389,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); if (UpdateLiveIns) - computeLiveIns(LiveRegs, *TRI, *NewMBB); + computeLiveIns(LiveRegs, *MRI, *NewMBB); // Add the new block to the funclet. 
const auto &FuncletI = FuncletMembership.find(&CurMBB); @@ -436,7 +429,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB)); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = CurMBB->findBranchDebugLoc(); if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { MachineBasicBlock *NextBB = &*I; if (TBB == NextBB && !Cond.empty() && !FBB) { @@ -497,6 +490,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS, return MBFI.printBlockFreq(OS, Freq); } +void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) { + MBFI.view(Name, isSimple); +} + +uint64_t +BranchFolder::MBFIWrapper::getEntryFreq() const { + return MBFI.getEntryFreq(); +} + /// CountTerminators - Count the number of terminators in the given /// block and set I to the position of the first non-terminator, if there /// is one, or MBB->end() otherwise. @@ -516,6 +518,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, return NumTerms; } +/// A no successor, non-return block probably ends in unreachable and is cold. +/// Also consider a block that ends in an indirect branch to be a return block, +/// since many targets use plain indirect branches to return. +static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) { + if (!MBB->succ_empty()) + return false; + if (MBB->empty()) + return true; + return !(MBB->back().isReturn() || MBB->back().isIndirectBranch()); +} + /// ProfitableToMerge - Check if two machine basic blocks have a common tail /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction @@ -570,6 +583,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, return true; } + // If these are identical non-return blocks with no successors, merge them. + // Such blocks are typically cold calls to noreturn functions like abort, and + // are unlikely to become a fallthrough target after machine block placement. + // Tail merging these blocks is unlikely to create additional unconditional + // branches, and will reduce the size of this cold code. + if (I1 == MBB1->begin() && I2 == MBB2->begin() && + blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2)) + return true; + // If one of the blocks can be completely merged and happens to be in // a position where the other could fall through into it, merge any number // of instructions, because it can be done without a branch. @@ -579,6 +601,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin()) return true; + // If both blocks are identical and end in a branch, merge them unless they + // both have a fallthrough predecessor and successor. + // We can only do this after block placement because it depends on whether + // there are fallthroughs, and we don't know until after layout. 
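+  // "Falls through" here means that the layout predecessor of the block can
+  // fall through into it, and that the block itself either has no successors
+  // or can fall through to its layout successor.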
+ if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) { + auto BothFallThrough = [](MachineBasicBlock *MBB) { + if (MBB->succ_size() != 0 && !MBB->canFallThrough()) + return false; + MachineFunction::iterator I(MBB); + MachineFunction *MF = MBB->getParent(); + return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough(); + }; + if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2)) + return true; + } + // If both blocks have an unconditional branch temporarily stripped out, // count that as an additional common instruction for the following // heuristics. This heuristic is only accurate for single-succ blocks, so to @@ -604,16 +642,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, (I1 == MBB1->begin() || I2 == MBB2->begin()); } -/// ComputeSameTails - Look through all the blocks in MergePotentials that have -/// hash CurHash (guaranteed to match the last element). Build the vector -/// SameTails of all those that have the (same) largest number of instructions -/// in common of any pair of these blocks. SameTails entries contain an -/// iterator into MergePotentials (from which the MachineBasicBlock can be -/// found) and a MachineBasicBlock::iterator into that MBB indicating the -/// instruction where the matching code sequence begins. -/// Order of elements in SameTails is the reverse of the order in which -/// those blocks appear in MergePotentials (where they are not necessarily -/// consecutive). unsigned BranchFolder::ComputeSameTails(unsigned CurHash, unsigned MinCommonTailLength, MachineBasicBlock *SuccBB, @@ -650,8 +678,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash, return maxCommonTailLength; } -/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from -/// MergePotentials, restoring branches at ends of blocks as appropriate. void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB) { @@ -671,8 +697,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, MergePotentials.erase(CurMPIter, MergePotentials.end()); } -/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist -/// only of the common tail. Create a block that does by splitting one. 
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, @@ -723,6 +747,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } +void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) { + MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); + + std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size()); + for (unsigned int i = 0 ; i != SameTails.size() ; ++i) { + if (i != commonTailIndex) + NextCommonInsts[i] = SameTails[i].getTailStartPos(); + else { + assert(SameTails[i].getTailStartPos() == MBB->begin() && + "MBB is not a common tail only block"); + } + } + + for (auto &MI : *MBB) { + if (MI.isDebugValue()) + continue; + DebugLoc DL = MI.getDebugLoc(); + for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) { + if (i == commonTailIndex) + continue; + + auto &Pos = NextCommonInsts[i]; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + while (Pos->isDebugValue()) { + ++Pos; + assert(Pos != SameTails[i].getBlock()->end() && + "Reached BB end within common tail"); + } + assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!"); + DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc()); + NextCommonInsts[i] = ++Pos; + } + MI.setDebugLoc(DL); + } +} + static void mergeOperations(MachineBasicBlock::iterator MBBIStartPos, MachineBasicBlock &MBBCommon) { @@ -875,10 +936,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, // Recompute common tail MBB's edge weights and block frequency. setCommonTailEdgeWeights(*MBB); - // Remove the original debug location from the common tail. - for (auto &MI : *MBB) - if (!MI.isDebugValue()) - MI.setDebugLoc(DebugLoc()); + // Merge debug locations across identical instructions for common tail. + MergeCommonTailDebugLocs(commonTailIndex); // MBB is common tail. Adjust all other BB's to jump to this one. // Traversal must be forwards so erases work. @@ -1043,7 +1102,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // Remove the unconditional branch at the end, if any. if (TBB && (Cond.empty() || FBB)) { - DebugLoc dl; // FIXME: this is nowhere + DebugLoc dl = PBB->findBranchDebugLoc(); TII->removeBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now @@ -1193,8 +1252,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) { return DebugLoc(); } -/// OptimizeBlock - Analyze and optimize control flow related to the specified -/// block. This is never called on the entry block. bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); @@ -1386,6 +1443,42 @@ ReoptimizeBlock: } } + if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && + MF.getFunction()->optForSize()) { + // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch + // direction, thereby defeating careful block placement and regressing + // performance. Therefore, only consider this for optsize functions. 
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); + if (TII->isUnconditionalTailCall(TailCall)) { + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector<MachineOperand, 4> PredCond; + bool PredAnalyzable = + !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + + if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) { + // The predecessor has a conditional branch to this block which consists + // of only a tail call. Try to fold the tail call into the conditional + // branch. + if (TII->canMakeTailCallConditional(PredCond, TailCall)) { + // TODO: It would be nice if analyzeBranch() could provide a pointer + // to the branch instruction so replaceBranchWithTailCall() doesn't + // have to search for it. + TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); + ++NumTailCalls; + Pred->removeSuccessor(MBB); + MadeChange = true; + return MadeChange; + } + } + // If the predecessor is falling through to this block, we could reverse + // the branch condition and fold the tail call into that. However, after + // that we might have to re-arrange the CFG to fall through to the other + // block and there is a high risk of regressing code size rather than + // improving it. + } + } + + // Analyze the branch in the current block. MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr; SmallVector<MachineOperand, 4> CurCond; @@ -1599,8 +1692,6 @@ ReoptimizeBlock: // Hoist Common Code //===----------------------------------------------------------------------===// -/// HoistCommonCode - Hoist common instruction sequences at the start of basic -/// blocks to their common predecessor. bool BranchFolder::HoistCommonCode(MachineFunction &MF) { bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { @@ -1734,9 +1825,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, return PI; } -/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction -/// sequence at the start of the function, move the instructions before MBB -/// terminator if it's legal. bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; @@ -1763,8 +1851,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { return false; bool HasDups = false; - SmallVector<unsigned, 4> LocalDefs; - SmallSet<unsigned, 4> LocalDefsSet; + SmallVector<unsigned, 4> LocalDefs, LocalKills; + SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet; MachineBasicBlock::iterator TIB = TBB->begin(); MachineBasicBlock::iterator FIB = FBB->begin(); MachineBasicBlock::iterator TIE = TBB->end(); @@ -1818,7 +1906,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { IsSafe = false; break; } - } else if (!LocalDefsSet.count(Reg)) { + } else if (!ActiveDefsSet.count(Reg)) { if (Defs.count(Reg)) { // Use is defined by the instruction at the point of insertion. IsSafe = false; @@ -1838,18 +1926,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore)) break; - // Remove kills from LocalDefsSet, these registers had short live ranges. + // Remove kills from ActiveDefsSet, these registers had short live ranges. 
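+    // Kills of registers that the hoisted sequence never defines are
+    // remembered in LocalKills; such a register dies in the hoisted code,
+    // so it is later removed from the live-in lists of TBB and FBB.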
for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; unsigned Reg = MO.getReg(); - if (!Reg || !LocalDefsSet.count(Reg)) + if (!Reg) + continue; + if (!AllDefsSet.count(Reg)) { + LocalKills.push_back(Reg); continue; + } if (TargetRegisterInfo::isPhysicalRegister(Reg)) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.erase(*AI); + ActiveDefsSet.erase(*AI); } else { - LocalDefsSet.erase(Reg); + ActiveDefsSet.erase(Reg); } } @@ -1861,7 +1953,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg)) continue; LocalDefs.push_back(Reg); - addRegAndItsAliases(Reg, TRI, LocalDefsSet); + addRegAndItsAliases(Reg, TRI, ActiveDefsSet); + addRegAndItsAliases(Reg, TRI, AllDefsSet); } HasDups = true; @@ -1876,17 +1969,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { FBB->erase(FBB->begin(), FIB); // Update livein's. - bool AddedLiveIns = false; + bool ChangedLiveIns = false; for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { unsigned Def = LocalDefs[i]; - if (LocalDefsSet.count(Def)) { + if (ActiveDefsSet.count(Def)) { TBB->addLiveIn(Def); FBB->addLiveIn(Def); - AddedLiveIns = true; + ChangedLiveIns = true; } } + for (unsigned K : LocalKills) { + TBB->removeLiveIn(K); + FBB->removeLiveIn(K); + ChangedLiveIns = true; + } - if (AddedLiveIns) { + if (ChangedLiveIns) { TBB->sortUniqueLiveIns(); FBB->sortUniqueLiveIns(); } diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h index fc48e484292d..92681137e4c6 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.h +++ b/contrib/llvm/lib/CodeGen/BranchFolding.h @@ -37,6 +37,9 @@ namespace llvm { // flag. Ignored for optsize. unsigned MinCommonTailLength = 0); + /// Perform branch folding, tail merging and other CFG optimizations on the + /// given function. Block placement changes the layout and may create new + /// tail merging opportunities. bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli = nullptr, @@ -105,6 +108,7 @@ namespace llvm { bool UpdateLiveIns; unsigned MinCommonTailLength; const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineLoopInfo *MLI; @@ -122,6 +126,8 @@ namespace llvm { const MachineBasicBlock *MBB) const; raw_ostream &printBlockFreq(raw_ostream &OS, const BlockFrequency Freq) const; + void view(const Twine &Name, bool isSimple = true); + uint64_t getEntryFreq() const; private: const MachineBlockFrequencyInfo &MBFI; @@ -137,26 +143,64 @@ namespace llvm { MachineBasicBlock* PredBB, unsigned MinCommonTailLength); void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB); + + /// Delete the instruction OldInst and everything after it, replacing it + /// with an unconditional branch to NewDest. void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest); + + /// Given a machine basic block and an iterator into it, split the MBB so + /// that the part before the iterator falls through into the part starting + /// at the iterator. This returns the new MBB. MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB); + + /// Look through all the blocks in MergePotentials that have hash CurHash + /// (guaranteed to match the last element).
Build the vector SameTails of + /// all those that have the (same) largest number of instructions in common + /// of any pair of these blocks. SameTails entries contain an iterator into + /// MergePotentials (from which the MachineBasicBlock can be found) and a + /// MachineBasicBlock::iterator into that MBB indicating the instruction + /// where the matching code sequence begins. Order of elements in SameTails + /// is the reverse of the order in which those blocks appear in + /// MergePotentials (where they are not necessarily consecutive). unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB); + + /// Remove all blocks with hash CurHash from MergePotentials, restoring + /// branches at ends of blocks as appropriate. void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); + + /// None of the blocks to be tail-merged consist only of the common tail. + /// Create a block that does by splitting one. bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, MachineBasicBlock *SuccBB, unsigned maxCommonTailLength, unsigned &commonTailIndex); + /// Create merged DebugLocs of identical instructions across SameTails and + /// assign them to the corresponding instructions in the common tail. + void MergeCommonTailDebugLocs(unsigned commonTailIndex); + bool OptimizeBranches(MachineFunction &MF); + + /// Analyze and optimize control flow related to the specified block. This + /// is never called on the entry block. bool OptimizeBlock(MachineBasicBlock *MBB); + + /// Remove the specified dead machine basic block from the function, + /// updating the CFG. void RemoveDeadBlock(MachineBasicBlock *MBB); + /// Hoist common instruction sequences at the start of basic blocks to their + /// common predecessor. bool HoistCommonCode(MachineFunction &MF); + + /// If the successors of MBB have a common instruction sequence at the start + /// of each block, move the instructions before the MBB terminator if it's + /// legal. bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB); }; } diff --git a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp index 8b27570a17f4..e3de61c7816f 100644 --- a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -126,14 +126,16 @@ void BranchRelaxation::verify() { #endif } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// print block size and offset information - debugging -void BranchRelaxation::dumpBBs() { +LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() { for (auto &MBB : *MF) { const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) << format("size=%#x\n", BBI.Size); } } +#endif /// scanFunction - Do the initial scan of the function, building up /// information about each block. @@ -257,7 +259,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI, // Need to fix live-in lists if we track liveness. if (TRI->trackLivenessAfterRegAlloc(*MF)) - computeLiveIns(LiveRegs, *TRI, *NewBB); + computeLiveIns(LiveRegs, MF->getRegInfo(), *NewBB); ++NumSplit; @@ -343,6 +345,10 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { // Do it here since if there's no split, no update is needed. MBB->replaceSuccessor(FBB, &NewBB); NewBB.addSuccessor(FBB); + + // Need to fix live-in lists if we track liveness.
+ if (TRI->trackLivenessAfterRegAlloc(*MF)) + computeLiveIns(LiveRegs, MF->getRegInfo(), NewBB); } // We now have an appropriate fall-through block in place (either naturally or diff --git a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp index ff7c99de0420..e4eab8c513d9 100644 --- a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp +++ b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp @@ -1,4 +1,4 @@ -//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===// +//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===// // // The LLVM Compiler Infrastructure // @@ -14,6 +14,8 @@ #include "llvm/CodeGen/GCs.h" #include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -77,6 +79,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. const PointerType *PT = cast<PointerType>(Ty); @@ -110,6 +113,7 @@ public: UsesMetadata = false; CustomRoots = false; } + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. const PointerType *PT = cast<PointerType>(Ty); @@ -117,7 +121,8 @@ public: return (1 == PT->getAddressSpace()); } }; -} + +} // end anonymous namespace // Register all the above so that they can be found at runtime. Note that // these static initializers are important since the registration list is diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp index 2e33f14c7ee3..7cad4d031169 100644 --- a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp @@ -30,8 +30,7 @@ using namespace llvm; CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) : CallingConv(CC), IsVarArg(isVarArg), MF(mf), - TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C), - CallOrPrologue(Unknown) { + TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) { // No stack is used. StackOffset = 0; MaxStackArgAlign = 1; diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp index 4cf9b138f10d..2a2715beaadc 100644 --- a/contrib/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp @@ -21,6 +21,7 @@ using namespace llvm; /// initializeCodeGen - Initialize all passes linked into the CodeGen library. 
void llvm::initializeCodeGen(PassRegistry &Registry) { initializeAtomicExpandPass(Registry); + initializeBranchCoalescingPass(Registry); initializeBranchFolderPassPass(Registry); initializeBranchRelaxationPass(Registry); initializeCodeGenPreparePass(Registry); @@ -31,12 +32,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeEarlyIfConverterPass(Registry); initializeExpandISelPseudosPass(Registry); initializeExpandPostRAPass(Registry); + initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); + initializeImplicitNullChecksPass(Registry); initializeInterleavedAccessPass(Registry); + initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); initializeLiveStacksPass(Registry); @@ -47,7 +51,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCSEPass(Registry); - initializeImplicitNullChecksPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); initializeMachineDominatorTreePass(Registry); @@ -55,16 +58,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoPass(Registry); + initializeMachineOptimizationRemarkEmitterPassPass(Registry); + initializeMachineOutlinerPass(Registry); initializeMachinePipelinerPass(Registry); initializeMachinePostDominatorTreePass(Registry); + initializeMachineRegionInfoPassPass(Registry); initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineVerifierPassPass(Registry); - initializeXRayInstrumentationPass(Registry); - initializePatchableFunctionPass(Registry); initializeOptimizePHIsPass(Registry); initializePEIPass(Registry); initializePHIEliminationPass(Registry); + initializePatchableFunctionPass(Registry); initializePeepholeOptimizerPass(Registry); initializePostMachineSchedulerPass(Registry); initializePostRAHazardRecognizerPass(Registry); @@ -74,12 +79,12 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + initializeScalarizeMaskedMemIntrinPass(Registry); initializeShrinkWrapPass(Registry); initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); initializeStackMapLivenessPass(Registry); - initializeLiveDebugValuesPass(Registry); - initializeSafeStackPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); initializeTailDuplicatePassPass(Registry); @@ -91,6 +96,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeVirtRegMapPass(Registry); initializeVirtRegRewriterPass(Registry); initializeWinEHPreparePass(Registry); + initializeXRayInstrumentationPass(Registry); } void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp index 934b470f13b5..4e85708efafc 100644 --- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -14,11 +14,14 @@ 
//===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -53,8 +56,10 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -77,7 +82,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); -STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); static cl::opt<bool> DisableBranchOpts( @@ -93,7 +97,7 @@ static cl::opt<bool> DisableSelectToBranch( cl::desc("Disable select to branch conversion.")); static cl::opt<bool> AddrSinkUsingGEPs( - "addr-sink-using-gep", cl::Hidden, cl::init(false), + "addr-sink-using-gep", cl::Hidden, cl::init(true), cl::desc("Address sinking in CGP using GEPs.")); static cl::opt<bool> EnableAndCmpSinking( @@ -135,15 +139,24 @@ static cl::opt<bool> ForceSplitStore( "force-split-store", cl::Hidden, cl::init(false), cl::desc("Force store splitting no matter what the target query says.")); +static cl::opt<bool> +EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), cl::init(true)); + namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +typedef SmallVector<Instruction *, 16> SExts; +typedef DenseMap<Value *, SExts> ValueToSExts; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { const TargetMachine *TM; + const TargetSubtargetInfo *SubtargetInfo; const TargetLowering *TLI; + const TargetRegisterInfo *TRI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; const LoopInfo *LI; @@ -165,6 +178,15 @@ class TypePromotionTransaction; /// promotion for the current function. InstrToOrigTy PromotedInsts; + /// Keep track of instructions removed during promotion. + SetOfInstrs RemovedInsts; + + /// Keep track of sext chains based on their initial value. + DenseMap<Value *, Instruction *> SeenChainsForSExt; + + /// Keep track of SExt promoted. + ValueToSExts ValToSExtendedUses; + /// True if CFG is modified in any way. 
bool ModifiedDT; @@ -176,10 +198,11 @@ class TypePromotionTransaction; public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) { - initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); - } + CodeGenPrepare() + : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr), + DL(nullptr) { + initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "CodeGen Prepare"; } @@ -206,7 +229,7 @@ class TypePromotionTransaction; Type *AccessTy, unsigned AS); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); - bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *I); bool optimizeSelectInst(SelectInst *SI); @@ -215,26 +238,32 @@ class TypePromotionTransaction; bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB); bool placeDbgValues(Function &F); - bool sinkAndCmp(Function &F); - bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, - Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstCost); + bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, + LoadInst *&LI, Instruction *&Inst, bool HasPromoted); + bool tryToPromoteExts(TypePromotionTransaction &TPT, + const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, + unsigned CreatedInstsCost = 0); + bool mergeSExts(Function &F); + bool performAddressTypePromotion( + Instruction *&Inst, + bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); + bool splitIndirectCriticalEdges(Function &F); }; } char CodeGenPrepare::ID = 0; -INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) +INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, + "Optimize for code generation", false, false) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false) +INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, + "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { - return new CodeGenPrepare(TM); -} +FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); } bool CodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) @@ -250,8 +279,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) { BPI.reset(); ModifiedDT = false; - if (TM) - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + TM = &TPC->getTM<TargetMachine>(); + SubtargetInfo = TM->getSubtargetImpl(F); + TLI = SubtargetInfo->getTargetLowering(); + TRI = SubtargetInfo->getRegisterInfo(); + } TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -260,10 +293,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ProfileGuidedSectionPrefix) { ProfileSummaryInfo 
*PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - if (PSI->isFunctionEntryHot(&F)) + if (PSI->isFunctionHotInCallGraph(&F)) F.setSectionPrefix(".hot"); - else if (PSI->isFunctionEntryCold(&F)) - F.setSectionPrefix(".cold"); + else if (PSI->isFunctionColdInCallGraph(&F)) + F.setSectionPrefix(".unlikely"); } /// This optimization identifies DIV instructions that can be @@ -290,18 +323,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // find a node corresponding to the value. EverMadeChange |= placeDbgValues(F); - // If there is a mask, compare against zero, and branch that can be combined - // into a single target instruction, push the mask and compare into branch - // users. Do this before OptimizeBlock -> OptimizeInst -> - // OptimizeCmpExpression, which perturbs the pattern being searched for. - if (!DisableBranchOpts) { - EverMadeChange |= sinkAndCmp(F); + if (!DisableBranchOpts) EverMadeChange |= splitBranchCondition(F); - } + + // Split some critical edges where one of the sources is an indirect branch, + // to help generate sane code for PHIs involving such edges. + EverMadeChange |= splitIndirectCriticalEdges(F); bool MadeChange = true; while (MadeChange) { MadeChange = false; + SeenChainsForSExt.clear(); + ValToSExtendedUses.clear(); + RemovedInsts.clear(); for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; @@ -311,6 +345,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (ModifiedDTOnIteration) break; } + if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) + MadeChange |= mergeSExts(F); + + // Really free removed instructions during promotion. + for (Instruction *I : RemovedInsts) + I->deleteValue(); + EverMadeChange |= MadeChange; } @@ -432,6 +473,160 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { return DestBB; } +// Return the unique indirectbr predecessor of a block. This may return null +// even if such a predecessor exists, if it's not useful for splitting. +// If a predecessor is found, OtherPreds will contain all other (non-indirectbr) +// predecessors of BB. +static BasicBlock * +findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { + // If the block doesn't have any PHIs, we don't care about it, since there's + // no point in splitting it. + PHINode *PN = dyn_cast<PHINode>(BB->begin()); + if (!PN) + return nullptr; + + // Verify we have exactly one IBR predecessor. + // Conservatively bail out if one of the other predecessors is not a "regular" + // terminator (that is, not a switch or a br). + BasicBlock *IBB = nullptr; + for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { + BasicBlock *PredBB = PN->getIncomingBlock(Pred); + TerminatorInst *PredTerm = PredBB->getTerminator(); + switch (PredTerm->getOpcode()) { + case Instruction::IndirectBr: + if (IBB) + return nullptr; + IBB = PredBB; + break; + case Instruction::Br: + case Instruction::Switch: + OtherPreds.push_back(PredBB); + continue; + default: + return nullptr; + } + } + + return IBB; +} + +// Split critical edges where the source of the edge is an indirectbr +// instruction. This isn't always possible, but we can handle some easy cases. +// This is useful because MI is unable to split such critical edges, +// which means it will not be able to sink instructions along those edges. 
+// This is especially painful for indirect branches with many successors, where +// we end up having to prepare all outgoing values in the origin block. +// +// Our normal algorithm for splitting critical edges requires us to update +// the outgoing edges of the edge origin block, but for an indirectbr this +// is hard, since it would require finding and updating the block addresses +// the indirect branch uses. But if a block only has a single indirectbr +// predecessor, with the others being regular branches, we can do it in a +// different way. +// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr. +// We can split D into D0 and D1, where D0 contains only the PHIs from D, +// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and +// create the following structure: +// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1 +bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) { + // Check whether the function has any indirectbrs, and collect which blocks + // they may jump to. Since most functions don't have indirect branches, + // this lowers the common case's overhead to O(Blocks) instead of O(Edges). + SmallSetVector<BasicBlock *, 16> Targets; + for (auto &BB : F) { + auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator()); + if (!IBI) + continue; + + for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ) + Targets.insert(IBI->getSuccessor(Succ)); + } + + if (Targets.empty()) + return false; + + bool Changed = false; + for (BasicBlock *Target : Targets) { + SmallVector<BasicBlock *, 16> OtherPreds; + BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); + // If we did not find an indirectbr, or the indirectbr is the only + // incoming edge, this isn't the kind of edge we're looking for. + if (!IBRPred || OtherPreds.empty()) + continue; + + // Don't even think about ehpads/landingpads. + Instruction *FirstNonPHI = Target->getFirstNonPHI(); + if (FirstNonPHI->isEHPad() || Target->isLandingPad()) + continue; + + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); + // It's possible Target was its own successor through an indirectbr. + // In this case, the indirectbr now comes from BodyBlock. + if (IBRPred == Target) + IBRPred = BodyBlock; + + // At this point Target only has PHIs, and BodyBlock has the rest of the + // block's body. Create a copy of Target that will be used by the "direct" + // preds. + ValueToValueMapTy VMap; + BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F); + + for (BasicBlock *Pred : OtherPreds) { + // If the target is a loop to itself, then the terminator of the split + // block needs to be updated. + if (Pred == Target) + BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + else + Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc); + } + + // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that + // they are clones, so the number of PHIs is the same. + // (a) Remove the edge coming from IBRPred from the "Direct" PHI + // (b) Leave that as the only edge in the "Indirect" PHI. + // (c) Merge the two in the body block.
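Before the PHI fix-up steps (a)-(c) are carried out below, the block surgery above has produced the following shape for the A/B/I example (a sketch; the .split and .clone suffixes come from the two calls in this hunk):

    //   A ----+
    //         +--> DirectSucc ("D0A", PHIs only) --+
    //   B ----+                                    +--> BodyBlock ("D1")
    //                                              |
    //   I --indirectbr--> Target ("D0B", PHIs only)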
+ BasicBlock::iterator Indirect = Target->begin(), + End = Target->getFirstNonPHI()->getIterator(); + BasicBlock::iterator Direct = DirectSucc->begin(); + BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt(); + + assert(&*End == Target->getTerminator() && + "Block was expected to only contain PHIs"); + + while (Indirect != End) { + PHINode *DirPHI = cast<PHINode>(Direct); + PHINode *IndPHI = cast<PHINode>(Indirect); + + // Now, clean up - the direct block shouldn't get the indirect value, + // and vice versa. + DirPHI->removeIncomingValue(IBRPred); + Direct++; + + // Advance the pointer here, to avoid invalidation issues when the old + // PHI is erased. + Indirect++; + + PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI); + NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred), + IBRPred); + + // Create a PHI in the body block, to merge the direct and indirect + // predecessors. + PHINode *MergePHI = + PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert); + MergePHI->addIncoming(NewIndPHI, Target); + MergePHI->addIncoming(DirPHI, DirectSucc); + + IndPHI->replaceAllUsesWith(MergePHI); + IndPHI->eraseFromParent(); + } + + Changed = true; + } + + return Changed; +} + /// Eliminate blocks that contain only PHI nodes, debug info directives, and an /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split /// edges in ways that are non-optimal for isel. Start by eliminating these @@ -1090,6 +1285,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) { return false; } +/// Duplicate and sink the given 'and' instruction into user blocks where it is +/// used in a compare to allow isel to generate better code for targets where +/// this operation can be combined. +/// +/// Return true if any changes are made. +static bool sinkAndCmp0Expression(Instruction *AndI, + const TargetLowering &TLI, + SetOfInstrs &InsertedInsts) { + // Double-check that we're not trying to optimize an instruction that was + // already optimized by some other part of this pass. + assert(!InsertedInsts.count(AndI) && + "Attempting to optimize already optimized and instruction"); + (void) InsertedInsts; + + // Nothing to do for single use in same basic block. + if (AndI->hasOneUse() && + AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) + return false; + + // Try to avoid cases where sinking/duplicating is likely to increase register + // pressure. + if (!isa<ConstantInt>(AndI->getOperand(0)) && + !isa<ConstantInt>(AndI->getOperand(1)) && + AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) + return false; + + for (auto *U : AndI->users()) { + Instruction *User = cast<Instruction>(U); + + // Only sink for and mask feeding icmp with 0. + if (!isa<ICmpInst>(User)) + return false; + + auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); + if (!CmpC || !CmpC->isZero()) + return false; + } + + if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) + return false; + + DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); + DEBUG(AndI->getParent()->dump()); + + // Push the 'and' into the same block as the icmp 0. There should only be + // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any + // others, so we don't need to keep track of which BBs we insert into. 
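The loop that follows performs the duplication. Its intended effect, sketched as IR in comments (value names invented), is to place a copy of the 'and' next to each zero-compare so instruction selection can fuse the pair on targets whose isMaskAndCmp0FoldingBeneficial hook returns true:

    // Before:                          // After:
    //   entry:                         //   entry:
    //     %a = and i64 %x, 255         //     ; 'and' deleted here
    //     br ...                       //     br ...
    //   user:                          //   user:
    //     %c = icmp eq i64 %a, 0       //     %a1 = and i64 %x, 255
    //     br i1 %c, ...                //     %c = icmp eq i64 %a1, 0
    //                                  //     br i1 %c, ...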
+ for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); + UI != E; ) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Preincrement use iterator so we don't invalidate it. + ++UI; + + DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); + + // Keep the 'and' in the same place if the use is already in the same block. + Instruction *InsertPt = + User->getParent() == AndI->getParent() ? AndI : User; + Instruction *InsertedAnd = + BinaryOperator::Create(Instruction::And, AndI->getOperand(0), + AndI->getOperand(1), "", InsertPt); + // Propagate the debug info. + InsertedAnd->setDebugLoc(AndI->getDebugLoc()); + + // Replace a use of the 'and' with a use of the new 'and'. + TheUse = InsertedAnd; + ++NumAndUses; + DEBUG(User->getParent()->dump()); + } + + // We removed all uses, nuke the and. + AndI->eraseFromParent(); + return true; +} + /// Check if the candidates could be combined with a shift instruction, which /// includes: /// 1. Truncate instruction @@ -1278,519 +1550,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, -// <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.load, label %else -// -//cond.load: ; preds = %0 -// %4 = getelementptr i32* %1, i32 0 -// %5 = load i32* %4 -// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 -// br label %else -// -//else: ; preds = %0, %cond.load -// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] -// %7 = extractelement <16 x i1> %mask, i32 1 -// %8 = icmp eq i1 %7, true -// br i1 %8, label %cond.load1, label %else2 -// -//cond.load1: ; preds = %else -// %9 = getelementptr i32* %1, i32 1 -// %10 = load i32* %9 -// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 -// br label %else2 -// -//else2: ; preds = %else, %cond.load1 -// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -// %12 = extractelement <16 x i1> %mask, i32 2 -// %13 = icmp eq i1 %12, true -// br i1 %13, label %cond.load4, label %else5 -// -static void scalarizeMaskedLoad(CallInst *CI) { - Value *Ptr = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - assert(VecType && "Unexpected return type of masked load intrinsic"); - - Type *EltTy = CI->getType()->getVectorElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa<Constant>(Mask) && - cast<Constant>(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. 
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - - if (isa<ConstantVector>(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_load = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, -// <16 x i1> %mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.store, label %else -// -// cond.store: ; preds = %0 -// %4 = extractelement <16 x i32> %val, i32 0 -// %5 = getelementptr i32* %1, i32 0 -// store i32 %4, i32* %5 -// br label %else -// -// else: ; preds = %0, %cond.store -// %6 = extractelement <16 x i1> %mask, i32 
1 -// %7 = icmp eq i1 %6, true -// br i1 %7, label %cond.store1, label %else2 -// -// cond.store1: ; preds = %else -// %8 = extractelement <16 x i32> %val, i32 1 -// %9 = getelementptr i32* %1, i32 1 -// store i32 %8, i32* %9 -// br label %else2 -// . . . -static void scalarizeMaskedStore(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast<VectorType>(Src->getType()); - assert(VecType && "Unexpected data type in masked store intrinsic"); - - Type *EltTy = VecType->getElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa<Constant>(Mask) && - cast<Constant>(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Builder.CreateAlignedStore(Src, Ptr, AlignVal); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. - AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - if (isa<ConstantVector>(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - } - CI->eraseFromParent(); - return; - } - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_store, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %OneElt = extractelement <16 x i32> %Src, i32 Idx - // %EltAddr = getelementptr i32* %1, i32 0 - // %store i32 %OneElt, i32* %EltAddr - // - BasicBlock *CondBlock = - IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - -// Translate a masked gather intrinsic like -// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, -// <16 x i1> %Mask, <16 x i32> %Src) -// to a chain of basic blocks, with loading element one-by-one if -// the 
appropriate mask bit is set -// -// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> %Mask, i32 0 -// % ToLoad0 = icmp eq i1 % Mask0, true -// br i1 % ToLoad0, label %cond.load, label %else -// -// cond.load: -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// % Load0 = load i32, i32* % Ptr0, align 4 -// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 -// br label %else -// -// else: -// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] -// % Mask1 = extractelement <16 x i1> %Mask, i32 1 -// % ToLoad1 = icmp eq i1 % Mask1, true -// br i1 % ToLoad1, label %cond.load1, label %else2 -// -// cond.load1: -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// % Load1 = load i32, i32* % Ptr1, align 4 -// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 -// br label %else2 -// . . . -// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src -// ret <16 x i32> %Result -static void scalarizeMaskedGather(CallInst *CI) { - Value *Ptrs = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - - assert(VecType && "Unexpected return type of masked load intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); - - // Shorten the way if the mask is a vector of constants. 
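For reference, the four expansions being deleted in this hunk all implement simple per-element semantics; with this patch the scalarization lives in the separate ScalarizeMaskedMemIntrin pass registered earlier in this diff. A minimal plain-C++ model of the gather case (names are illustrative only; the load, store, and scatter cases are analogous):

    #include <cstddef>

    // Reference semantics of llvm.masked.gather: enabled lanes load through
    // their own pointer, masked-off lanes keep the pass-through value.
    template <typename T, std::size_t N>
    void referenceGather(const bool (&Mask)[N], T *const (&Ptrs)[N],
                         const T (&PassThru)[N], T (&Result)[N]) {
      for (std::size_t I = 0; I != N; ++I)
        Result[I] = Mask[I] ? *Ptrs[I] : PassThru[I];
    }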
- bool IsConstMask = isa<ConstantVector>(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx), - "Res" + Twine(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %Mask1 = extractelement <16 x i1> %Mask, i32 1 - // %ToLoad1 = icmp eq i1 %Mask1, true - // br i1 %ToLoad1, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToLoad" + Twine(Idx)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), - "Res" + Twine(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked scatter intrinsic, like -// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, -// <16 x i1> %Mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set. 
-// -// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> % Mask, i32 0 -// % ToStore0 = icmp eq i1 % Mask0, true -// br i1 %ToStore0, label %cond.store, label %else -// -// cond.store: -// % Elt0 = extractelement <16 x i32> %Src, i32 0 -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// store i32 %Elt0, i32* % Ptr0, align 4 -// br label %else -// -// else: -// % Mask1 = extractelement <16 x i1> % Mask, i32 1 -// % ToStore1 = icmp eq i1 % Mask1, true -// br i1 % ToStore1, label %cond.store1, label %else2 -// -// cond.store1: -// % Elt1 = extractelement <16 x i32> %Src, i32 1 -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// store i32 % Elt1, i32* % Ptr1, align 4 -// br label %else2 -// . . . -static void scalarizeMaskedScatter(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptrs = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - assert(isa<VectorType>(Src->getType()) && - "Unexpected data type in masked scatter intrinsic"); - assert(isa<VectorType>(Ptrs->getType()) && - isa<PointerType>(Ptrs->getType()->getVectorElementType()) && - "Vector of pointers is expected in masked scatter intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - unsigned VectorWidth = Src->getType()->getVectorNumElements(); - - // Shorten the way if the mask is a vector of constants. - bool IsConstMask = isa<ConstantVector>(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - } - CI->eraseFromParent(); - return; - } - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - // Fill the "else" block, created in the previous iteration - // - // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx - // % ToStore = icmp eq i1 % Mask1, true - // br i1 % ToStore, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = - Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToStore" + Twine(Idx)); - - // Create "cond" block - // - // % Elt1 = extractelement <16 x i32> %Src, i32 1 - // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 - // %store i32 % Elt1, i32* % Ptr1 - // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - /// 
If counting leading or trailing zeros is an expensive operation and a zero /// input is defined, add a check for zero to avoid calling the intrinsic. /// @@ -1955,10 +1714,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { ConstantInt *RetVal = lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true); // Substituting this can cause recursive simplifications, which can - // invalidate our iterator. Use a WeakVH to hold onto it in case this + // invalidate our iterator. Use a WeakTrackingVH to hold onto it in case + // this // happens. Value *CurValue = &*CurInstIterator; - WeakVH IterHandle(CurValue); + WeakTrackingVH IterHandle(CurValue); replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); @@ -1970,39 +1730,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } return true; } - case Intrinsic::masked_load: { - // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType())) { - scalarizeMaskedLoad(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { - scalarizeMaskedStore(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_gather: { - if (!TTI->isLegalMaskedGather(CI->getType())) { - scalarizeMaskedGather(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_scatter: { - if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { - scalarizeMaskedScatter(CI); - ModifiedDT = true; - return true; - } - return false; - } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); @@ -2028,16 +1755,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } if (TLI) { - // Unknown address space. - // TODO: Target hook to pick which address space the intrinsic cares - // about? - unsigned AddrSpace = ~0u; SmallVector<Value*, 2> PtrOps; Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) - while (!PtrOps.empty()) - if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) { + Value *PtrVal = PtrOps.pop_back_val(); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) return true; + } } } @@ -2168,11 +1894,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CalleeAttrs = CS.getAttributes(); - if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias) != - AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). - removeAttribute(Attribute::NoAlias)) + AttributeList CalleeAttrs = CS.getAttributes(); + if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) != + AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -2561,25 +2287,30 @@ class TypePromotionTransaction { OperandsHider Hider; /// Keep track of the uses replaced, if any. UsesReplacer *Replacer; + /// Keep track of instructions removed. 
+ SetOfInstrs &RemovedInsts; public: /// \brief Remove all reference of \p Inst and optinally replace all its /// uses with New. + /// \p RemovedInsts Keep track of the instructions removed by this Action. /// \pre If !Inst->use_empty(), then New != nullptr - InstructionRemover(Instruction *Inst, Value *New = nullptr) + InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, + Value *New = nullptr) : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), - Replacer(nullptr) { + Replacer(nullptr), RemovedInsts(RemovedInsts) { if (New) Replacer = new UsesReplacer(Inst, New); DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); + RemovedInsts.insert(Inst); + /// The instructions removed here will be freed after completing + /// optimizeBlock() for all blocks as we need to keep track of the + /// removed instructions during promotion. Inst->removeFromParent(); } ~InstructionRemover() override { delete Replacer; } - /// \brief Really remove the instruction. - void commit() override { delete Inst; } - /// \brief Resurrect the instruction and reassign it to the proper uses if /// new value was provided when build this action. void undo() override { @@ -2588,6 +2319,7 @@ class TypePromotionTransaction { if (Replacer) Replacer->undo(); Hider.undo(); + RemovedInsts.erase(Inst); } }; @@ -2596,6 +2328,10 @@ public: /// The restoration point is a pointer to an action instead of an iterator /// because the iterator may be invalidated but not the pointer. typedef const TypePromotionAction *ConstRestorationPt; + + TypePromotionTransaction(SetOfInstrs &RemovedInsts) + : RemovedInsts(RemovedInsts) {} + /// Advocate every changes made in that transaction. void commit(); /// Undo all the changes made after the given point. @@ -2627,6 +2363,7 @@ private: /// The ordered list of actions made so far. SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions; typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt; + SetOfInstrs &RemovedInsts; }; void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, @@ -2638,7 +2375,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( - make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal)); + make_unique<TypePromotionTransaction::InstructionRemover>(Inst, + RemovedInsts, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, @@ -2705,8 +2443,8 @@ void TypePromotionTransaction::rollback( /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl<Instruction*> &AddrModeInsts; - const TargetMachine &TM; const TargetLowering &TLI; + const TargetRegisterInfo &TRI; const DataLayout &DL; /// AccessTy/MemoryInst - This is the type for the access (e.g. 
double) and @@ -2731,14 +2469,14 @@ class AddressingModeMatcher { bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, - const TargetMachine &TM, Type *AT, unsigned AS, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, + Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) - : AddrModeInsts(AMI), TM(TM), - TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) - ->getTargetLowering()), + : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT) { @@ -2756,13 +2494,15 @@ public: static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst, SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetMachine &TM, + const TargetLowering &TLI, + const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, + AccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -3583,18 +3323,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { /// Check to see if all uses of OpVal by the specified inline asm call are due /// to memory operands. If so, return true, otherwise return false. static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetMachine &TM) { + const TargetLowering &TLI, + const TargetRegisterInfo &TRI) { const Function *F = CI->getParent()->getParent(); - const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); - const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, + TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, ImmutableCallSite(CI)); + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. - TLI->ComputeConstraintToUse(OpInfo, SDValue()); + TLI.ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! @@ -3613,7 +3353,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, static bool FindAllMemoryUses( Instruction *I, SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, - SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { + SmallPtrSetImpl<Instruction *> &ConsideredInsts, + const TargetLowering &TLI, const TargetRegisterInfo &TRI) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -3635,11 +3376,28 @@ static bool FindAllMemoryUses( if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { unsigned opNo = U.getOperandNo(); - if (opNo == 0) return true; // Storing addr, not into addr. + if (opNo != StoreInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. 
MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicRMWInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(RMW, opNo)); + continue; + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { + unsigned opNo = U.getOperandNo(); + if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) + return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(CmpX, opNo)); + continue; + } + if (CallInst *CI = dyn_cast<CallInst>(UserI)) { // If this is a cold call, we can sink the addressing calculation into // the cold path. See optimizeCallInst @@ -3650,12 +3408,12 @@ static bool FindAllMemoryUses( if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. - if (!IsOperandAMemoryOperand(CI, IA, I, TM)) + if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI)) return true; continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI)) return true; } @@ -3743,7 +3501,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // the use is just a particularly nice way of sinking it. SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of @@ -3775,7 +3533,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, + AddressAccessTy, AS, MemoryInst, Result, InsertedInsts, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; @@ -3844,7 +3603,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool IsNumUsesConsensusValid = false; SmallVector<Instruction*, 16> AddrModeInsts; ExtAddrMode AddrMode; - TypePromotionTransaction TPT; + TypePromotionTransaction TPT(RemovedInsts); TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); while (!worklist.empty()) { @@ -3869,7 +3628,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // addressing instructions might have. 
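The three cases FindAllMemoryUses distinguishes above (store, atomicrmw, cmpxchg) share one shape: a use of the computed address is foldable only when it sits in the instruction's pointer-operand slot; in any other slot the address is being stored or exchanged as data. A minimal sketch of that check, assuming llvm/IR/Instructions.h and a using-directive for the llvm namespace, built only from the accessors that appear in this hunk:

    // Sketch: does this use of an address value actually address memory?
    // getPointerOperandIndex() is a static per-class constant (1 for stores,
    // 0 for the two atomics), so comparing it against the use's operand
    // number separates "used as address" from "stored as data".
    static bool isAddressingUse(const Use &U) {
      Instruction *UserI = cast<Instruction>(U.getUser());
      unsigned OpNo = U.getOperandNo();
      if (isa<LoadInst>(UserI))
        return true; // A load's only operand is the address.
      if (isa<StoreInst>(UserI))
        return OpNo == StoreInst::getPointerOperandIndex();
      if (isa<AtomicRMWInst>(UserI))
        return OpNo == AtomicRMWInst::getPointerOperandIndex();
      if (isa<AtomicCmpXchgInst>(UserI))
        return OpNo == AtomicCmpXchgInst::getPointerOperandIndex();
      return false;
    }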
SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, + V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI, InsertedInsts, PromotedInsts, TPT); // This check is broken into two cases with very similar code to avoid using @@ -3935,11 +3694,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && TM && - TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) - ->useAA())) { + SubtargetInfo->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4042,7 +3800,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // We need to add this separately from the scale above to help with // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } @@ -4053,12 +3811,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = ResultPtr; } else { if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy); SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr"); } if (SunkAddr->getType() != Addr->getType()) - SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4140,9 +3898,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // using it. if (Repl->use_empty()) { // This can cause recursive deletion, which can invalidate our iterator. - // Use a WeakVH to hold onto it in case this happens. + // Use a WeakTrackingVH to hold onto it in case this happens. Value *CurValue = &*CurInstIterator; - WeakVH IterHandle(CurValue); + WeakTrackingVH IterHandle(CurValue); BasicBlock *BB = CurInstIterator->getParent(); RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); @@ -4185,14 +3943,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { return MadeChange; } -/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// \brief Check if all the uses of \p Val are equivalent (or free) zero or /// sign extensions. 
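The recurring CreateBitCast to CreatePointerCast switch in optimizeMemoryInst above is not cosmetic: when the sunk address and the memory operand disagree on address space, a plain bitcast between the two pointer types is invalid IR, whereas CreatePointerCast emits either a bitcast or an addrspacecast as appropriate. A hedged sketch of the pattern (IRBuilder from llvm/IR/IRBuilder.h; the helper name is illustrative, not from the source):

    // Coerce a sunk address to the pointer type the memory access expects.
    // The guard mirrors the hunk above; CreatePointerCast then chooses
    // between bitcast and addrspacecast based on the two address spaces.
    Value *coerceAddr(IRBuilder<> &Builder, Value *SunkAddr, Type *AddrTy) {
      if (SunkAddr->getType() != AddrTy)
        SunkAddr = Builder.CreatePointerCast(SunkAddr, AddrTy);
      return SunkAddr;
    }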
-static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { - assert(!Inst->use_empty() && "Input must have at least one use"); - const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); +static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { + assert(!Val->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast<Instruction>(*Val->user_begin()); bool IsSExt = isa<SExtInst>(FirstUser); Type *ExtTy = FirstUser->getType(); - for (const User *U : Inst->users()) { + for (const User *U : Val->users()) { const Instruction *UI = cast<Instruction>(U); if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) return false; @@ -4202,11 +3960,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { continue; // If IsSExt is true, we are in this situation: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty1 a to ty3 // Assuming ty2 is shorter than ty3, this could be turned into: - // a = Inst + // a = Val // b = sext ty1 a to ty2 // c = sext ty2 b to ty3 // However, the last sext is not free. @@ -4233,51 +3991,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { return true; } -/// \brief Try to form ExtLd by promoting \p Exts until they reach a -/// load instruction. -/// If an ext(load) can be formed, it is returned via \p LI for the load -/// and \p Inst for the extension. -/// Otherwise LI == nullptr and Inst == nullptr. -/// When some promotion happened, \p TPT contains the proper state to -/// revert them. +/// \brief Try to speculatively promote extensions in \p Exts and continue +/// promoting through newly promoted operands recursively as far as doing so is +/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. +/// When some promotion happened, \p TPT contains the proper state to revert +/// them. /// -/// \return true when promoting was necessary to expose the ext(load) -/// opportunity, false otherwise. -/// -/// Example: -/// \code -/// %ld = load i32* %addr -/// %add = add nuw i32 %ld, 4 -/// %zext = zext i32 %add to i64 -/// \endcode -/// => -/// \code -/// %ld = load i32* %addr -/// %zext = zext i32 %ld to i64 -/// %add = add nuw i64 %zext, 4 -/// \encode -/// Thanks to the promotion, we can match zext(load i32*) to i64. -bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, - LoadInst *&LI, Instruction *&Inst, - const SmallVectorImpl<Instruction *> &Exts, - unsigned CreatedInstsCost = 0) { - // Iterate over all the extensions to see if one form an ext(load). +/// \return true if some promotion happened, false otherwise. +bool CodeGenPrepare::tryToPromoteExts( + TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, + SmallVectorImpl<Instruction *> &ProfitablyMovedExts, + unsigned CreatedInstsCost) { + bool Promoted = false; + + // Iterate over all the extensions to try to promote them. for (auto I : Exts) { - // Check if we directly have ext(load). - if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { - Inst = I; - // No promotion happened here. - return false; + // Early check if we directly have ext(load). + if (isa<LoadInst>(I->getOperand(0))) { + ProfitablyMovedExts.push_back(I); + continue; } - // Check whether or not we want to do any promotion. + + // Check whether or not we want to do any promotion. 
The reason we have + // this check inside the for loop is to catch the case where an extension + // is directly fed by a load because in such a case the extension can be moved + // up without any promotion on its operands. if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) - continue; + return false; + // Get the action to perform the promotion. - TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( - I, InsertedInsts, *TLI, PromotedInsts); + TypePromotionHelper::Action TPH = + TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts); // Check if we can promote. - if (!TPH) + if (!TPH) { + // Save the current extension as we cannot move up through its operand. + ProfitablyMovedExts.push_back(I); continue; + } + // Save the current state. TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); @@ -4297,110 +4048,293 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, // one extension but leave one. However, we optimistically keep going, // because the new extension may be removed too. long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; - TotalCreatedInstsCost -= ExtCost; + // FIXME: It would be possible to propagate a negative value instead of + // conservatively ceiling it to 0. + TotalCreatedInstsCost = + std::max((long long)0, (TotalCreatedInstsCost - ExtCost)); if (!StressExtLdPromotion && (TotalCreatedInstsCost > 1 || !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { - // The promotion is not profitable, rollback to the previous state. + // This promotion is not profitable, roll back to the previous state, and + // save the current extension in ProfitablyMovedExts as the latest + // speculative promotion turned out to be unprofitable. + TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); + continue; + } + // Continue promoting NewExts as far as doing so is profitable. + SmallVector<Instruction *, 2> NewlyMovedExts; + (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); + bool NewPromoted = false; + for (auto ExtInst : NewlyMovedExts) { + Instruction *MovedExt = cast<Instruction>(ExtInst); + Value *ExtOperand = MovedExt->getOperand(0); + // If we have reached a load, we need this extra profitability check + // as it could potentially be merged into an ext(load). + if (isa<LoadInst>(ExtOperand) && + !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || + (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) + continue; + + ProfitablyMovedExts.push_back(MovedExt); + NewPromoted = true; + } + + // If none of the speculative promotions for NewExts is profitable, roll back + // and save the current extension (I) as the last profitable extension. + if (!NewPromoted) { TPT.rollback(LastKnownGood); + ProfitablyMovedExts.push_back(I); continue; } // The promotion is profitable. - // Check if it exposes an ext(load). - (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); - if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || - // If we have created a new extension, i.e., now we have two - // extensions. We must make sure one of them is merged with - // the load, otherwise we may degrade the code quality. - (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) - // Promotion happened. - return true; - // If this does not help to expose an ext(load) then, rollback. - TPT.rollback(LastKnownGood); + Promoted = true; } - // None of the extension can form an ext(load). 
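The cost bookkeeping in tryToPromoteExts above is easy to model in isolation: each speculative step adds the instructions it created, credits the one extension it hopes to absorb, and clamps the running total at zero (the FIXME notes the clamp is deliberately conservative). A self-contained toy of that arithmetic, with the profitability threshold from the hunk:

    #include <algorithm>

    // Toy model of the clamped promotion cost. A chain remains worth
    // pursuing while the clamped total stays at or below 1, i.e. at most
    // one extension that has not yet been folded away is in flight.
    long long promotionCost(long long CreatedInstsCost,
                            long long NewCreatedInstsCost,
                            long long ExtCost) {
      return std::max(0LL, CreatedInstsCost + NewCreatedInstsCost - ExtCost);
    }

    bool stillProfitable(long long TotalCreatedInstsCost) {
      return TotalCreatedInstsCost <= 1;
    }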
- LI = nullptr; - Inst = nullptr; - return false; + return Promoted; } -/// Move a zext or sext fed by a load into the same basic block as the load, -/// unless conditions are unfavorable. This allows SelectionDAG to fold the -/// extend into the load. -/// \p I[in/out] the extension may be modified during the process if some -/// promotions apply. -/// -bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { - // ExtLoad formation infrastructure requires TLI to be effective. - if (!TLI) - return false; +/// Merge redundant sexts when one dominates the other. +bool CodeGenPrepare::mergeSExts(Function &F) { + DominatorTree DT(F); + bool Changed = false; + for (auto &Entry : ValToSExtendedUses) { + SExts &Insts = Entry.second; + SExts CurPts; + for (Instruction *Inst : Insts) { + if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || + Inst->getOperand(0) != Entry.first) + continue; + bool inserted = false; + for (auto &Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + Pt->replaceAllUsesWith(Inst); + RemovedInsts.insert(Pt); + Pt->removeFromParent(); + Pt = Inst; + inserted = true; + Changed = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // experiments show it is not profitable. + continue; + Inst->replaceAllUsesWith(Pt); + RemovedInsts.insert(Inst); + Inst->removeFromParent(); + inserted = true; + Changed = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } + return Changed; +} - // Try to promote a chain of computation if it allows to form - // an extended load. - TypePromotionTransaction TPT; - TypePromotionTransaction::ConstRestorationPt LastKnownGood = - TPT.getRestorationPoint(); - SmallVector<Instruction *, 1> Exts; - Exts.push_back(I); - // Look for a load being extended. - LoadInst *LI = nullptr; - Instruction *OldExt = I; - bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); - if (!LI || !I) { - assert(!HasPromoted && !LI && "If we did not match any load instruction " - "the code must remain the same"); - I = OldExt; - return false; +/// Return true if an ext(load) can be formed from an extension in +/// \p MovedExts. +bool CodeGenPrepare::canFormExtLd( + const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, + Instruction *&Inst, bool HasPromoted) { + for (auto *MovedExtInst : MovedExts) { + if (isa<LoadInst>(MovedExtInst->getOperand(0))) { + LI = cast<LoadInst>(MovedExtInst->getOperand(0)); + Inst = MovedExtInst; + break; + } } + if (!LI) + return false; // If they're already in the same block, there's nothing to do. // Make the cheap checks first if we did not promote. // If we promoted, we need to check if it is indeed profitable. - if (!HasPromoted && LI->getParent() == I->getParent()) + if (!HasPromoted && LI->getParent() == Inst->getParent()) return false; - EVT VT = TLI->getValueType(*DL, I->getType()); + EVT VT = TLI->getValueType(*DL, Inst->getType()); EVT LoadVT = TLI->getValueType(*DL, LI->getType()); // If the load has other users and the truncate is not free, this probably // isn't worthwhile. - if (!LI->hasOneUse() && - (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(I->getType(), LI->getType())) { - I = OldExt; - TPT.rollback(LastKnownGood); + if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && + !TLI->isTruncateFree(Inst->getType(), LI->getType())) return false; - } // Check whether the target supports casts folded into loads. 
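At its core, mergeSExts above runs a pairwise dominance test over sexts of the same value: whichever of two identical extensions dominates can serve every use of the other. A reduced sketch of the merge step, using only calls from the hunk (A and B stand for two matching sext instructions, DT for the freshly built DominatorTree):

    // If A dominates B, B is redundant: A's result is available at every
    // point where B's is used. B is only unlinked here, not deleted;
    // RemovedInsts defers the actual freeing until all blocks are optimized.
    if (DT.dominates(A, B)) {
      B->replaceAllUsesWith(A);
      RemovedInsts.insert(B);
      B->removeFromParent();
    }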
unsigned LType; - if (isa<ZExtInst>(I)) + if (isa<ZExtInst>(Inst)) LType = ISD::ZEXTLOAD; else { - assert(isa<SExtInst>(I) && "Unexpected ext type!"); + assert(isa<SExtInst>(Inst) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } - if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) { - I = OldExt; - TPT.rollback(LastKnownGood); + + return TLI->isLoadExtLegal(LType, VT, LoadVT); +} + +/// Move a zext or sext fed by a load into the same basic block as the load, +/// unless conditions are unfavorable. This allows SelectionDAG to fold the +/// extend into the load. +/// +/// E.g., +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +/// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \endcode +/// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which +/// allows us to match zext(load i32*) to i64. +/// +/// Also, try to promote the computations used to obtain a sign extended +/// value used in memory accesses. +/// E.g., +/// \code +/// a = add nsw i32 b, 3 +/// d = sext i32 a to i64 +/// e = getelementptr ..., i64 d +/// \endcode +/// => +/// \code +/// f = sext i32 b to i64 +/// a = add nsw i64 f, 3 +/// e = getelementptr ..., i64 a +/// \endcode +/// +/// \p Inst[in/out] the extension may be modified during the process if some +/// promotions apply. +bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { + // ExtLoad formation and address type promotion infrastructure requires TLI to + // be effective. + if (!TLI) return false; + + bool AllowPromotionWithoutCommonHeader = false; + /// See if it is an interesting sext operation for the address type + /// promotion before trying to promote it, e.g., the ones with the right + /// type and used in memory accesses. + bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( + *Inst, AllowPromotionWithoutCommonHeader); + TypePromotionTransaction TPT(RemovedInsts); + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector<Instruction *, 1> Exts; + SmallVector<Instruction *, 2> SpeculativelyMovedExts; + Exts.push_back(Inst); + + bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); + + // Look for a load being extended. + LoadInst *LI = nullptr; + Instruction *ExtFedByLoad; + + // Try to promote a chain of computation if it allows to form an extended + // load. + if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { + assert(LI && ExtFedByLoad && "Expect a valid load and extension"); + TPT.commit(); + // Move the extend into the same block as the load + ExtFedByLoad->removeFromParent(); + ExtFedByLoad->insertAfter(LI); + // CGP does not check if the zext would be speculatively executed when moved + // to the same basic block as the load. Preserving its original location + // would pessimize the debugging experience, as well as negatively impact + // the quality of sample pgo. We don't want to use "line 0" as that has a + // size cost in the line-table section and logically the zext can be seen as + // part of the load. Therefore we conservatively reuse the same debug + // location for the load and the zext. + ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); + ++NumExtsMoved; + Inst = ExtFedByLoad; + return true; } - // Move the extend into the same block as the load, so that SelectionDAG - // can fold it. 
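The ext(load) payoff in optimizeExt above comes down to three calls, isolated here as a sketch with the names from the hunk. SelectionDAG selects one basic block at a time, so the extension must physically sit next to the load before the pair can be matched as a single extending load:

    ExtFedByLoad->removeFromParent();  // Unlink the ext from its old block...
    ExtFedByLoad->insertAfter(LI);     // ...and re-insert it after the load.
    // Reuse the load's debug location: the moved ext may now execute
    // speculatively, and "line 0" would cost line-table size, so the ext
    // is treated as part of the load for debugging purposes.
    ExtFedByLoad->setDebugLoc(LI->getDebugLoc());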
- TPT.commit(); - I->removeFromParent(); - I->insertAfter(LI); - // CGP does not check if the zext would be speculatively executed when moved - // to the same basic block as the load. Preserving its original location would - // pessimize the debugging experience, as well as negatively impact the - // quality of sample pgo. We don't want to use "line 0" as that has a - // size cost in the line-table section and logically the zext can be seen as - // part of the load. Therefore we conservatively reuse the same debug location - // for the load and the zext. - I->setDebugLoc(LI->getDebugLoc()); - ++NumExtsMoved; - return true; + // Continue promoting SExts if the target considers address type promotion + // worthwhile. + if (ATPConsiderable && + performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, + HasPromoted, TPT, SpeculativelyMovedExts)) + return true; + + TPT.rollback(LastKnownGood); + return false; +} + +// Perform address type promotion if doing so is profitable. +// If AllowPromotionWithoutCommonHeader == false, we should find other sext +// instructions that sign extended the same initial value. However, if +// AllowPromotionWithoutCommonHeader == true, we expect promoting the +// extension to be profitable on its own. +bool CodeGenPrepare::performAddressTypePromotion( + Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, + bool HasPromoted, TypePromotionTransaction &TPT, + SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { + bool Promoted = false; + SmallPtrSet<Instruction *, 1> UnhandledExts; + bool AllSeenFirst = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + DenseMap<Value *, Instruction *>::iterator AlreadySeen = + SeenChainsForSExt.find(HeadOfChain); + // If there is an unhandled SExt which has the same header, try to promote + // it as well. + if (AlreadySeen != SeenChainsForSExt.end()) { + if (AlreadySeen->second != nullptr) + UnhandledExts.insert(AlreadySeen->second); + AllSeenFirst = false; + } + } + + if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && + SpeculativelyMovedExts.size() == 1)) { + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + // Update Inst as promotion happened. + Inst = SpeculativelyMovedExts.pop_back_val(); + } else { + // This is the first chain visited from the header, keep the current chain + // as unhandled. Defer promoting it until we encounter another SExt + // chain derived from the same header. + for (auto I : SpeculativelyMovedExts) { + Value *HeadOfChain = I->getOperand(0); + SeenChainsForSExt[HeadOfChain] = Inst; + } + return false; + } + + if (!AllSeenFirst && !UnhandledExts.empty()) + for (auto VisitedSExt : UnhandledExts) { + if (RemovedInsts.count(VisitedSExt)) + continue; + TypePromotionTransaction TPT(RemovedInsts); + SmallVector<Instruction *, 1> Exts; + SmallVector<Instruction *, 2> Chains; + Exts.push_back(VisitedSExt); + bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); + TPT.commit(); + if (HasPromoted) + Promoted = true; + for (auto I : Chains) { + Value *HeadOfChain = I->getOperand(0); + // Mark this as handled. 
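The SeenChainsForSExt bookkeeping in performAddressTypePromotion above implements a small deferral protocol, and a self-contained toy makes its two states visible. Here ints stand in for head-of-chain Values and the parked pointer plays the role of the deferred sext chain; everything is illustrative:

    #include <map>

    // First sighting of a head value: park the chain and promote nothing.
    // Second sighting: promote both chains and mark the head handled by
    // storing nullptr, just as the pass does.
    std::map<int, const char *> SeenChains;

    bool visitChain(int Head, const char *Chain) {
      auto It = SeenChains.find(Head);
      if (It == SeenChains.end()) {
        SeenChains[Head] = Chain; // defer
        return false;
      }
      It->second = nullptr; // handled
      return true;          // promote this chain and the parked one
    }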
+ SeenChainsForSExt[HeadOfChain] = nullptr; + ValToSExtendedUses[HeadOfChain].push_back(I); + } + } + return Promoted; } bool CodeGenPrepare::optimizeExtUses(Instruction *I) { @@ -4534,13 +4468,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) return false; - // Skip loads we've already transformed or have no reason to transform. - if (Load->hasOneUse()) { - User *LoadUser = *Load->user_begin(); - if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && - !dyn_cast<PHINode>(LoadUser)) - return false; - } + // Skip loads we've already transformed. + if (Load->hasOneUse() && + InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) + return false; // Look at all uses of Load, looking through phis, to determine how many bits // of the loaded value are needed. @@ -4590,16 +4521,14 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { if (!ShlC) return false; uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); - auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); - DemandBits |= ShlDemandBits; + DemandBits.setLowBits(BitWidth - ShiftAmt); break; } case llvm::Instruction::Trunc: { EVT TruncVT = TLI->getValueType(*DL, I->getType()); unsigned TruncBitWidth = TruncVT.getSizeInBits(); - auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); - DemandBits |= TruncBits; + DemandBits.setLowBits(TruncBitWidth); break; } @@ -4620,7 +4549,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { // // Also avoid hoisting if we didn't see any ands with the exact DemandBits // mask, since these are the only ands that will be removed by isel. - if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) || WidestAndBits != DemandBits) return false; @@ -4636,6 +4565,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { IRBuilder<> Builder(Load->getNextNode()); auto *NewAnd = dyn_cast<Instruction>( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + // Mark this instruction as "inserted by CGP", so that other + // optimizations don't touch it. + InsertedInsts.insert(NewAnd); // Replace all uses of load with new and (except for the use of load in the // new and itself). @@ -4985,7 +4917,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { auto *ExtInst = CastInst::Create(ExtType, Cond, NewType); ExtInst->insertBefore(SI); SI->setCondition(ExtInst); - for (SwitchInst::CaseIt Case : SI->cases()) { + for (auto Case : SI->cases()) { APInt NarrowConst = Case.getCaseValue()->getValue(); APInt WideConst = (ExtType == Instruction::ZExt) ? NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); @@ -5483,7 +5415,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. 
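The two setLowBits calls in optimizeLoadExt above replace hand-rolled all-ones/shift constructions, and the isMask(ActiveBits) query replaces the old APIntOps::isMask helper. A self-contained example of the demanded-bits arithmetic for a 32-bit load feeding a shl by 8 and a trunc to i16:

    #include <cassert>
    #include "llvm/ADT/APInt.h"

    void demandedBitsExample() {
      llvm::APInt DemandBits(32, 0);
      DemandBits.setLowBits(32 - 8); // shl i32 %load, 8 needs the low 24 bits
      DemandBits.setLowBits(16);     // trunc to i16 needs the low 16 (subsumed)
      // Still a contiguous low-bit mask, so the single-and rewrite applies.
      assert(DemandBits.isMask(24) && DemandBits == 0xFFFFFF);
    }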
- if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) { + if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; @@ -5514,7 +5446,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { - bool MadeChange = moveExtToFormExtLoad(I); + bool MadeChange = optimizeExt(I); return MadeChange | optimizeExtUses(I); } } @@ -5548,8 +5480,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { return false; } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + unsigned AS = RMW->getPointerAddressSpace(); + return optimizeMemoryInst(I, RMW->getPointerOperand(), + RMW->getType(), AS); + } + + if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { + unsigned AS = CmpX->getPointerAddressSpace(); + return optimizeMemoryInst(I, CmpX->getPointerOperand(), + CmpX->getCompareOperand()->getType(), AS); + } + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); + if (BinOp && (BinOp->getOpcode() == Instruction::And) && + EnableAndCmpSinking && TLI) + return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); + if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); @@ -5679,68 +5627,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { return MadeChange; } -// If there is a sequence that branches based on comparing a single bit -// against zero that can be combined into a single instruction, and the -// target supports folding these into a single instruction, sink the -// mask and compare into the branch uses. Do this before OptimizeBlock -> -// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being -// searched for. -bool CodeGenPrepare::sinkAndCmp(Function &F) { - if (!EnableAndCmpSinking) - return false; - if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) - return false; - bool MadeChange = false; - for (BasicBlock &BB : F) { - // Does this BB end with the following? - // %andVal = and %val, #single-bit-set - // %icmpVal = icmp %andResult, 0 - // br i1 %cmpVal label %dest1, label %dest2" - BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator()); - if (!Brcc || !Brcc->isConditional()) - continue; - ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0)); - if (!Cmp || Cmp->getParent() != &BB) - continue; - ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1)); - if (!Zero || !Zero->isZero()) - continue; - Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0)); - if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB) - continue; - ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1)); - if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) - continue; - DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump()); - - // Push the "and; icmp" for any users that are conditional branches. - // Since there can only be one branch use per BB, we don't need to keep - // track of which BBs we insert into. - for (Use &TheUse : Cmp->uses()) { - // Find brcc use. - BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse); - if (!BrccUser || !BrccUser->isConditional()) - continue; - BasicBlock *UserBB = BrccUser->getParent(); - if (UserBB == &BB) continue; - DEBUG(dbgs() << "found Brcc use\n"); - - // Sink the "and; icmp" to use. 
- MadeChange = true; - BinaryOperator *NewAnd = - BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", - BrccUser); - CmpInst *NewCmp = - CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, - "", BrccUser); - TheUse = NewCmp; - ++NumAndCmpsMoved; - DEBUG(BrccUser->getParent()->dump()); - } - } - return MadeChange; -} - /// \brief Scale down both weights to fit into uint32_t. static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; diff --git a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp index 1e46a7a99e7e..7f7350f5fb5c 100644 --- a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp +++ b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp @@ -41,7 +41,7 @@ namespace { Type *VoidTy = Type::getVoidTy(F.getContext()); Constant *CountingFn = F.getParent()->getOrInsertFunction(CountingFunctionName, - VoidTy, nullptr); + VoidTy); CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 5d60c3055456..b2d6652b075e 100644 --- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo &MFI = MF.getFrameInfo(); BitVector Pristine = MFI.getPristineRegs(MF); - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { - if (!IsReturnBlock && !Pristine.test(*I)) continue; + for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; + ++I) { + unsigned Reg = *I; + if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); @@ -645,10 +648,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, // as well. 
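The CountingFunctionInserter hunk above tracks an API change in this import: Module::getOrInsertFunction is now variadic over parameter types, so the old nullptr terminator that marked the end of the type list is gone. A sketch of the new call shape (assumes llvm/IR/Module.h; M is a Module and CountingFunctionName the pass's configured name):

    // Declare or fetch "void CountingFunctionName()"; any parameter types
    // would simply follow the return type, no sentinel required.
    llvm::Constant *CountingFn = M.getOrInsertFunction(
        CountingFunctionName, llvm::Type::getVoidTy(M.getContext()));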
const SUnit *SU = MISUnitMap[Q->second->getParent()]; if (!SU) continue; - for (DbgValueVector::iterator DVI = DbgValues.begin(), - DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q->second->getParent()) - UpdateDbgValue(*DVI->first, AntiDepReg, NewReg); + UpdateDbgValues(DbgValues, Q->second->getParent(), + AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp index 7b1b2d64fccc..65f58e5686e0 100644 --- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -213,10 +213,8 @@ VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf, VLIWPacketizerList::~VLIWPacketizerList() { - if (VLIWScheduler) - delete VLIWScheduler; - if (ResourceTracker) - delete ResourceTracker; + delete VLIWScheduler; + delete ResourceTracker; } diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 17c229a216ae..265dda16bfa7 100644 --- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -23,7 +23,7 @@ using namespace llvm; -#define DEBUG_TYPE "codegen-dce" +#define DEBUG_TYPE "dead-mi-elimination" STATISTIC(NumDeletes, "Number of dead instructions deleted"); @@ -54,7 +54,7 @@ namespace { char DeadMachineInstructionElim::ID = 0; char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID; -INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination", +INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE, "Remove dead machine instructions", false, false) bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { @@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Start out assuming that reserved registers are live out of this block. LivePhysRegs = MRI->getReservedRegs(); - // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not + // Add live-ins from successors to LivePhysRegs. Normally, physregs are not // live across blocks, but some targets (x86) can have flags live out of a // block. for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), diff --git a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp index a7ba694c144d..ab9a0592e017 100644 --- a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp +++ b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -132,8 +132,7 @@ private: char DetectDeadLanes::ID = 0; char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; -INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes", - false, false) +INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) /// Returns true if \p MI will get lowered to a series of COPY instructions. /// We call this a COPY-like instruction. 
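The ~VLIWPacketizerList cleanup above relies on a language guarantee rather than an LLVM API: deleting a null pointer is defined to be a no-op, so the if-guards only added branches. A self-contained illustration:

    struct Scheduler { /* ... */ };

    void teardown(Scheduler *VLIWScheduler, Scheduler *ResourceTracker) {
      if (VLIWScheduler)      // old style: the guard is redundant
        delete VLIWScheduler;
      delete ResourceTracker; // new style: identical behavior for nullptr
    }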
@@ -441,7 +440,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) { const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg); CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO); if (CrossCopy) - DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI); + DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI); } if (!CrossCopy) diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp index 38af19a04448..06ae5cd72c85 100644 --- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" @@ -34,8 +35,6 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered"); namespace { class DwarfEHPrepare : public FunctionPass { - const TargetMachine *TM; - // RewindFunction - _Unwind_Resume or the target equivalent. Constant *RewindFunction; @@ -52,15 +51,9 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in - // practice. DwarfEHPrepare() - : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr), DT(nullptr), - TLI(nullptr) {} - - DwarfEHPrepare(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), RewindFunction(nullptr), DT(nullptr), - TLI(nullptr) {} + : FunctionPass(ID), RewindFunction(nullptr), DT(nullptr), TLI(nullptr) { + } bool runOnFunction(Function &Fn) override; @@ -78,18 +71,18 @@ namespace { } // end anonymous namespace char DwarfEHPrepare::ID = 0; -INITIALIZE_TM_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare", - "Prepare DWARF exceptions", false, false) +INITIALIZE_PASS_BEGIN(DwarfEHPrepare, DEBUG_TYPE, + "Prepare DWARF exceptions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_TM_PASS_END(DwarfEHPrepare, "dwarfehprepare", - "Prepare DWARF exceptions", false, false) +INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE, + "Prepare DWARF exceptions", false, false) -FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) { - return new DwarfEHPrepare(TM); -} +FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); } void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); } @@ -254,9 +247,10 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) { } bool DwarfEHPrepare::runOnFunction(Function &Fn) { - assert(TM && "DWARF EH preparation requires a target machine"); + const TargetMachine &TM = + getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); + TLI = TM.getSubtargetImpl(Fn)->getTargetLowering(); bool Changed = InsertUnwindResumeCalls(Fn); DT = nullptr; TLI = nullptr; diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp index 729172796453..402afe75b141 100644 --- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -616,13 +616,13 @@ private: char EarlyIfConverter::ID = 0; 
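DwarfEHPrepare above is one instance of a pattern repeated throughout this import: instead of threading a TargetMachine through pass constructors (and needing the INITIALIZE_TM_PASS machinery), a pass declares a TargetPassConfig dependency and queries it when run. The shape, with every call taken from the hunk and class context elided:

    // In getAnalysisUsage(): have the pass manager schedule TargetPassConfig.
    AU.addRequired<TargetPassConfig>();

    // In runOnFunction(): recover the TargetMachine on demand. This is why
    // createDwarfEHPass() no longer takes a TargetMachine argument.
    const TargetMachine &TM =
        getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
    TLI = TM.getSubtargetImpl(Fn)->getTargetLowering();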
char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineBranchProbabilityInfo>(); diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp index 32c57e3e3705..e272d25047e6 100644 --- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp +++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp @@ -6,21 +6,9 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the execution dependency fix pass. -// -// Some X86 SSE instructions like mov, and, or, xor are available in different -// variants for different operand types. These variant instructions are -// equivalent, but on Nehalem and newer cpus there is extra latency -// transferring data between integer and floating point domains. ARM cores -// have similar issues when they are configured with both VFP and NEON -// pipelines. -// -// This pass changes the variant instructions to minimize domain crossings. -// -//===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ExecutionDepsFix.h" + #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,193 +23,18 @@ using namespace llvm; -#define DEBUG_TYPE "execution-fix" - -/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track -/// of execution domains. -/// -/// An open DomainValue represents a set of instructions that can still switch -/// execution domain. Multiple registers may refer to the same open -/// DomainValue - they will eventually be collapsed to the same execution -/// domain. -/// -/// A collapsed DomainValue represents a single register that has been forced -/// into one of more execution domains. There is a separate collapsed -/// DomainValue for each register, but it may contain multiple execution -/// domains. A register value is initially created in a single execution -/// domain, but if we were forced to pay the penalty of a domain crossing, we -/// keep track of the fact that the register is now available in multiple -/// domains. -namespace { -struct DomainValue { - // Basic reference counting. - unsigned Refs; - - // Bitmask of available domains. For an open DomainValue, it is the still - // possible domains for collapsing. For a collapsed DomainValue it is the - // domains where the register is available for free. - unsigned AvailableDomains; - - // Pointer to the next DomainValue in a chain. When two DomainValues are - // merged, Victim.Next is set to point to Victor, so old DomainValue - // references can be updated by following the chain. - DomainValue *Next; - - // Twiddleable instructions using or defining these registers. 
- SmallVector<MachineInstr*, 8> Instrs; - - // A collapsed DomainValue has no instructions to twiddle - it simply keeps - // track of the domains where the registers are already available. - bool isCollapsed() const { return Instrs.empty(); } - - // Is domain available? - bool hasDomain(unsigned domain) const { - assert(domain < - static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && - "undefined behavior"); - return AvailableDomains & (1u << domain); - } - - // Mark domain as available. - void addDomain(unsigned domain) { - AvailableDomains |= 1u << domain; - } - - // Restrict to a single domain available. - void setSingleDomain(unsigned domain) { - AvailableDomains = 1u << domain; - } - - // Return bitmask of domains that are available and in mask. - unsigned getCommonDomains(unsigned mask) const { - return AvailableDomains & mask; - } - - // First domain available. - unsigned getFirstDomain() const { - return countTrailingZeros(AvailableDomains); - } - - DomainValue() : Refs(0) { clear(); } - - // Clear this DomainValue and point to next which has all its data. - void clear() { - AvailableDomains = 0; - Next = nullptr; - Instrs.clear(); - } -}; -} - -namespace { -/// Information about a live register. -struct LiveReg { - /// Value currently in this register, or NULL when no value is being tracked. - /// This counts as a DomainValue reference. - DomainValue *Value; - - /// Instruction that defined this register, relative to the beginning of the - /// current basic block. When a LiveReg is used to represent a live-out - /// register, this value is relative to the end of the basic block, so it - /// will be a negative number. - int Def; -}; -} // anonymous namespace - -namespace { -class ExeDepsFix : public MachineFunctionPass { - static char ID; - SpecificBumpPtrAllocator<DomainValue> Allocator; - SmallVector<DomainValue*,16> Avail; - - const TargetRegisterClass *const RC; - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - std::vector<SmallVector<int, 1>> AliasMap; - const unsigned NumRegs; - LiveReg *LiveRegs; - typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap; - LiveOutMap LiveOuts; - - /// List of undefined register reads in this block in forward order. - std::vector<std::pair<MachineInstr*, unsigned> > UndefReads; - - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; - - /// True when the current block has a predecessor that hasn't been visited - /// yet. - bool SeenUnknownBackEdge; - -public: - ExeDepsFix(const TargetRegisterClass *rc) - : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - - StringRef getPassName() const override { return "Execution dependency fix"; } - -private: - iterator_range<SmallVectorImpl<int>::const_iterator> - regIndices(unsigned Reg) const; - - // DomainValue allocation. 
- DomainValue *alloc(int domain = -1); - DomainValue *retain(DomainValue *DV) { - if (DV) ++DV->Refs; - return DV; - } - void release(DomainValue*); - DomainValue *resolve(DomainValue*&); - - // LiveRegs manipulations. - void setLiveReg(int rx, DomainValue *DV); - void kill(int rx); - void force(int rx, unsigned domain); - void collapse(DomainValue *dv, unsigned domain); - bool merge(DomainValue *A, DomainValue *B); - - void enterBasicBlock(MachineBasicBlock*); - void leaveBasicBlock(MachineBasicBlock*); - void visitInstr(MachineInstr*); - void processDefs(MachineInstr*, bool Kill); - void visitSoftInstr(MachineInstr*, unsigned mask); - void visitHardInstr(MachineInstr*, unsigned domain); - void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); - bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); - void processUndefReads(MachineBasicBlock*); -}; -} - -char ExeDepsFix::ID = 0; +#define DEBUG_TYPE "execution-deps-fix" /// Translate TRI register number to a list of indices into our smaller tables /// of interesting registers. iterator_range<SmallVectorImpl<int>::const_iterator> -ExeDepsFix::regIndices(unsigned Reg) const { +ExecutionDepsFix::regIndices(unsigned Reg) const { assert(Reg < AliasMap.size() && "Invalid register"); const auto &Entry = AliasMap[Reg]; return make_range(Entry.begin(), Entry.end()); } -DomainValue *ExeDepsFix::alloc(int domain) { +DomainValue *ExecutionDepsFix::alloc(int domain) { DomainValue *dv = Avail.empty() ? new(Allocator.Allocate()) DomainValue : Avail.pop_back_val(); @@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) { /// Release a reference to DV. When the last reference is released, /// collapse if needed. -void ExeDepsFix::release(DomainValue *DV) { +void ExecutionDepsFix::release(DomainValue *DV) { while (DV) { assert(DV->Refs && "Bad DomainValue"); if (--DV->Refs) @@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) { /// Follow the chain of dead DomainValues until a live DomainValue is reached. /// Update the referenced pointer when necessary. -DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { +DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { DomainValue *DV = DVRef; if (!DV || !DV->Next) return DV; @@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) { } /// Set LiveRegs[rx] = dv, updating reference counts. -void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { +void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); @@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) { } // Kill register rx, recycle or collapse any DomainValue. -void ExeDepsFix::kill(int rx) { +void ExecutionDepsFix::kill(int rx) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (!LiveRegs[rx].Value) @@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) { } /// Force register rx into domain. -void ExeDepsFix::force(int rx, unsigned domain) { +void ExecutionDepsFix::force(int rx, unsigned domain) { assert(unsigned(rx) < NumRegs && "Invalid index"); assert(LiveRegs && "Must enter basic block first."); if (DomainValue *dv = LiveRegs[rx].Value) { @@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) { /// Collapse open DomainValue into given domain. If there are multiple /// registers using dv, they each get a unique collapsed DomainValue. 
-void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { +void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { assert(dv->hasDomain(domain) && "Cannot collapse"); // Collapse all the instructions. @@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) { } /// All instructions and registers in B are moved to A, and B is released. -bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { +bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { assert(!A->isCollapsed() && "Cannot merge into collapsed"); assert(!B->isCollapsed() && "Cannot merge from collapsed"); if (A == B) @@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { } /// Set up LiveRegs by merging predecessor live-out values. -void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { - // Detect back-edges from predecessors we haven't processed yet. - SeenUnknownBackEdge = false; - +void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Reset instruction counter in each basic block. CurInstr = 0; @@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Try to coalesce live-out registers from predecessors. for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), pe = MBB->pred_end(); pi != pe; ++pi) { - LiveOutMap::const_iterator fi = LiveOuts.find(*pi); - if (fi == LiveOuts.end()) { - SeenUnknownBackEdge = true; + auto fi = MBBInfos.find(*pi); + assert(fi != MBBInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second.OutRegs; + // Incoming is null if this is a backedge from a BB + // we haven't processed yet + if (Incoming == nullptr) { continue; } - assert(fi->second && "Can't have NULL entries"); for (unsigned rx = 0; rx != NumRegs; ++rx) { // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); + LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - DomainValue *pdv = resolve(fi->second[rx].Value); + DomainValue *pdv = resolve(Incoming[rx].Value); if (!pdv) continue; if (!LiveRegs[rx].Value) { @@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { force(rx, pdv->getFirstDomain()); } } - DEBUG(dbgs() << "BB#" << MBB->getNumber() - << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } -void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { +void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { assert(LiveRegs && "Must enter basic block first."); - // Save live registers at end of MBB - used by enterBasicBlock(). - // Also use LiveOuts as a visited set to detect back-edges. - bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; - - if (First) { - // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to - // the end of this block instead of the beginning. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - } else { - // Insertion failed, this must be the second pass. + LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBInfos[MBB].OutRegs = LiveRegs; + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. 
However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def -= CurInstr; + if (OldOutRegs) { + // This must be the second pass. // Release all the DomainValues instead of keeping them. for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(LiveRegs[i].Value); - delete[] LiveRegs; + release(OldOutRegs[i].Value); + delete[] OldOutRegs; } LiveRegs = nullptr; } -void ExeDepsFix::visitInstr(MachineInstr *MI) { - if (MI->isDebugValue()) - return; - +bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { // Update instructions with explicit execution domains. std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI); if (DomP.first) { @@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { visitHardInstr(MI, DomP.first); } - // Process defs to track register ages, and kill values clobbered by generic - // instructions. - processDefs(MI, !DomP.first); + return !DomP.first; } /// \brief Helps avoid false dependencies on undef registers by updating the /// machine instructions' undef operand to use a register that the instruction /// is truly dependent on, or use a register with clearance higher than Pref. -void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, + unsigned OpIdx, unsigned Pref) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); @@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update only undef operands that are mapped to one register. if (AliasMap[OriginalReg].size() != 1) - return; + return false; // Get the undef operand's register class const TargetRegisterClass *OpRC = @@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // We found a true dependency - replace the undef register with the true // dependency. MO.setReg(CurrMO.getReg()); - return; + return true; } // Go over all registers in the register class and find the register with @@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // Update the operand if we found a register with better clearance. if (MaxClearanceReg != OriginalReg) MO.setReg(MaxClearanceReg); + + return false; } /// \brief Return true if it makes sense to break dependence on a partial def /// or undef use. -bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { unsigned reg = MI->getOperand(OpIdx).getReg(); for (int rx : regIndices(reg)) { unsigned Clearance = CurInstr - LiveRegs[rx].Def; @@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, DEBUG(dbgs() << ": Break dependency.\n"); continue; } - // The current clearance seems OK, but we may be ignoring a def from a - // back-edge. - if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { - DEBUG(dbgs() << ": OK .\n"); - return false; - } - // A def from an unprocessed back-edge may make us break this dependency. 
- DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); + DEBUG(dbgs() << ": OK .\n"); return false; } return true; @@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, // If Kill is set, also kill off DomainValues clobbered by the defs. // // Also break dependencies on partial defs and undef uses. -void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { +void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, + bool Kill) { assert(!MI->isDebugValue() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. unsigned OpNum; - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); + if (breakDependency) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); + // We don't need to bother trying to break a dependency if this + // instruction has a true dependency on that register through another + // operand - we'll have to wait for it to be available regardless. + if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } } const MCInstrDesc &MCID = MI->getDesc(); for (unsigned i = 0, @@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr << '\t' << *MI); - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); + if (breakDependency) { + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } // How many instructions since rx was last written? LiveRegs[rx].Def = CurInstr; @@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { /// only do it on demand. Note that the occurrence of undefined register reads /// that should be broken is very rare, but when they occur we may have many in /// a single block. -void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { +void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { if (UndefReads.empty()) return; @@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { // A hard instruction only works in one domain. All input registers will be // forced into that domain. -void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { +void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { // Collapse all uses. for (unsigned i = mi->getDesc().getNumDefs(), e = mi->getDesc().getNumOperands(); i != e; ++i) { @@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { } // A soft instruction can be changed to work in other domains given by mask. -void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { +void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { // Bitmask of available domains for this instruction after taking collapsed // operands into account. 
unsigned available = mask;
@@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   }
 }
 
-bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB,
+                                         bool PrimaryPass) {
+  enterBasicBlock(MBB);
+  // If this block is not done, it makes little sense to make any decisions
+  // based on clearance information. We need to make a second pass anyway,
+  // and by then we'll have better information, so we can avoid doing the work
+  // to try and break dependencies now.
+  bool breakDependency = isBlockDone(MBB);
+  for (MachineInstr &MI : *MBB) {
+    if (!MI.isDebugValue()) {
+      bool Kill = false;
+      if (PrimaryPass)
+        Kill = visitInstr(&MI);
+      processDefs(&MI, breakDependency, Kill);
+    }
+  }
+  if (breakDependency)
+    processUndefReads(MBB);
+  leaveBasicBlock(MBB);
+}
+
+bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) {
+  return MBBInfos[MBB].PrimaryCompleted &&
+         MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
+         MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
+}
+
+bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {
   if (skipFunction(*mf.getFunction()))
     return false;
   MF = &mf;
@@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
       AliasMap[*AI].push_back(i);
   }
 
+  // Initialize the MBBInfos
+  for (auto &MBB : mf) {
+    MBBInfo InitialInfo;
+    MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
+  }
+
+  /*
+   * We want to visit every instruction in every basic block in order to update
+   * its execution domain or break any false dependencies. However, for the
+   * dependency breaking, we need to know clearances from all predecessors
+   * (including any backedges). One way to do so would be to do two complete
+   * passes over all basic blocks/instructions, the first for recording
+   * clearances, the second to break the dependencies. However, for functions
+   * without backedges, or functions with a lot of straight-line code, and
+   * a small loop, that would be a lot of unnecessary work (since only the
+   * BBs that are part of the loop require two passes). As an example,
+   * consider the following loop.
+   *
+   *
+   * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
+   *       ^                                  |
+   *       +----------------------------------+
+   *
+   * The iteration order is as follows:
+   * Naive:     PH A B C D A' B' C' D'
+   * Optimized: PH A B C A' B' C' D
+   *
+   * Note that we avoid processing D twice, because we can entirely process
+   * the predecessors before getting to D. We call a block that is ready
+   * for its second round of processing `done` (isBlockDone). Once we finish
+   * processing some block, we update the counters in MBBInfos and re-process
+   * any successors that are now done.
+   */
+
   MachineBasicBlock *Entry = &*MF->begin();
   ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
-  SmallVector<MachineBasicBlock*, 16> Loops;
+  SmallVector<MachineBasicBlock *, 4> Workqueue;
   for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
          MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
     MachineBasicBlock *MBB = *MBBI;
-    enterBasicBlock(MBB);
-    if (SeenUnknownBackEdge)
-      Loops.push_back(MBB);
-    for (MachineInstr &MI : *MBB)
-      visitInstr(&MI);
-    processUndefReads(MBB);
-    leaveBasicBlock(MBB);
+    // N.B: IncomingProcessed and IncomingCompleted were already updated while
+    // processing this block's predecessors.
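
[Editorial aside, not part of the patch] The counter protocol described in the comment above, reduced to a self-contained sketch (struct and function names are illustrative):

    struct BlockCounters {
      unsigned IncomingProcessed = 0; // predecessor edges seen at least once
      unsigned IncomingCompleted = 0; // predecessor edges whose info is final
      unsigned PrimaryIncoming = 0;   // IncomingProcessed at primary-pass time
      bool PrimaryCompleted = false;  // primary pass has run on this block
    };
    // Mirrors ExecutionDepsFix::isBlockDone: ready for its second round once
    // the primary pass ran, nothing new arrived since, and every predecessor
    // (including back-edges) has been processed.
    static bool isDone(const BlockCounters &C, unsigned NumPreds) {
      return C.PrimaryCompleted &&
             C.IncomingCompleted == C.PrimaryIncoming &&
             C.IncomingProcessed == NumPreds;
    }
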
+ MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + bool Primary = true; + Workqueue.push_back(MBB); + while (!Workqueue.empty()) { + MachineBasicBlock *ActiveMBB = &*Workqueue.back(); + Workqueue.pop_back(); + processBasicBlock(ActiveMBB, Primary); + bool Done = isBlockDone(ActiveMBB); + for (auto *Succ : ActiveMBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + Workqueue.push_back(Succ); + } + } + } + Primary = false; + } } - // Visit all the loop blocks again in order to merge DomainValues from - // back-edges. - for (MachineBasicBlock *MBB : Loops) { - enterBasicBlock(MBB); - for (MachineInstr &MI : *MBB) - if (!MI.isDebugValue()) - processDefs(&MI, false); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + processBasicBlock(MBB, false); + // Don't update successors here. We'll get to them anyway through this + // loop. + } } // Clear the LiveOuts vectors and collapse any remaining DomainValues. for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); - if (FI == LiveOuts.end() || !FI->second) + auto FI = MBBInfos.find(*MBBI); + if (FI == MBBInfos.end() || !FI->second.OutRegs) continue; for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second[i].Value) - release(FI->second[i].Value); - delete[] FI->second; + if (FI->second.OutRegs[i].Value) + release(FI->second.OutRegs[i].Value); + delete[] FI->second.OutRegs; } - LiveOuts.clear(); + MBBInfos.clear(); UndefReads.clear(); Avail.clear(); Allocator.DestroyAll(); return false; } - -FunctionPass * -llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) { - return new ExeDepsFix(RC); -} diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp index 0ec79c2e69f9..88d422a0f545 100644 --- a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp @@ -41,7 +41,7 @@ namespace { char ExpandISelPseudos::ID = 0; char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID; -INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos", +INITIALIZE_PASS(ExpandISelPseudos, DEBUG_TYPE, "Expand ISel Pseudo-instructions", false, false) bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index ab2382e2db6d..27cd639b2a49 100644 --- a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -58,7 +58,7 @@ private: char ExpandPostRA::ID = 0; char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; -INITIALIZE_PASS(ExpandPostRA, "postrapseudos", +INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE, "Post-RA pseudo instruction expansion pass", false, false) /// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered @@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { 
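
[Editorial aside, not part of the patch] The LowerCopy hunk below grows a second degenerate case. Restated compactly with simplified types and invented names: an identity copy or a copy from an undef source moves no data, so it is erased outright, or downgraded to a KILL when liveness still needs to be represented:

    enum class CopyLowering { Erase, Kill, Expand };
    static CopyLowering classify(unsigned Dst, unsigned Src, bool SrcUndef,
                                 unsigned NumOperands) {
      if (Src != Dst && !SrcUndef)
        return CopyLowering::Expand;        // a real copy: lower it normally
      return (SrcUndef || NumOperands > 2)  // undef source or extra operands
                 ? CopyLowering::Kill
                 : CopyLowering::Erase;
    }
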
MachineOperand &DstMO = MI->getOperand(0); MachineOperand &SrcMO = MI->getOperand(1); - if (SrcMO.getReg() == DstMO.getReg()) { - DEBUG(dbgs() << "identity copy: " << *MI); + bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg()); + if (IdentityCopy || SrcMO.isUndef()) { + DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI); // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. if (SrcMO.isUndef() || MI->getNumOperands() > 2) { diff --git a/contrib/llvm/lib/CodeGen/ExpandReductions.cpp b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp new file mode 100644 index 000000000000..a40ea28056dd --- /dev/null +++ b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp @@ -0,0 +1,167 @@ +//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for reduction intrinsics, allowing targets +// to enable the experimental intrinsics until just before codegen. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/ExpandReductions.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +unsigned getOpcode(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + return Instruction::FAdd; + case Intrinsic::experimental_vector_reduce_fmul: + return Instruction::FMul; + case Intrinsic::experimental_vector_reduce_add: + return Instruction::Add; + case Intrinsic::experimental_vector_reduce_mul: + return Instruction::Mul; + case Intrinsic::experimental_vector_reduce_and: + return Instruction::And; + case Intrinsic::experimental_vector_reduce_or: + return Instruction::Or; + case Intrinsic::experimental_vector_reduce_xor: + return Instruction::Xor; + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + return Instruction::ICmp; + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + return Instruction::FCmp; + default: + llvm_unreachable("Unexpected ID"); + } +} + +RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_smax: + return RecurrenceDescriptor::MRK_SIntMax; + case Intrinsic::experimental_vector_reduce_smin: + return RecurrenceDescriptor::MRK_SIntMin; + case Intrinsic::experimental_vector_reduce_umax: + return RecurrenceDescriptor::MRK_UIntMax; + case Intrinsic::experimental_vector_reduce_umin: + return RecurrenceDescriptor::MRK_UIntMin; + case Intrinsic::experimental_vector_reduce_fmax: + return RecurrenceDescriptor::MRK_FloatMax; + case Intrinsic::experimental_vector_reduce_fmin: + return RecurrenceDescriptor::MRK_FloatMin; + default: + return RecurrenceDescriptor::MRK_Invalid; + } +} + +bool expandReductions(Function &F, const TargetTransformInfo 
*TTI) { + bool Changed = false; + SmallVector<IntrinsicInst*, 4> Worklist; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (auto II = dyn_cast<IntrinsicInst>(&*I)) + Worklist.push_back(II); + + for (auto *II : Worklist) { + IRBuilder<> Builder(II); + Value *Vec = nullptr; + auto ID = II->getIntrinsicID(); + auto MRK = RecurrenceDescriptor::MRK_Invalid; + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + // FMFs must be attached to the call, otherwise it's an ordered reduction + // and it can't be handled by generating this shuffle sequence. + // TODO: Implement scalarization of ordered reductions here for targets + // without native support. + if (!II->getFastMathFlags().unsafeAlgebra()) + continue; + Vec = II->getArgOperand(1); + break; + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + Vec = II->getArgOperand(0); + MRK = getMRK(ID); + break; + default: + continue; + } + if (!TTI->shouldExpandReduction(II)) + continue; + auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + II->replaceAllUsesWith(Rdx); + II->eraseFromParent(); + Changed = true; + } + return Changed; +} + +class ExpandReductions : public FunctionPass { +public: + static char ID; + ExpandReductions() : FunctionPass(ID) { + initializeExpandReductionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return expandReductions(F, TTI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + } +}; +} + +char ExpandReductions::ID; +INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions", + "Expand reduction intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(ExpandReductions, "expand-reductions", + "Expand reduction intrinsics", false, false) + +FunctionPass *llvm::createExpandReductionsPass() { + return new ExpandReductions(); +} + +PreservedAnalyses ExpandReductionsPass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &TTI = AM.getResult<TargetIRAnalysis>(F); + if (!expandReductions(F, &TTI)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/contrib/llvm/lib/CodeGen/FEntryInserter.cpp b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp new file mode 100644 index 000000000000..0759bf6713e0 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp @@ -0,0 +1,55 @@ +//===-- FEntryInsertion.cpp - Patchable prologues for LLVM -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file edits function bodies to insert fentry calls. 
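
[Editorial aside, not part of the patch] ExpandReductions above hands each intrinsic to getShuffleReduction, which emits a log2(N) ladder of shuffles plus vector ops. The same dataflow in scalar C++, for the add case (power-of-two lane count assumed; this is a stand-in, not the LoopUtils helper):

    #include <array>
    #include <cstddef>
    template <typename T, std::size_t N>
    static T shuffleReduceAdd(std::array<T, N> V) {
      static_assert((N & (N - 1)) == 0, "power-of-two lane count assumed");
      // Each iteration models one shufflevector + vector add: fold the high
      // half into the low half until a single lane remains.
      for (std::size_t Half = N / 2; Half >= 1; Half /= 2)
        for (std::size_t Lane = 0; Lane != Half; ++Lane)
          V[Lane] += V[Lane + Half];
      return V[0];
    }
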
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +namespace { +struct FEntryInserter : public MachineFunctionPass { + static char ID; // Pass identification, replacement for typeid + FEntryInserter() : MachineFunctionPass(ID) { + initializeFEntryInserterPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) { + const std::string FEntryName = + MF.getFunction()->getFnAttribute("fentry-call").getValueAsString(); + if (FEntryName != "true") + return false; + + auto &FirstMBB = *MF.begin(); + auto &FirstMI = *FirstMBB.begin(); + + auto *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), + TII->get(TargetOpcode::FENTRY_CALL)); + return true; +} + +char FEntryInserter::ID = 0; +char &llvm::FEntryInserterID = FEntryInserter::ID; +INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false, + false) diff --git a/contrib/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm/lib/CodeGen/FaultMaps.cpp index 2acafafdb9fc..43f364128978 100644 --- a/contrib/llvm/lib/CodeGen/FaultMaps.cpp +++ b/contrib/llvm/lib/CodeGen/FaultMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- FaultMaps.cpp ---------------------------===// +//===- FaultMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/FaultMaps.h" - +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel, } } - const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) { switch (FT) { default: llvm_unreachable("unhandled fault type!"); - case FaultMaps::FaultingLoad: return "FaultingLoad"; + case FaultMaps::FaultingLoadStore: + return "FaultingLoadStore"; + case FaultMaps::FaultingStore: + return "FaultingStore"; } } diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp index d61afad4db57..0bdd5e64a7f2 100644 --- a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp +++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp @@ -37,7 +37,7 @@ public: char FuncletLayout::ID = 0; char &llvm::FuncletLayoutID = FuncletLayout::ID; -INITIALIZE_PASS(FuncletLayout, "funclet-layout", +INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE, "Contiguously Lay Out Funclets", false, false) bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { diff --git a/contrib/llvm/lib/CodeGen/GCStrategy.cpp b/contrib/llvm/lib/CodeGen/GCStrategy.cpp index 31ab86fdf276..6be4c16c6301 100644 --- a/contrib/llvm/lib/CodeGen/GCStrategy.cpp +++ 
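
[Editorial aside, not part of the patch] FEntryInserter above fires only when the function carries the string attribute "fentry-call"="true" (Clang sets this under -mfentry). A hypothetical helper showing how a function would be tagged; the helper name is invented:

    #include "llvm/IR/Function.h"
    // After this, FEntryInserter emits FENTRY_CALL (a call to __fentry__)
    // ahead of the first instruction in F's entry block.
    static void requestFEntry(llvm::Function &F) {
      F.addFnAttr("fentry-call", "true");
    }
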
b/contrib/llvm/lib/CodeGen/GCStrategy.cpp @@ -1,4 +1,4 @@ -//===-- GCStrategy.cpp - Garbage Collector Description --------------------===// +//===- GCStrategy.cpp - Garbage Collector Description ---------------------===// // // The LLVM Compiler Infrastructure // @@ -18,7 +18,4 @@ using namespace llvm; LLVM_INSTANTIATE_REGISTRY(GCRegistry) -GCStrategy::GCStrategy() - : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false), - CustomWriteBarriers(false), CustomRoots(false), InitRoots(true), - UsesMetadata(false) {} +GCStrategy::GCStrategy() = default; diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 13212212fa01..be0c5c2bb70e 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -24,40 +24,42 @@ using namespace llvm; bool CallLowering::lowerCall( - MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg, + MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg, ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const { - auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout(); + auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout(); // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that // we'll pass to the assigner function. SmallVector<ArgInfo, 8> OrigArgs; unsigned i = 0; - for (auto &Arg : CI.arg_operands()) { - ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}}; - setArgFlags(OrigArg, i + 1, DL, CI); + unsigned NumFixedArgs = CS.getFunctionType()->getNumParams(); + for (auto &Arg : CS.args()) { + ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, + i < NumFixedArgs}; + setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS); OrigArgs.push_back(OrigArg); ++i; } MachineOperand Callee = MachineOperand::CreateImm(0); - if (Function *F = CI.getCalledFunction()) + if (const Function *F = CS.getCalledFunction()) Callee = MachineOperand::CreateGA(F, 0); else Callee = MachineOperand::CreateReg(GetCalleeReg(), false); - ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}}; + ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}}; if (!OrigRet.Ty->isVoidTy()) - setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI); + setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS); - return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs); + return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs); } template <typename FuncInfoTy> void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const { - const AttributeSet &Attrs = FuncInfo.getAttributes(); + const AttributeList &Attrs = FuncInfo.getAttributes(); if (Attrs.hasAttribute(OpIdx, Attribute::ZExt)) Arg.Flags.setZExt(); if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) @@ -81,8 +83,8 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. 
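
[Editorial aside, not part of the patch] The byval handling just below picks a frame alignment with a simple fallback. The rule in isolation (illustrative name; zero means the frontend recorded nothing):

    static unsigned byValFrameAlign(unsigned ParamAlign, unsigned TargetGuess) {
      return ParamAlign ? ParamAlign : TargetGuess; // FE value wins if present
    }
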
unsigned FrameAlign; - if (FuncInfo.getParamAlignment(OpIdx)) - FrameAlign = FuncInfo.getParamAlignment(OpIdx); + if (FuncInfo.getParamAlignment(OpIdx - 2)) + FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2); else FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL); Arg.Flags.setByValAlign(FrameAlign); @@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx, const CallInst &FuncInfo) const; bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, - CCAssignFn *AssignFn, ArrayRef<ArgInfo> Args, ValueHandler &Handler) const { MachineFunction &MF = MIRBuilder.getMF(); @@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder, unsigned NumArgs = Args.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT CurVT = MVT::getVT(Args[i].Ty); - if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo)) + if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) return false; } - for (unsigned i = 0, e = Args.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) { + assert(j < ArgLocs.size() && "Skipped too many arg locs"); + + CCValAssign &VA = ArgLocs[j]; + assert(VA.getValNo() == i && "Location doesn't correspond to current arg"); + + if (VA.needsCustom()) { + j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j)); + continue; + } if (VA.isRegLoc()) Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA); diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp index fcd2722f1c2f..29d1209bb02a 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp @@ -26,6 +26,7 @@ void llvm::initializeGlobalISel(PassRegistry &Registry) { void llvm::initializeGlobalISel(PassRegistry &Registry) { initializeIRTranslatorPass(Registry); initializeLegalizerPass(Registry); + initializeLocalizerPass(Registry); initializeRegBankSelectPass(Registry); initializeInstructionSelectPass(Registry); } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 89a042ffc477..afc18a15aa1c 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -12,7 +12,10 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" @@ -21,11 +24,13 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" @@ -40,11 +45,21 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) -static void reportTranslationError(const Value &V, const Twine &Message) { - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ": " << V << 
'\n'; - report_fatal_error(Err.str()); +static void reportTranslationError(MachineFunction &MF, + const TargetPassConfig &TPC, + OptimizationRemarkEmitter &ORE, + OptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. + if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + ORE.emit(R); } IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { @@ -59,28 +74,31 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { unsigned IRTranslator::getOrCreateVReg(const Value &Val) { unsigned &ValReg = ValToVReg[&Val]; - // Check if this is the first time we see Val. - if (!ValReg) { - // Fill ValRegsSequence with the sequence of registers - // we need to concat together to produce the value. - assert(Val.getType()->isSized() && - "Don't know how to create an empty vreg"); - unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL}); - ValReg = VReg; - - if (auto CV = dyn_cast<Constant>(&Val)) { - bool Success = translate(*CV, VReg); - if (!Success) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return VReg; - } - reportTranslationError(Val, "unable to translate constant"); - } + + if (ValReg) + return ValReg; + + // Fill ValRegsSequence with the sequence of registers + // we need to concat together to produce the value. + assert(Val.getType()->isSized() && + "Don't know how to create an empty vreg"); + unsigned VReg = + MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL)); + ValReg = VReg; + + if (auto CV = dyn_cast<Constant>(&Val)) { + bool Success = translate(*CV, VReg); + if (!Success) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + MF->getFunction()->getSubprogram(), + &MF->getFunction()->getEntryBlock()); + R << "unable to translate constant: " << ore::NV("Type", Val.getType()); + reportTranslationError(*MF, *TPC, *ORE, R); + return VReg; } } - return ValReg; + + return VReg; } int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { @@ -112,28 +130,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) { Alignment = LI->getAlignment(); ValTy = LI->getType(); - } else if (!TPC->isGlobalISelAbortEnabled()) { - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + } else { + OptimizationRemarkMissed R("gisel-irtranslator", "", &I); + R << "unable to translate memop: " << ore::NV("Opcode", &I); + reportTranslationError(*MF, *TPC, *ORE, R); return 1; - } else - llvm_unreachable("unhandled memory instruction"); + } return Alignment ? 
Alignment : DL->getABITypeAlignment(ValTy); } -MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { +MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; - if (!MBB) { - MBB = MF->CreateMachineBasicBlock(&BB); - MF->push_back(MBB); - - if (BB.hasAddressTaken()) - MBB->setHasAddressTaken(); - } + assert(MBB && "BasicBlock was not encountered before"); return *MBB; } +void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { + assert(NewPred && "new predecessor must be a real MachineBasicBlock"); + MachinePreds[Edge].push_back(NewPred); +} + bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: handle signed/unsigned wrapping flags. @@ -149,6 +166,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, return true; } +bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { + // -0.0 - X --> G_FNEG + if (isa<Constant>(U.getOperand(0)) && + U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) { + MIRBuilder.buildInstr(TargetOpcode::G_FNEG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(1))); + return true; + } + return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); +} + bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { const CmpInst *CI = dyn_cast<CmpInst>(&U); @@ -158,9 +187,14 @@ bool IRTranslator::translateCompare(const User &U, CmpInst::Predicate Pred = CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>( cast<ConstantExpr>(U).getPredicate()); - if (CmpInst::isIntPredicate(Pred)) MIRBuilder.buildICmp(Pred, Res, Op0, Op1); + else if (Pred == CmpInst::FCMP_FALSE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); + else if (Pred == CmpInst::FCMP_TRUE) + MIRBuilder.buildCopy( + Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); else MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); @@ -183,18 +217,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { // We want a G_BRCOND to the true BB followed by an unconditional branch. unsigned Tst = getOrCreateVReg(*BrInst.getCondition()); const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++)); - MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt); + MachineBasicBlock &TrueBB = getMBB(TrueTgt); MIRBuilder.buildBrCond(Tst, TrueBB); } const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ)); - MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt); - MIRBuilder.buildBr(TgtBB); + MachineBasicBlock &TgtBB = getMBB(BrTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the unconditional target is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); // Link successors. 
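
[Editorial aside, not part of the patch] The translateBr change above suppresses the trailing G_BR when control simply falls through. The test in isolation (invented function name around a real MachineBasicBlock query):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    static bool needsExplicitBranch(const llvm::MachineBasicBlock &Cur,
                                    const llvm::MachineBasicBlock &Tgt) {
      // Only emit an unconditional branch when Tgt is not the next block
      // in layout order; otherwise execution falls through for free.
      return !Cur.isLayoutSuccessor(&Tgt);
    }
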
- MachineBasicBlock &CurBB = MIRBuilder.getMBB(); for (const BasicBlock *Succ : BrInst.successors()) - CurBB.addSuccessor(&getOrCreateBB(*Succ)); + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -209,30 +246,52 @@ bool IRTranslator::translateSwitch(const User &U, const SwitchInst &SwInst = cast<SwitchInst>(U); const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + const BasicBlock *OrigBB = SwInst.getParent(); - LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL); for (auto &CaseIt : SwInst.cases()) { const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); - MachineBasicBlock &CurBB = MIRBuilder.getMBB(); - MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + const BasicBlock *TrueBB = CaseIt.getCaseSuccessor(); + MachineBasicBlock &TrueMBB = getMBB(*TrueBB); - MIRBuilder.buildBrCond(Tst, TrueBB); - CurBB.addSuccessor(&TrueBB); + MIRBuilder.buildBrCond(Tst, TrueMBB); + CurMBB.addSuccessor(&TrueMBB); + addMachineCFGPred({OrigBB, TrueBB}, &CurMBB); - MachineBasicBlock *FalseBB = + MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(SwInst.getParent()); - MF->push_back(FalseBB); - MIRBuilder.buildBr(*FalseBB); - CurBB.addSuccessor(FalseBB); + // Insert the comparison blocks one after the other. + MF->insert(std::next(CurMBB.getIterator()), FalseMBB); + MIRBuilder.buildBr(*FalseMBB); + CurMBB.addSuccessor(FalseMBB); - MIRBuilder.setMBB(*FalseBB); + MIRBuilder.setMBB(*FalseMBB); } // handle default case - MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); - MIRBuilder.buildBr(DefaultBB); - MIRBuilder.getMBB().addSuccessor(&DefaultBB); + const BasicBlock *DefaultBB = SwInst.getDefaultDest(); + MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB); + MIRBuilder.buildBr(DefaultMBB); + MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); + CurMBB.addSuccessor(&DefaultMBB); + addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB); + + return true; +} + +bool IRTranslator::translateIndirectBr(const User &U, + MachineIRBuilder &MIRBuilder) { + const IndirectBrInst &BrInst = cast<IndirectBrInst>(U); + + const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress()); + MIRBuilder.buildBrIndirect(Tgt); + + // Link successors. + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + for (const BasicBlock *Succ : BrInst.successors()) + CurBB.addSuccessor(&getMBB(*Succ)); return true; } @@ -240,47 +299,38 @@ bool IRTranslator::translateSwitch(const User &U, bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast<LoadInst>(U); - if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic()) - return false; - - assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment"); auto Flags = LI.isVolatile() ? 
MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOLoad; unsigned Res = getOrCreateVReg(LI); unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); - LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL}; + MIRBuilder.buildLoad( Res, Addr, *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), Flags, DL->getTypeStoreSize(LI.getType()), - getMemOpAlignment(LI))); + getMemOpAlignment(LI), AAMDNodes(), nullptr, + LI.getSynchScope(), LI.getOrdering())); return true; } bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { const StoreInst &SI = cast<StoreInst>(U); - - if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic()) - return false; - - assert(!SI.isAtomic() && "only non-atomic stores supported at the moment"); auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOStore; unsigned Val = getOrCreateVReg(*SI.getValueOperand()); unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); - LLT VTy{*SI.getValueOperand()->getType(), *DL}, - PTy{*SI.getPointerOperand()->getType(), *DL}; MIRBuilder.buildStore( Val, Addr, *MF->getMachineMemOperand( MachinePointerInfo(SI.getPointerOperand()), Flags, DL->getTypeStoreSize(SI.getValueOperand()->getType()), - getMemOpAlignment(SI))); + getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSynchScope(), + SI.getOrdering())); return true; } @@ -290,6 +340,15 @@ bool IRTranslator::translateExtractValue(const User &U, Type *Int32Ty = Type::getInt32Ty(U.getContext()); SmallVector<Value *, 1> Indices; + // If Src is a single element ConstantStruct, translate extractvalue + // to that element to avoid inserting a cast instruction. + if (auto CS = dyn_cast<ConstantStruct>(Src)) + if (CS->getNumOperands() == 1) { + unsigned Res = getOrCreateVReg(*CS->getOperand(0)); + ValToVReg[&U] = Res; + return true; + } + // getIndexedOffsetInType is designed for GEPs, so the first index is the // usual array element rather than looking into the actual aggregate. 
Indices.push_back(ConstantInt::get(Int32Ty, 0)); @@ -305,7 +364,7 @@ bool IRTranslator::translateExtractValue(const User &U, uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); - MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src)); + MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset); return true; } @@ -331,29 +390,36 @@ bool IRTranslator::translateInsertValue(const User &U, uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); - const Value &Inserted = *U.getOperand(1); - MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), getOrCreateVReg(Inserted), - Offset); + unsigned Inserted = getOrCreateVReg(*U.getOperand(1)); + MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), Inserted, Offset); return true; } bool IRTranslator::translateSelect(const User &U, MachineIRBuilder &MIRBuilder) { - MIRBuilder.buildSelect(getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)), - getOrCreateVReg(*U.getOperand(1)), - getOrCreateVReg(*U.getOperand(2))); + unsigned Res = getOrCreateVReg(U); + unsigned Tst = getOrCreateVReg(*U.getOperand(0)); + unsigned Op0 = getOrCreateVReg(*U.getOperand(1)); + unsigned Op1 = getOrCreateVReg(*U.getOperand(2)); + MIRBuilder.buildSelect(Res, Tst, Op0, Op1); return true; } bool IRTranslator::translateBitCast(const User &U, MachineIRBuilder &MIRBuilder) { - if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) { + // If we're bitcasting to the source type, we can reuse the source vreg. + if (getLLTForType(*U.getOperand(0)->getType(), *DL) == + getLLTForType(*U.getType(), *DL)) { + // Get the source vreg now, to avoid invalidating ValToVReg. + unsigned SrcReg = getOrCreateVReg(*U.getOperand(0)); unsigned &Reg = ValToVReg[&U]; + // If we already assigned a vreg for this bitcast, we can't change that. + // Emit a copy to satisfy the users we already emitted. 
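
[Editorial aside, not part of the patch] The no-op bitcast handling below has a subtle ordering issue worth spelling out: if a user of the bitcast was translated first, a vreg already exists for it and must be fed by a COPY; only otherwise can the source vreg be aliased directly. A sketch with a std::function standing in for the builder:

    #include <functional>
    static unsigned translateNoopCast(
        unsigned &DstVReg, unsigned SrcVReg,
        const std::function<void(unsigned, unsigned)> &BuildCopy) {
      if (DstVReg)
        BuildCopy(DstVReg, SrcVReg); // satisfy users emitted earlier
      else
        DstVReg = SrcVReg;           // pure aliasing: no instruction needed
      return DstVReg;
    }
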
if (Reg) - MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0))); + MIRBuilder.buildCopy(Reg, SrcReg); else - Reg = getOrCreateVReg(*U.getOperand(0)); + Reg = SrcReg; return true; } return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); @@ -375,9 +441,10 @@ bool IRTranslator::translateGetElementPtr(const User &U, Value &Op0 = *U.getOperand(0); unsigned BaseReg = getOrCreateVReg(Op0); - LLT PtrTy{*Op0.getType(), *DL}; - unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace()); - LLT OffsetTy = LLT::scalar(PtrSize); + Type *PtrIRTy = Op0.getType(); + LLT PtrTy = getLLTForType(*PtrIRTy, *DL); + Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); + LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); int64_t Offset = 0; for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U); @@ -399,8 +466,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, if (Offset != 0) { unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg); BaseReg = NewBaseReg; @@ -408,8 +475,8 @@ bool IRTranslator::translateGetElementPtr(const User &U, } // N = N + Idx * ElementSize; - unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(ElementSizeReg, ElementSize); + unsigned ElementSizeReg = + getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize)); unsigned IdxReg = getOrCreateVReg(*Idx); if (MRI->getType(IdxReg) != OffsetTy) { @@ -428,8 +495,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, } if (Offset != 0) { - unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); - MIRBuilder.buildConstant(OffsetReg, Offset); + unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg); return true; } @@ -438,13 +504,12 @@ bool IRTranslator::translateGetElementPtr(const User &U, return true; } -bool IRTranslator::translateMemcpy(const CallInst &CI, - MachineIRBuilder &MIRBuilder) { - LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL}; - if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() != - 0 || - cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() != - 0 || +bool IRTranslator::translateMemfunc(const CallInst &CI, + MachineIRBuilder &MIRBuilder, + unsigned ID) { + LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); + Type *DstTy = CI.getArgOperand(0)->getType(); + if (cast<PointerType>(DstTy)->getAddressSpace() != 0 || SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0)) return false; @@ -454,14 +519,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI, Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); } - MachineOperand Callee = MachineOperand::CreateES("memcpy"); + const char *Callee; + switch (ID) { + case Intrinsic::memmove: + case Intrinsic::memcpy: { + Type *SrcTy = CI.getArgOperand(1)->getType(); + if(cast<PointerType>(SrcTy)->getAddressSpace() != 0) + return false; + Callee = ID == Intrinsic::memcpy ? 
"memcpy" : "memmove"; + break; + } + case Intrinsic::memset: + Callee = "memset"; + break; + default: + return false; + } - return CLI->lowerCall(MIRBuilder, Callee, + return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), + MachineOperand::CreateES(Callee), CallLowering::ArgInfo(0, CI.getType()), Args); } void IRTranslator::getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF)); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD); MIB.addDef(DstReg); @@ -482,7 +565,7 @@ void IRTranslator::getStackGuard(unsigned DstReg, bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder) { - LLT Ty{*CI.getOperand(0)->getType(), *DL}; + LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL); LLT s1 = LLT::scalar(1); unsigned Width = Ty.getSizeInBits(); unsigned Res = MRI->createGenericVirtualRegister(Ty); @@ -494,8 +577,8 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, .addUse(getOrCreateVReg(*CI.getOperand(1))); if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) { - unsigned Zero = MRI->createGenericVirtualRegister(s1); - EntryBuilder.buildConstant(Zero, 0); + unsigned Zero = getOrCreateVReg( + *Constant::getNullValue(Type::getInt1Ty(CI.getContext()))); MIB.addUse(Zero); } @@ -508,12 +591,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, switch (ID) { default: break; - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - // FIXME: these obviously need to be supported properly. - MF->getProperties().set( - MachineFunctionProperties::Property::FailedISel); + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // Stack coloring is not enabled in O0 (which we care about now) so we can + // drop these. Make sure someone notices when we start compiling at higher + // opts though. + if (MF->getTarget().getOptLevel() != CodeGenOpt::None) + return false; + return true; + case Intrinsic::dbg_declare: { + const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI); + assert(DI.getVariable() && "Missing variable"); + + const Value *Address = DI.getAddress(); + if (!Address || isa<UndefValue>(Address)) { + DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); + return true; + } + + assert(DI.getVariable()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + auto AI = dyn_cast<AllocaInst>(Address); + if (AI && AI->isStaticAlloca()) { + // Static allocas are tracked at the MF level, no need for DBG_VALUE + // instructions (in fact, they get ignored if they *do* exist). + MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), + getOrCreateFrameIndex(*AI), DI.getDebugLoc()); + } else + MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address), + DI.getVariable(), DI.getExpression()); + return true; + } + case Intrinsic::vaend: + // No target I know of cares about va_end. Certainly no in-tree target + // does. Simplest intrinsic ever! 
return true;
+  case Intrinsic::vastart: {
+    auto &TLI = *MF->getSubtarget().getTargetLowering();
+    Value *Ptr = CI.getArgOperand(0);
+    unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+
+    MIRBuilder.buildInstr(TargetOpcode::G_VASTART)
+        .addUse(getOrCreateVReg(*Ptr))
+        .addMemOperand(MF->getMachineMemOperand(
+            MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0));
+    return true;
+  }
+  case Intrinsic::dbg_value: {
+    // This form of DBG_VALUE is target-independent.
+    const DbgValueInst &DI = cast<DbgValueInst>(CI);
+    const Value *V = DI.getValue();
+    assert(DI.getVariable()->isValidLocationForIntrinsic(
+               MIRBuilder.getDebugLoc()) &&
+           "Expected inlined-at fields to agree");
+    if (!V) {
+      // Currently the optimizer can produce this; insert an undef to
+      // help debugging. Probably the optimizer should not do this.
+      MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(),
+                                       DI.getExpression());
+    } else if (const auto *CI = dyn_cast<Constant>(V)) {
+      MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(),
+                                    DI.getExpression());
+    } else {
+      unsigned Reg = getOrCreateVReg(*V);
+      // FIXME: This does not handle register-indirect values at offset 0. The
+      // direct/indirect thing shouldn't really be handled by something as
+      // implicit as reg+noreg vs reg+imm in the first place, but it seems
+      // pretty baked in right now.
+      if (DI.getOffset() != 0)
+        MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(),
+                                         DI.getExpression());
+      else
+        MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(),
+                                       DI.getExpression());
+    }
+    return true;
+  }
   case Intrinsic::uadd_with_overflow:
     return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder);
   case Intrinsic::sadd_with_overflow:
@@ -526,8 +680,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
   case Intrinsic::smul_with_overflow:
     return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+  case Intrinsic::pow:
+    MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
+        .addDef(getOrCreateVReg(CI))
+        .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
+        .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+    return true;
   case Intrinsic::memcpy:
-    return translateMemcpy(CI, MIRBuilder);
+  case Intrinsic::memmove:
+  case Intrinsic::memset:
+    return translateMemfunc(CI, MIRBuilder, ID);
   case Intrinsic::eh_typeid_for: {
     GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
     unsigned Reg = getOrCreateVReg(CI);
@@ -546,7 +708,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     getStackGuard(getOrCreateVReg(CI), MIRBuilder);
     return true;
   case Intrinsic::stackprotector: {
-    LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
+    LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
     unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
     getStackGuard(GuardVal, MIRBuilder);
@@ -564,18 +726,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
   return false;
 }
 
+bool IRTranslator::translateInlineAsm(const CallInst &CI,
+                                      MachineIRBuilder &MIRBuilder) {
+  const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue());
+  if (!IA.getConstraintString().empty())
+    return false;
+
+  unsigned ExtraInfo = 0;
+  if (IA.hasSideEffects())
+    ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+  if (IA.getDialect() == InlineAsm::AD_Intel)
+    ExtraInfo |= InlineAsm::Extra_AsmDialect;
+
+  MIRBuilder.buildInstr(TargetOpcode::INLINEASM)
+
.addExternalSymbol(IA.getAsmString().c_str()) + .addImm(ExtraInfo); + + return true; +} + bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { const CallInst &CI = cast<CallInst>(U); auto TII = MF->getTarget().getIntrinsicInfo(); const Function *F = CI.getCalledFunction(); + if (CI.isInlineAsm()) + return translateInlineAsm(CI, MIRBuilder); + if (!F || !F->isIntrinsic()) { unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI); SmallVector<unsigned, 8> Args; for (auto &Arg: CI.arg_operands()) Args.push_back(getOrCreateVReg(*Arg)); - return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() { + MF->getFrameInfo().setHasCalls(true); + return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() { return getOrCreateVReg(*CI.getCalledValue()); }); } @@ -594,10 +779,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory()); for (auto &Arg : CI.arg_operands()) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) - MIB.addImm(CI->getSExtValue()); - else - MIB.addUse(getOrCreateVReg(*Arg)); + // Some intrinsics take metadata parameters. Reject them. + if (isa<MetadataAsValue>(Arg)) + return false; + MIB.addUse(getOrCreateVReg(*Arg)); } return true; } @@ -610,7 +795,7 @@ bool IRTranslator::translateInvoke(const User &U, const BasicBlock *ReturnBB = I.getSuccessor(0); const BasicBlock *EHPadBB = I.getSuccessor(1); - const Value *Callee(I.getCalledValue()); + const Value *Callee = I.getCalledValue(); const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) return false; @@ -634,23 +819,24 @@ bool IRTranslator::translateInvoke(const User &U, MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol); unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I); - SmallVector<CallLowering::ArgInfo, 8> Args; + SmallVector<unsigned, 8> Args; for (auto &Arg: I.arg_operands()) - Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); + Args.push_back(getOrCreateVReg(*Arg)); - if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0), - CallLowering::ArgInfo(Res, I.getType()), Args)) + if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, + [&]() { return getOrCreateVReg(*I.getCalledValue()); })) return false; MCSymbol *EndSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol); // FIXME: track probabilities. - MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB), - &ReturnMBB = getOrCreateBB(*ReturnBB); + MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB), + &ReturnMBB = getMBB(*ReturnBB); MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol); MIRBuilder.getMBB().addSuccessor(&ReturnMBB); MIRBuilder.getMBB().addSuccessor(&EHPadMBB); + MIRBuilder.buildBr(ReturnMBB); return true; } @@ -684,37 +870,161 @@ bool IRTranslator::translateLandingPad(const User &U, MIRBuilder.buildInstr(TargetOpcode::EH_LABEL) .addSym(MF->addLandingPad(&MBB)); + LLT Ty = getLLTForType(*LP.getType(), *DL); + unsigned Undef = MRI->createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Undef); + + SmallVector<LLT, 2> Tys; + for (Type *Ty : cast<StructType>(LP.getType())->elements()) + Tys.push_back(getLLTForType(*Ty, *DL)); + assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); + // Mark exception register as live in. 
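
[Editorial aside, not part of the patch] Shape of the landingpad lowering that follows, reduced to plain C++ (struct and names invented): the personality's exception-pointer and selector registers are copied out, the selector is narrowed through a pointer-to-int cast, and both land in the two-field landingpad value:

    #include <cstdint>
    struct LandingPadValue {
      std::uintptr_t ExceptionPtr; // from getExceptionPointerRegister()
      std::uint32_t Selector;      // from getExceptionSelectorRegister()
    };
    static LandingPadValue packLandingPad(std::uintptr_t ExnReg,
                                          std::uintptr_t SelReg) {
      return {ExnReg, static_cast<std::uint32_t>(SelReg)}; // the ptrtoint step
    }
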
-  SmallVector<unsigned, 2> Regs;
-  SmallVector<uint64_t, 2> Offsets;
-  LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits());
-  if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) {
-    unsigned VReg = MRI->createGenericVirtualRegister(p0);
-    MIRBuilder.buildCopy(VReg, Reg);
-    Regs.push_back(VReg);
-    Offsets.push_back(0);
+  unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+  if (!ExceptionReg)
+    return false;
+
+  MBB.addLiveIn(ExceptionReg);
+  unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]),
+           Tmp = MRI->createGenericVirtualRegister(Ty);
+  MIRBuilder.buildCopy(VReg, ExceptionReg);
+  MIRBuilder.buildInsert(Tmp, Undef, VReg, 0);
+
+  unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+  if (!SelectorReg)
+    return false;
+
+  MBB.addLiveIn(SelectorReg);
+
+  // N.b. the exception selector register always has pointer type and may not
+  // match the actual IR-level type in the landingpad so an extra cast is
+  // needed.
+  unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
+  MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+
+  VReg = MRI->createGenericVirtualRegister(Tys[1]);
+  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg);
+  MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg,
+                         Tys[0].getSizeInBits());
+  return true;
+}
+
+bool IRTranslator::translateAlloca(const User &U,
+                                   MachineIRBuilder &MIRBuilder) {
+  auto &AI = cast<AllocaInst>(U);
+
+  if (AI.isStaticAlloca()) {
+    unsigned Res = getOrCreateVReg(AI);
+    int FI = getOrCreateFrameIndex(AI);
+    MIRBuilder.buildFrameIndex(Res, FI);
+    return true;
+  }
+
+  // Now we're in the harder dynamic case.
+  Type *Ty = AI.getAllocatedType();
+  unsigned Align =
+      std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment());
+
+  unsigned NumElts = getOrCreateVReg(*AI.getArraySize());
+
+  Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
+  LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
+  if (MRI->getType(NumElts) != IntPtrTy) {
+    unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy);
+    MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts);
+    NumElts = ExtElts;
   }
-  if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) {
-    unsigned VReg = MRI->createGenericVirtualRegister(p0);
-    MIRBuilder.buildCopy(VReg, Reg);
-    Regs.push_back(VReg);
-    Offsets.push_back(p0.getSizeInBits());
+
+  unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
+  unsigned TySize =
+      getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
+  MIRBuilder.buildMul(AllocSize, NumElts, TySize);
+
+  LLT PtrTy = getLLTForType(*AI.getType(), *DL);
+  auto &TLI = *MF->getSubtarget().getTargetLowering();
+  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+
+  unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildCopy(SPTmp, SPReg);
+
+  unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize);
+
+  // Handle alignment. We have to realign if the allocation granule was smaller
+  // than stack alignment, or the specific alloca requires more than stack
+  // alignment.
+  unsigned StackAlign =
+      MF->getSubtarget().getFrameLowering()->getStackAlignment();
+  Align = std::max(Align, StackAlign);
+  if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
+    // Round the size of the allocation up to the stack alignment size
+    // by adding SA-1 to the size. This doesn't overflow because we're
+    // computing an address inside an alloca.
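
[Editorial aside, not part of the patch] The dynamic-alloca arithmetic above and below, as plain integer math (a stand-in for the buildMul/buildGEP/buildPtrMask sequence): the type size is negated because the stack grows downward, and masking rounds the new stack pointer down to the required power-of-two alignment:

    #include <cstdint>
    static std::uint64_t dynAllocaSP(std::uint64_t SP, std::uint64_t NumElts,
                                     std::uint64_t EltSize,
                                     std::uint64_t Align /* power of two */) {
      std::uint64_t NewSP = SP - NumElts * EltSize; // mul by -size, then GEP
      return NewSP & ~(Align - 1); // buildPtrMask(..., Log2_32(Align))
    }
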
+ unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); + MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); + AllocTmp = AlignedAlloc; } - MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets); + MIRBuilder.buildCopy(SPReg, AllocTmp); + MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); + + MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); + assert(MF->getFrameInfo().hasVarSizedObjects()); return true; } -bool IRTranslator::translateStaticAlloca(const AllocaInst &AI, - MachineIRBuilder &MIRBuilder) { - if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca()) - return false; +bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) { + // FIXME: We may need more info about the type. Because of how LLT works, + // we're completely discarding the i64/double distinction here (amongst + // others). Fortunately the ABIs I know of where that matters don't use va_arg + // anyway but that's not guaranteed. + MIRBuilder.buildInstr(TargetOpcode::G_VAARG) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addImm(DL->getABITypeAlignment(U.getType())); + return true; +} + +bool IRTranslator::translateInsertElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(1)); + ValToVReg[&U] = Elt; + return true; + } + unsigned Res = getOrCreateVReg(U); + unsigned Val = getOrCreateVReg(*U.getOperand(0)); + unsigned Elt = getOrCreateVReg(*U.getOperand(1)); + unsigned Idx = getOrCreateVReg(*U.getOperand(2)); + MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx); + return true; +} + +bool IRTranslator::translateExtractElement(const User &U, + MachineIRBuilder &MIRBuilder) { + // If it is a <1 x Ty> vector, use the scalar as it is + // not a legal vector type in LLT. + if (U.getOperand(0)->getType()->getVectorNumElements() == 1) { + unsigned Elt = getOrCreateVReg(*U.getOperand(0)); + ValToVReg[&U] = Elt; + return true; + } + unsigned Res = getOrCreateVReg(U); + unsigned Val = getOrCreateVReg(*U.getOperand(0)); + unsigned Idx = getOrCreateVReg(*U.getOperand(1)); + MIRBuilder.buildExtractVectorElement(Res, Val, Idx); + return true; +} - assert(AI.isStaticAlloca() && "only handle static allocas now"); - unsigned Res = getOrCreateVReg(AI); - int FI = getOrCreateFrameIndex(AI); - MIRBuilder.buildFrameIndex(Res, FI); +bool IRTranslator::translateShuffleVector(const User &U, + MachineIRBuilder &MIRBuilder) { + MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR) + .addDef(getOrCreateVReg(U)) + .addUse(getOrCreateVReg(*U.getOperand(0))) + .addUse(getOrCreateVReg(*U.getOperand(1))) + .addUse(getOrCreateVReg(*U.getOperand(2))); return true; } @@ -736,11 +1046,21 @@ void IRTranslator::finishPendingPhis() { // won't create extra control flow here, otherwise we need to find the // dominating predecessor here (or perhaps force the weirder IRTranslators // to provide a simple boundary). 
+ SmallSet<const BasicBlock *, 4> HandledPreds; + for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { - assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) && - "I appear to have misunderstood Machine PHIs"); - MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i))); - MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]); + auto IRPred = PI->getIncomingBlock(i); + if (HandledPreds.count(IRPred)) + continue; + + HandledPreds.insert(IRPred); + unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i)); + for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) { + assert(Pred->isSuccessor(MIB->getParent()) && + "incorrect CFG at MachineBasicBlock level"); + MIB.addUse(ValReg); + MIB.addMBB(Pred); + } } } } @@ -752,9 +1072,7 @@ bool IRTranslator::translate(const Instruction &Inst) { case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; } } @@ -764,25 +1082,68 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) { else if (auto CF = dyn_cast<ConstantFP>(&C)) EntryBuilder.buildFConstant(Reg, *CF); else if (isa<UndefValue>(C)) - EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg); + EntryBuilder.buildUndef(Reg); else if (isa<ConstantPointerNull>(C)) EntryBuilder.buildConstant(Reg, 0); else if (auto GV = dyn_cast<GlobalValue>(&C)) EntryBuilder.buildGlobalValue(Reg, GV); - else if (auto CE = dyn_cast<ConstantExpr>(&C)) { + else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) { + if (!CAZ->getType()->isVectorTy()) + return false; + // Return the scalar if it is a <1 x Ty> vector. + if (CAZ->getNumElements() == 1) + return translate(*CAZ->getElementValue(0u), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { + Constant &Elt = *CAZ->getElementValue(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) { + // Return the scalar if it is a <1 x Ty> vector. + if (CV->getNumElements() == 1) + return translate(*CV->getElementAsConstant(0), Reg); + std::vector<unsigned> Ops; + for (unsigned i = 0; i < CV->getNumElements(); ++i) { + Constant &Elt = *CV->getElementAsConstant(i); + Ops.push_back(getOrCreateVReg(Elt)); + } + EntryBuilder.buildMerge(Reg, Ops); + } else if (auto CE = dyn_cast<ConstantExpr>(&C)) { switch(CE->getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder); #include "llvm/IR/Instruction.def" default: - if (!TPC->isGlobalISelAbortEnabled()) - return false; - llvm_unreachable("unknown opcode"); + return false; } - } else if (!TPC->isGlobalISelAbortEnabled()) + } else if (auto CS = dyn_cast<ConstantStruct>(&C)) { + // Return the element if it is a single element ConstantStruct. 
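+    // E.g. a constant of type {i64} is translated as a plain i64: buildCast
+    // collapses to a COPY when the source and destination LLTs match.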
+    if (CS->getNumOperands() == 1) {
+      unsigned EltReg = getOrCreateVReg(*CS->getOperand(0));
+      EntryBuilder.buildCast(Reg, EltReg);
+      return true;
+    }
+    SmallVector<unsigned, 4> Ops;
+    SmallVector<uint64_t, 4> Indices;
+    uint64_t Offset = 0;
+    for (unsigned i = 0; i < CS->getNumOperands(); ++i) {
+      unsigned OpReg = getOrCreateVReg(*CS->getOperand(i));
+      Ops.push_back(OpReg);
+      Indices.push_back(Offset);
+      Offset += MRI->getType(OpReg).getSizeInBits();
+    }
+    EntryBuilder.buildSequence(Reg, Ops, Indices);
+  } else if (auto CV = dyn_cast<ConstantVector>(&C)) {
+    if (CV->getNumOperands() == 1)
+      return translate(*CV->getOperand(0), Reg);
+    SmallVector<unsigned, 4> Ops;
+    for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
+      Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
+    }
+    EntryBuilder.buildMerge(Reg, Ops);
+  } else
     return false;
-  else
-    llvm_unreachable("unhandled constant kind");
 
   return true;
 }
@@ -793,7 +1154,12 @@ void IRTranslator::finalizeFunction() {
   PendingPHIs.clear();
   ValToVReg.clear();
   FrameIndices.clear();
-  Constants.clear();
+  MachinePreds.clear();
+  // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it
+  // to avoid accessing freed memory (in runOnMachineFunction) and to avoid
+  // destroying it twice (in ~IRTranslator() and ~LLVMContext()).
+  EntryBuilder = MachineIRBuilder();
+  CurBuilder = MachineIRBuilder();
 }
 
 bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
@@ -807,85 +1173,97 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   MRI = &MF->getRegInfo();
   DL = &F.getParent()->getDataLayout();
   TPC = &getAnalysis<TargetPassConfig>();
+  ORE = make_unique<OptimizationRemarkEmitter>(&F);
 
   assert(PendingPHIs.empty() && "stale PHIs");
 
-  // Setup a separate basic-block for the arguments and constants, falling
-  // through to the IR-level Function's entry block.
+  // Release the per-function state when we return, whether we succeeded or not.
+  auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); });
+
+  // Set up a separate basic-block for the arguments and constants.
   MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
   MF->push_back(EntryBB);
-  EntryBB->addSuccessor(&getOrCreateBB(F.front()));
   EntryBuilder.setMBB(*EntryBB);
 
+  // Create all blocks, in IR order, to preserve the layout.
+  for (const BasicBlock &BB: F) {
+    auto *&MBB = BBToMBB[&BB];
+
+    MBB = MF->CreateMachineBasicBlock(&BB);
+    MF->push_back(MBB);
+
+    if (BB.hasAddressTaken())
+      MBB->setHasAddressTaken();
+  }
+
+  // Make our arguments/constants entry block fall through to the IR entry block.
+  EntryBB->addSuccessor(&getMBB(F.front()));
+
   // Lower the actual args into this basic block.
   SmallVector<unsigned, 8> VRegArgs;
   for (const Argument &Arg: F.args())
     VRegArgs.push_back(getOrCreateVReg(Arg));
-  bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs);
-  if (!Succeeded) {
-    if (!TPC->isGlobalISelAbortEnabled()) {
-      MF->getProperties().set(
-          MachineFunctionProperties::Property::FailedISel);
-      finalizeFunction();
-      return false;
-    }
-    report_fatal_error("Unable to lower arguments");
+  if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) {
+    OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                               MF->getFunction()->getSubprogram(),
+                               &MF->getFunction()->getEntryBlock());
+    R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
+    reportTranslationError(*MF, *TPC, *ORE, R);
+    return false;
   }
 
   // And translate the function!
   for (const BasicBlock &BB: F) {
-    MachineBasicBlock &MBB = getOrCreateBB(BB);
+    MachineBasicBlock &MBB = getMBB(BB);
 
     // Set the insertion point of all the following translations to
     // the end of this basic block.
     CurBuilder.setMBB(MBB);
 
     for (const Instruction &Inst: BB) {
-      Succeeded &= translate(Inst);
-      if (!Succeeded) {
-        if (TPC->isGlobalISelAbortEnabled())
-          reportTranslationError(Inst, "unable to translate instruction");
-        MF->getProperties().set(
-            MachineFunctionProperties::Property::FailedISel);
-        break;
-      }
-    }
-  }
-
-  if (Succeeded) {
-    finishPendingPhis();
-
-    // Now that the MachineFrameInfo has been configured, no further changes to
-    // the reserved registers are possible.
-    MRI->freezeReservedRegs(*MF);
-
-    // Merge the argument lowering and constants block with its single
-    // successor, the LLVM-IR entry block. We want the basic block to
-    // be maximal.
-    assert(EntryBB->succ_size() == 1 &&
-           "Custom BB used for lowering should have only one successor");
-    // Get the successor of the current entry block.
-    MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
-    assert(NewEntryBB.pred_size() == 1 &&
-           "LLVM-IR entry block has a predecessor!?");
-    // Move all the instruction from the current entry block to the
-    // new entry block.
-    NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
-                      EntryBB->end());
-
-    // Update the live-in information for the new entry block.
-    for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
-      NewEntryBB.addLiveIn(LiveIn);
-    NewEntryBB.sortUniqueLiveIns();
+      if (translate(Inst))
+        continue;
 
-    // Get rid of the now empty basic block.
-    EntryBB->removeSuccessor(&NewEntryBB);
-    MF->remove(EntryBB);
+      std::string InstStrStorage;
+      raw_string_ostream InstStr(InstStrStorage);
+      InstStr << Inst;
 
-    assert(&MF->front() == &NewEntryBB &&
-           "New entry wasn't next in the list of basic block!");
+      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                 Inst.getDebugLoc(), &BB);
+      R << "unable to translate instruction: " << ore::NV("Opcode", &Inst)
+        << ": '" << InstStr.str() << "'";
+      reportTranslationError(*MF, *TPC, *ORE, R);
+      return false;
+    }
   }
 
-  finalizeFunction();
+  finishPendingPhis();
+
+  // Merge the argument lowering and constants block with its single
+  // successor, the LLVM-IR entry block. We want the basic block to
+  // be maximal.
+  assert(EntryBB->succ_size() == 1 &&
+         "Custom BB used for lowering should have only one successor");
+  // Get the successor of the current entry block.
+  MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
+  assert(NewEntryBB.pred_size() == 1 &&
+         "LLVM-IR entry block has a predecessor!?");
+  // Move all the instructions from the current entry block to the
+  // new entry block.
+  NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
+                    EntryBB->end());
+
+  // Update the live-in information for the new entry block.
+  for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
+    NewEntryBB.addLiveIn(LiveIn);
+  NewEntryBB.sortUniqueLiveIns();
+
+  // Get rid of the now empty basic block.
+ EntryBB->removeSuccessor(&NewEntryBB); + MF->remove(EntryBB); + MF->DeleteMachineBasicBlock(EntryBB); + + assert(&MF->front() == &NewEntryBB && + "New entry wasn't next in the list of basic block!"); return false; } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 1d205cd6c9c8..a16e14fe2db6 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -12,14 +12,19 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #define DEBUG_TYPE "instruction-select" @@ -44,17 +49,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -static void reportSelectionError(const MachineInstr *MI, const Twine &Message) { - const MachineFunction &MF = *MI->getParent()->getParent(); - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << Message << ":\nIn function: " << MF.getName() << '\n'; - if (MI) - Err << *MI << '\n'; - report_fatal_error(Err.str()); -} - bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // No matter what happens, whether we successfully select the function or not, + // nothing is going to use the vreg types after us. Make sure they disappear. + auto ClearVRegTypesOnReturn = + make_scope_exit([&]() { MRI.getVRegToType().clear(); }); + // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) @@ -66,10 +68,10 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector(); assert(ISel && "Cannot work without InstructionSelector"); - // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many - // other MF/MFI fields we need to initialize. + // An optimization remark emitter. Used to report failures. + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); - const MachineRegisterInfo &MRI = MF.getRegInfo(); + // FIXME: There are many other MF/MFI fields we need to initialize. #ifndef NDEBUG // Check that our input is fully legal: we require the function to have the @@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // that it has the same layering problem, but we only use inline methods so // end up not needing to link against the GlobalISel library. 
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) - for (const MachineBasicBlock &MBB : MF) - for (const MachineInstr &MI : MBB) - if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) - reportSelectionError(&MI, "Instruction is not legal"); + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "instruction is not legal", MI); + return false; + } #endif // FIXME: We could introduce new blocks and will need to fix the outer loop. // Until then, keep track of the number of blocks to assert that we don't. const size_t NumBlocks = MF.size(); - bool Failed = false; for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) continue; @@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Selecting: \n " << MI); + // We could have folded this instruction away already, making it dead. + // If so, erase it. + if (isTriviallyDead(MI, MRI)) { + DEBUG(dbgs() << "Is dead; erasing.\n"); + MI.eraseFromParentAndMarkDBGValuesForRemoval(); + continue; + } + if (!ISel->select(MI)) { - if (TPC.isGlobalISelAbortEnabled()) - // FIXME: It would be nice to dump all inserted instructions. It's - // not - // obvious how, esp. considering select() can insert after MI. - reportSelectionError(&MI, "Cannot select"); - Failed = true; - break; + // FIXME: It would be nice to dump all inserted instructions. It's + // not obvious how, esp. considering select() can insert after MI. + reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI); + return false; } // Dump the range of instructions that MI expanded into. @@ -136,39 +145,47 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { } } + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + // Now that selection is complete, there are no more generic vregs. Verify // that the size of the now-constrained vreg is unchanged and that it has a // register class. for (auto &VRegToType : MRI.getVRegToType()) { unsigned VReg = VRegToType.first; auto *RC = MRI.getRegClassOrNull(VReg); - auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end() - ? 
nullptr - : &*MRI.def_instr_begin(VReg); - if (!RC) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError(MI, "VReg as no regclass after selection"); - Failed = true; - break; - } + MachineInstr *MI = nullptr; + if (!MRI.def_empty(VReg)) + MI = &*MRI.def_instr_begin(VReg); + else if (!MRI.use_empty(VReg)) + MI = &*MRI.use_instr_begin(VReg); + + if (MI && !RC) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has no regclass after selection", *MI); + return false; + } else if (!RC) + continue; if (VRegToType.second.isValid() && - VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) { - if (TPC.isGlobalISelAbortEnabled()) - reportSelectionError( - MI, "VReg has explicit size different from class size"); - Failed = true; - break; + VRegToType.second.getSizeInBits() > TRI.getRegSizeInBits(*RC)) { + reportGISelFailure(MF, TPC, MORE, "gisel-select", + "VReg has explicit size different from class size", + *MI); + return false; } } - MRI.getVRegToType().clear(); - - if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) { - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + if (MF.size() != NumBlocks) { + MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure", + MF.getFunction()->getSubprogram(), + /*MBB=*/nullptr); + R << "inserting blocks is not supported yet"; + reportGISelFailure(MF, TPC, MORE, R); return false; } - assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet"); + + auto &TLI = *MF.getSubtarget().getTargetLowering(); + TLI.finalizeLowering(MF); // FIXME: Should we accurately track changes? return true; diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5c34da0dc557..4c0b06dffd21 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -14,6 +14,8 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -55,6 +57,29 @@ bool InstructionSelector::constrainSelectedInstRegOperands( // constrainOperandRegClass does that for us. MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(), Reg, OpI)); + + // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been + // done. 
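+    // E.g. for a two-address instruction whose MCInstrDesc ties a use to
+    // def 0, recording the tie here lets the register allocator assign both
+    // operands the same physical register.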
+ if (MO.isUse()) { + int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO); + if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx)) + I.tieOperands(DefIdx, OpI); + } } return true; } + +bool InstructionSelector::isOperandImmEqual( + const MachineOperand &MO, int64_t Value, + const MachineRegisterInfo &MRI) const { + + if (MO.isReg() && MO.getReg()) + if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) + return *VRegVal == Value; + return false; +} + +bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const { + return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() && + MI.implicit_operands().begin() == MI.implicit_operands().end(); +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index e86356880e99..aec379197dfb 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -16,12 +16,16 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <iterator> + #define DEBUG_TYPE "legalizer" using namespace llvm; @@ -92,10 +96,7 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, "unexpected physical register in G_SEQUENCE"); // Finally we can replace the uses. - for (auto &Use : MRI.use_operands(ExtractReg)) { - Changed = true; - Use.setReg(OrigReg); - } + MRI.replaceRegWith(ExtractReg, OrigReg); } if (AllDefsReplaced) { @@ -114,6 +115,36 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, return Changed; } +bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, + const TargetInstrInfo &TII) { + if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) + return false; + + unsigned NumDefs = MI.getNumOperands() - 1; + unsigned SrcReg = MI.getOperand(NumDefs).getReg(); + MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg); + if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES) + return false; + + if (MergeI.getNumOperands() - 1 != NumDefs) + return false; + + // FIXME: is a COPY appropriate if the types mismatch? We know both registers + // are allocatable by now. + if (MRI.getType(MI.getOperand(0).getReg()) != + MRI.getType(MergeI.getOperand(1).getReg())) + return false; + + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) + MRI.replaceRegWith(MI.getOperand(Idx).getReg(), + MergeI.getOperand(Idx + 1).getReg()); + + MI.eraseFromParent(); + if (MRI.use_empty(MergeI.getOperand(0).getReg())) + MergeI.eraseFromParent(); + return true; +} + bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( @@ -122,7 +153,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n'); init(MF); const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); - const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo(); + MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); LegalizerHelper Helper(MF); // FIXME: an instruction may need more than one pass before it is legal. 
For
@@ -132,7 +163,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
   // convergence for performance reasons.
   bool Changed = false;
   MachineBasicBlock::iterator NextMI;
-  for (auto &MBB : MF)
+  for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); MI = NextMI) {
       // Get the next Instruction before we try to legalize, because there's a
       // good chance MI will be deleted.
@@ -142,27 +173,52 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
      // and are assumed to be legal.
      if (!isPreISelGenericOpcode(MI->getOpcode()))
        continue;
-
-      auto Res = Helper.legalizeInstr(*MI, LegalizerInfo);
-
-      // Error out if we couldn't legalize this instruction. We may want to fall
-      // back to DAG ISel instead in the future.
-      if (Res == LegalizerHelper::UnableToLegalize) {
-        if (!TPC.isGlobalISelAbortEnabled()) {
-          MF.getProperties().set(
-              MachineFunctionProperties::Property::FailedISel);
-          return false;
+      unsigned NumNewInsns = 0;
+      SmallVector<MachineInstr *, 4> WorkList;
+      Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) {
+        // Only legalize pre-isel generic instructions. The legalization
+        // process could generate target-specific pseudo instructions with
+        // generic types; don't record those.
+        if (isPreISelGenericOpcode(MI->getOpcode())) {
+          ++NumNewInsns;
+          WorkList.push_back(MI);
        }
-        std::string Msg;
-        raw_string_ostream OS(Msg);
-        OS << "unable to legalize instruction: ";
-        MI->print(OS);
-        report_fatal_error(OS.str());
-      }
+      });
+      WorkList.push_back(&*MI);
+
+      LegalizerHelper::LegalizeResult Res;
+      unsigned Idx = 0;
+      do {
+        Res = Helper.legalizeInstrStep(*WorkList[Idx]);
+        // Error out if we couldn't legalize this instruction. We may want to
+        // fall back to DAG ISel instead in the future.
+        if (Res == LegalizerHelper::UnableToLegalize) {
+          Helper.MIRBuilder.stopRecordingInsertions();
+          reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+                             "unable to legalize instruction",
+                             *WorkList[Idx]);
+          return false;
+        }
+        Changed |= Res == LegalizerHelper::Legalized;
+        ++Idx;
+
+#ifndef NDEBUG
+        if (NumNewInsns) {
+          DEBUG(dbgs() << ".. .. Emitted " << NumNewInsns << " insns\n");
+          for (auto I = WorkList.end() - NumNewInsns, E = WorkList.end();
+               I != E; ++I)
+            DEBUG(dbgs() << ".. .. New MI: "; (*I)->print(dbgs()));
+          NumNewInsns = 0;
+        }
+#endif
+      } while (Idx < WorkList.size());
 
-      Changed |= Res == LegalizerHelper::Legalized;
+      Helper.MIRBuilder.stopRecordingInsertions();
    }
-
+  }
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
@@ -172,7 +228,12 @@
       // good chance MI will be deleted.
       NextMI = std::next(MI);
 
-      Changed |= combineExtracts(*MI, MRI, TII);
+      // combineExtracts erases MI.
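+      // MI is invalid past this point on success; iteration resumes at
+      // NextMI, which was captured before any combine ran.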
+ if (combineExtracts(*MI, MRI, TII)) { + Changed = true; + continue; + } + Changed |= combineMerges(*MI, MRI, TII); } } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index eb25b6ca268f..ef5818dabe23 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -24,97 +24,112 @@ #include <sstream> -#define DEBUG_TYPE "legalize-mir" +#define DEBUG_TYPE "legalizer" using namespace llvm; LegalizerHelper::LegalizerHelper(MachineFunction &MF) - : MRI(MF.getRegInfo()) { + : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) { MIRBuilder.setMF(MF); } LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstrStep(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - auto Action = LegalizerInfo.getAction(MI, MRI); +LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { + DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs())); + + auto Action = LI.getAction(MI, MRI); switch (std::get<0>(Action)) { case LegalizerInfo::Legal: + DEBUG(dbgs() << ".. Already legal\n"); return AlreadyLegal; case LegalizerInfo::Libcall: + DEBUG(dbgs() << ".. Convert to libcall\n"); return libcall(MI); case LegalizerInfo::NarrowScalar: + DEBUG(dbgs() << ".. Narrow scalar\n"); return narrowScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::WidenScalar: + DEBUG(dbgs() << ".. Widen scalar\n"); return widenScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::Lower: + DEBUG(dbgs() << ".. Lower\n"); return lower(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::FewerElements: + DEBUG(dbgs() << ".. Reduce number of elements\n"); return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action)); + case LegalizerInfo::Custom: + DEBUG(dbgs() << ".. Custom legalization\n"); + return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized + : UnableToLegalize; default: + DEBUG(dbgs() << ".. Unable to legalize\n"); return UnableToLegalize; } } -LegalizerHelper::LegalizeResult -LegalizerHelper::legalizeInstr(MachineInstr &MI, - const LegalizerInfo &LegalizerInfo) { - SmallVector<MachineInstr *, 4> WorkList; - MIRBuilder.recordInsertions( - [&](MachineInstr *MI) { WorkList.push_back(MI); }); - WorkList.push_back(&MI); - - bool Changed = false; - LegalizeResult Res; - unsigned Idx = 0; - do { - Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo); - if (Res == UnableToLegalize) { - MIRBuilder.stopRecordingInsertions(); - return UnableToLegalize; - } - Changed |= Res == Legalized; - ++Idx; - } while (Idx < WorkList.size()); - - MIRBuilder.stopRecordingInsertions(); - - return Changed ? Legalized : AlreadyLegal; -} - void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, SmallVectorImpl<unsigned> &VRegs) { - unsigned Size = Ty.getSizeInBits(); - SmallVector<uint64_t, 4> Indexes; - for (int i = 0; i < NumParts; ++i) { + for (int i = 0; i < NumParts; ++i) VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); - Indexes.push_back(i * Size); + MIRBuilder.buildUnmerge(VRegs, Reg); +} + +static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { + switch (Opcode) { + case TargetOpcode::G_SDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::SDIV_I32; + case TargetOpcode::G_UDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::UDIV_I32; + case TargetOpcode::G_FADD: + assert((Size == 32 || Size == 64) && "Unsupported size"); + return Size == 64 ? 
RTLIB::ADD_F64 : RTLIB::ADD_F32; + case TargetOpcode::G_FREM: + return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32; + case TargetOpcode::G_FPOW: + return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32; } - MIRBuilder.buildExtract(VRegs, Indexes, Reg); + llvm_unreachable("Unknown libcall function"); +} + +static LegalizerHelper::LegalizeResult +simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, + Type *OpType) { + auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + auto Libcall = getRTLibDesc(MI.getOpcode(), Size); + const char *Name = TLI.getLibcallName(Libcall); + MIRBuilder.getMF().getFrameInfo().setHasCalls(true); + CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall), + MachineOperand::CreateES(Name), + {MI.getOperand(0).getReg(), OpType}, + {{MI.getOperand(1).getReg(), OpType}, + {MI.getOperand(2).getReg(), OpType}}); + MI.eraseFromParent(); + return LegalizerHelper::Legalized; } LegalizerHelper::LegalizeResult LegalizerHelper::libcall(MachineInstr &MI) { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = Ty.getSizeInBits(); + auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: { + Type *Ty = Type::getInt32Ty(Ctx); + return simpleLibcall(MI, MIRBuilder, Size, Ty); + } + case TargetOpcode::G_FADD: + case TargetOpcode::G_FPOW: case TargetOpcode::G_FREM: { - auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); - auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); - auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - const char *Name = - TLI.getLibcallName(Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32); - - CLI.lowerCall( - MIRBuilder, MachineOperand::CreateES(Name), - {MI.getOperand(0).getReg(), Ty}, - {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}}); - MI.eraseFromParent(); - return Legalized; + return simpleLibcall(MI, MIRBuilder, Size, Ty); } } } @@ -125,19 +140,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, // FIXME: Don't know how to handle secondary types yet. if (TypeIdx != 0) return UnableToLegalize; + + MIRBuilder.setInstr(MI); + switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_ADD: { // Expand in terms of carry-setting/consuming G_ADDE instructions. 
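+    // Conceptually (register names invented for illustration), an s64 add
+    // narrowed to s32 pieces becomes:
+    //   lo, carry = G_ADDE a_lo, b_lo, zero
+    //   hi, _     = G_ADDE a_hi, b_hi, carry
+    //   res       = G_MERGE_VALUES lo, hi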
- unsigned NarrowSize = NarrowTy.getSizeInBits(); int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowTy.getSizeInBits(); - MIRBuilder.setInstr(MI); - SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -152,11 +166,138 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Src2Regs[i], CarryIn); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); CarryIn = CarryOut; } unsigned DstReg = MI.getOperand(0).getReg(); - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(2).getReg(); + int64_t OpStart = MI.getOperand(3).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned DstStart = i * NarrowSize; + + if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { + // No part of the insert affects this subregister, forward the original. + DstRegs.push_back(SrcRegs[i]); + continue; + } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is defined by this insert, forward the new + // value. + DstRegs.push_back(OpReg); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. + int64_t ExtractOffset, InsertOffset, SegSize; + if (OpStart < DstStart) { + InsertOffset = 0; + ExtractOffset = DstStart - OpStart; + SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); + } else { + InsertOffset = OpStart - DstStart; + ExtractOffset = 0; + SegSize = + std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); + } + + unsigned SegReg = OpReg; + if (ExtractOffset != 0 || SegSize != OpSize) { + // A genuine extract is needed. 
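+        // Worked example (illustrative): NarrowSize = 32, OpStart = 16,
+        // OpSize = 32. Part 0 (DstStart = 0) gets ExtractOffset = 0,
+        // InsertOffset = 16, SegSize = 16 (the low half of OpReg); part 1
+        // (DstStart = 32) gets ExtractOffset = 16, InsertOffset = 0,
+        // SegSize = 16 (the high half).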
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); + } + + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset); + DstRegs.push_back(DstReg); + } + + assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_LOAD: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT NarrowPtrTy = LLT::pointer( + MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + unsigned SrcReg = MRI.createGenericVirtualRegister(NarrowPtrTy); + unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); + + MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); + MIRBuilder.buildGEP(SrcReg, MI.getOperand(1).getReg(), Offset); + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. + MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin()); + + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_STORE: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + LLT NarrowPtrTy = LLT::pointer( + MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize); + + SmallVector<unsigned, 2> SrcRegs; + extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs); + + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowPtrTy); + unsigned Offset = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIRBuilder.buildConstant(Offset, i * NarrowSize / 8); + MIRBuilder.buildGEP(DstReg, MI.getOperand(1).getReg(), Offset); + // TODO: This is conservatively correct, but we probably want to split the + // memory operands in the future. 
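+      // Reusing the wide MachineMemOperand makes each narrow store look as
+      // large as the original, which is safe but overly conservative for
+      // alias analysis.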
+ MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin()); + } + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_CONSTANT: { + unsigned NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize; + const APInt &Cst = MI.getOperand(1).getCImm()->getValue(); + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); + ConstantInt *CI = + ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize)); + MIRBuilder.buildConstant(DstReg, *CI); + DstRegs.push_back(DstReg); + } + unsigned DstReg = MI.getOperand(0).getReg(); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } @@ -175,7 +316,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_MUL: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: - case TargetOpcode::G_SUB: { + case TargetOpcode::G_SUB: + case TargetOpcode::G_SHL: { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. @@ -195,10 +337,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: { - unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV - ? TargetOpcode::G_SEXT - : TargetOpcode::G_ZEXT; + case TargetOpcode::G_UDIV: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { + unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_ASHR + ? TargetOpcode::G_SEXT + : TargetOpcode::G_ZEXT; unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse( @@ -218,6 +363,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SELECT: { + if (TypeIdx != 0) + return UnableToLegalize; + + // Perform operation at larger width (any extension is fine here, high bits + // don't affect the result) and then truncate the result back to the + // original type. 
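+    // E.g. an s1 G_SELECT widened to s32: both inputs get G_ANYEXT, the
+    // select runs at s32, and the result is truncated back to s1; the
+    // condition operand keeps its original type.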
+ unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); + unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg()); + MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg()); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(TargetOpcode::G_SELECT) + .addDef(DstExt) + .addReg(MI.getOperand(1).getReg()) + .addUse(Src1Ext) + .addUse(Src2Ext); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(DstExt) + .addUse(MI.getOperand(1).getReg()); + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: { + if (TypeIdx != 1) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + + if (MI.getOpcode() == TargetOpcode::G_SITOFP) { + MIRBuilder.buildSExt(SrcExt, Src); + } else { + assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op"); + MIRBuilder.buildZExt(SrcExt, Src); + } + + MIRBuilder.buildInstr(MI.getOpcode()) + .addDef(MI.getOperand(0).getReg()) + .addUse(SrcExt); + + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_INSERT: { + if (TypeIdx != 0) + return UnableToLegalize; + + unsigned Src = MI.getOperand(1).getReg(); + unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); + MIRBuilder.buildAnyExt(SrcExt, Src); + + unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); + auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(), + MI.getOperand(3).getImm()); + for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) { + MIB.addReg(MI.getOperand(OpNum).getReg()); + MIB.addImm(MI.getOperand(OpNum + 1).getImm()); + } + + MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_LOAD: { assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == WideTy.getSizeInBits() && @@ -231,12 +455,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_STORE: { - assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == - WideTy.getSizeInBits() && - "illegal to increase number of bytes modified by a store"); + if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) || + WideTy != LLT::scalar(8)) + return UnableToLegalize; + + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + auto Content = TLI.getBooleanContents(false, false); + + unsigned ExtOp = TargetOpcode::G_ANYEXT; + if (Content == TargetLoweringBase::ZeroOrOneBooleanContent) + ExtOp = TargetOpcode::G_ZEXT; + else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent) + ExtOp = TargetOpcode::G_SEXT; + else + ExtOp = TargetOpcode::G_ANYEXT; unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); - MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg()); + MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse( + MI.getOperand(0).getReg()); MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(), **MI.memoperands_begin()); MI.eraseFromParent(); @@ -315,6 +551,83 @@ 
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_SMULO: + case TargetOpcode::G_UMULO: { + // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the + // result. + unsigned Res = MI.getOperand(0).getReg(); + unsigned Overflow = MI.getOperand(1).getReg(); + unsigned LHS = MI.getOperand(2).getReg(); + unsigned RHS = MI.getOperand(3).getReg(); + + MIRBuilder.buildMul(Res, LHS, RHS); + + unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO + ? TargetOpcode::G_SMULH + : TargetOpcode::G_UMULH; + + unsigned HiPart = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(Opcode) + .addDef(HiPart) + .addUse(LHS) + .addUse(RHS); + + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildConstant(Zero, 0); + MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FNEG: { + // TODO: Handle vector types once we are able to + // represent them. + if (Ty.isVector()) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + Type *ZeroTy; + LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + switch (Ty.getSizeInBits()) { + case 16: + ZeroTy = Type::getHalfTy(Ctx); + break; + case 32: + ZeroTy = Type::getFloatTy(Ctx); + break; + case 64: + ZeroTy = Type::getDoubleTy(Ctx); + break; + default: + llvm_unreachable("unexpected floating-point type"); + } + ConstantFP &ZeroForNegation = + *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy)); + unsigned Zero = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildFConstant(Zero, ZeroForNegation); + MIRBuilder.buildInstr(TargetOpcode::G_FSUB) + .addDef(Res) + .addUse(Zero) + .addUse(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return Legalized; + } + case TargetOpcode::G_FSUB: { + // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). + // First, check if G_FNEG is marked as Lower. If so, we may + // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
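+    // (G_FNEG is lowered above as G_FSUB(-0.0, x), so lowering each of the
+    // two in terms of the other would recurse forever.)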
+ if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower) + return UnableToLegalize; + unsigned Res = MI.getOperand(0).getReg(); + unsigned LHS = MI.getOperand(1).getReg(); + unsigned RHS = MI.getOperand(2).getReg(); + unsigned Neg = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS); + MIRBuilder.buildInstr(TargetOpcode::G_FADD) + .addDef(Res) + .addUse(LHS) + .addUse(Neg); + MI.eraseFromParent(); + return Legalized; + } } } @@ -335,7 +648,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, MIRBuilder.setInstr(MI); SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs; - SmallVector<uint64_t, 2> Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); @@ -343,10 +655,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]); DstRegs.push_back(DstReg); - Indexes.push_back(i * NarrowSize); } - MIRBuilder.buildSequence(DstReg, DstRegs, Indexes); + MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index e49662075ed5..4d4591042296 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -41,6 +41,8 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { DefaultActions[TargetOpcode::G_STORE] = NarrowScalar; DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; + DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; + DefaultActions[TargetOpcode::G_FNEG] = Lower; } void LegalizerInfo::computeTables() { @@ -71,28 +73,36 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // These *have* to be implemented for now, they're the fundamental basis of // how everything else is transformed. - // Nothing is going to go well with types that aren't a power of 2 yet, so - // don't even try because we might make things worse. - if (!isPowerOf2_64(Aspect.Type.getSizeInBits())) - return std::make_pair(Unsupported, LLT()); - // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). if (Aspect.Opcode == TargetOpcode::G_SEQUENCE || - Aspect.Opcode == TargetOpcode::G_EXTRACT) + Aspect.Opcode == TargetOpcode::G_EXTRACT || + Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || + Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); + LLT Ty = Aspect.Type; LegalizeAction Action = findInActions(Aspect); + // LegalizerHelper is not able to handle non-power-of-2 types right now, so do + // not try to legalize them unless they are marked as Legal or Custom. + // FIXME: This is a temporary hack until the general non-power-of-2 + // legalization works. 
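+  // E.g. an s3 or s88 operation is reported as Unsupported here unless the
+  // target explicitly marked it Legal or Custom.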
+ if (!isPowerOf2_64(Ty.getSizeInBits()) && + !(Action == Legal || Action == Custom)) + return std::make_pair(Unsupported, LLT()); + if (Action != NotFound) return findLegalAction(Aspect, Action); unsigned Opcode = Aspect.Opcode; - LLT Ty = Aspect.Type; if (!Ty.isVector()) { auto DefaultAction = DefaultActions.find(Aspect.Opcode); if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal) return std::make_pair(Legal, Ty); + if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower) + return std::make_pair(Lower, Ty); + if (DefaultAction == DefaultActions.end() || DefaultAction->second != NarrowScalar) return std::make_pair(Unsupported, LLT()); @@ -152,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI, return std::get<0>(getAction(MI, MRI)) == Legal; } -LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, +Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const { switch(Action) { default: @@ -160,23 +170,30 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect, case Legal: case Lower: case Libcall: + case Custom: return Aspect.Type; case NarrowScalar: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); + [](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); } case WidenScalar: { - return findLegalType(Aspect, [&](LLT Ty) -> LLT { + return findLegalType(Aspect, [](LLT Ty) -> LLT { return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfElements(); }); + [](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); + [](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } + +bool LegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + return false; +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp new file mode 100644 index 000000000000..bdca732b4e33 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -0,0 +1,125 @@ +//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the Localizer class. +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "localizer" + +using namespace llvm; + +char Localizer::ID = 0; +INITIALIZE_PASS(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", false, + false); + +Localizer::Localizer() : MachineFunctionPass(ID) { + initializeLocalizerPass(*PassRegistry::getPassRegistry()); +} + +void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); } + +bool Localizer::shouldLocalize(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Constants-like instructions should be close to their users. + // We don't want long live-ranges for them. 
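+  // Rematerializing a constant next to each use keeps its live range short,
+  // much as SelectionDAG gets for free from its per-block scope.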
+  case TargetOpcode::G_CONSTANT:
+  case TargetOpcode::G_FCONSTANT:
+  case TargetOpcode::G_FRAME_INDEX:
+    return true;
+  }
+}
+
+bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
+                           MachineBasicBlock *&InsertMBB) {
+  MachineInstr &MIUse = *MOUse.getParent();
+  InsertMBB = MIUse.getParent();
+  if (MIUse.isPHI())
+    InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB();
+  return InsertMBB == Def.getParent();
+}
+
+bool Localizer::runOnMachineFunction(MachineFunction &MF) {
+  // If the ISel pipeline failed, do not bother running that pass.
+  if (MF.getProperties().hasProperty(
+          MachineFunctionProperties::Property::FailedISel))
+    return false;
+
+  DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
+
+  init(MF);
+
+  bool Changed = false;
+  // Keep track of the instructions we localized.
+  // We won't need to process them if we see them later in the CFG.
+  SmallPtrSet<MachineInstr *, 16> LocalizedInstrs;
+  DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
+  // TODO: Do bottom up traversal.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
+        continue;
+      DEBUG(dbgs() << "Should localize: " << MI);
+      assert(MI.getDesc().getNumDefs() == 1 &&
+             "More than one definition not supported yet");
+      unsigned Reg = MI.getOperand(0).getReg();
+      // Check if all the users of MI are local.
+      // We are going to invalidate the list of use operands, so we
+      // can't use a range iterator.
+      for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
+           MOIt != MOItEnd;) {
+        MachineOperand &MOUse = *MOIt++;
+        // Check if the use is already local.
+        MachineBasicBlock *InsertMBB;
+        DEBUG(MachineInstr &MIUse = *MOUse.getParent();
+              dbgs() << "Checking use: " << MIUse
+                     << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
+        if (isLocalUse(MOUse, MI, InsertMBB))
+          continue;
+        DEBUG(dbgs() << "Fixing non-local use\n");
+        Changed = true;
+        auto MBBAndReg = std::make_pair(InsertMBB, Reg);
+        auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
+        if (NewVRegIt == MBBWithLocalDef.end()) {
+          // Create the localized instruction.
+          MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
+          LocalizedInstrs.insert(LocalizedMI);
+          // Move it to the right place.
+          MachineInstr &MIUse = *MOUse.getParent();
+          if (MIUse.getParent() == InsertMBB)
+            InsertMBB->insert(MIUse, LocalizedMI);
+          else
+            InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI);
+
+          // Set a new register for the definition.
+          unsigned NewReg =
+              MRI->createGenericVirtualRegister(MRI->getType(Reg));
+          MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
+          LocalizedMI->getOperand(0).setReg(NewReg);
+          NewVRegIt =
+              MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
+          DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
+        }
+        DEBUG(dbgs() << "Update use with: " << PrintReg(NewVRegIt->second)
+                     << '\n');
+        // Update the user reg.
+ MOUse.setReg(NewVRegIt->second); + } + } + } + return Changed; +} diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index c04f6e4ae897..54ef7e5c5a1b 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB, void MachineIRBuilder::recordInsertions( std::function<void(MachineInstr *)> Inserted) { - InsertedInstr = Inserted; + InsertedInstr = std::move(Inserted); } void MachineIRBuilder::stopRecordingInsertions() { @@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) { return MIB; } +MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue( + unsigned Reg, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addReg(0, RegState::Debug) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue( + unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addReg(Reg, RegState::Debug) + .addImm(Offset) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + return buildInstr(TargetOpcode::DBG_VALUE) + .addFrameIndex(FI) + .addImm(0) + .addMetadata(Variable) + .addMetadata(Expr); +} + +MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, + unsigned Offset, + const MDNode *Variable, + const MDNode *Expr) { + assert(isa<DILocalVariable>(Variable) && "not a variable"); + assert(cast<DIExpression>(Expr)->isValid() && "not an expression"); + assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + auto MIB = buildInstr(TargetOpcode::DBG_VALUE); + if (auto *CI = dyn_cast<ConstantInt>(&C)) { + if (CI->getBitWidth() > 64) + MIB.addCImm(CI); + else + MIB.addImm(CI->getZExtValue()); + } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) { + MIB.addFPImm(CFP); + } else { + // Insert %noreg if we didn't find a usable constant and had to drop it. 
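+    // A DBG_VALUE of %noreg marks the variable as unavailable at this point,
+    // which is better than describing a stale or wrong location.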
+    MIB.addReg(0U);
+  }
+
+  return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {
   assert(MRI->getType(Res).isPointer() && "invalid operand type");
   return buildInstr(TargetOpcode::G_FRAME_INDEX)
@@ -126,6 +191,17 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
       .addUse(Op1);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
+                                                   uint32_t NumBits) {
+  assert(MRI->getType(Res).isPointer() &&
+         MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
+
+  return buildInstr(TargetOpcode::G_PTR_MASK)
+      .addDef(Res)
+      .addUse(Op0)
+      .addImm(NumBits);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
                                                unsigned Op1) {
   assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
@@ -152,10 +228,27 @@ MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,
       .addUse(Op1);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0,
+                                               unsigned Op1) {
+  assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
+         "invalid operand type");
+  assert(MRI->getType(Res) == MRI->getType(Op0) &&
+         MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
+
+  return buildInstr(TargetOpcode::G_AND)
+      .addDef(Res)
+      .addUse(Op0)
+      .addUse(Op1);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
   return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) {
+  return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) {
   return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
 }
@@ -262,34 +355,56 @@ MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,
   return buildInstr(Opcode).addDef(Res).addUse(Op);
 }
 
-MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results,
-                                                   ArrayRef<uint64_t> Indices,
-                                                   unsigned Src) {
-#ifndef NDEBUG
-  assert(Results.size() == Indices.size() && "inconsistent number of regs");
-  assert(!Results.empty() && "invalid trivial extract");
-  assert(std::is_sorted(Indices.begin(), Indices.end()) &&
-         "extract offsets must be in ascending order");
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res,
+                                                       unsigned Op) {
+  unsigned Opcode = TargetOpcode::COPY;
+  if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
+    Opcode = TargetOpcode::G_ZEXT;
+  else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
+    Opcode = TargetOpcode::G_TRUNC;
 
-  assert(MRI->getType(Src).isValid() && "invalid operand type");
-  for (auto Res : Results)
-    assert(MRI->getType(Res).isValid() && "invalid operand type");
-#endif
+  return buildInstr(Opcode).addDef(Res).addUse(Op);
+}
 
-  auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT));
-  for (auto Res : Results)
-    MIB.addDef(Res);
-  MIB.addUse(Src);
+MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
+  LLT SrcTy = MRI->getType(Src);
+  LLT DstTy = MRI->getType(Dst);
+  if (SrcTy == DstTy)
+    return buildCopy(Dst, Src);
+
+  unsigned Opcode;
+  if (SrcTy.isPointer() && DstTy.isScalar())
+    Opcode = TargetOpcode::G_PTRTOINT;
+  else if (DstTy.isPointer() && SrcTy.isScalar())
+    Opcode = TargetOpcode::G_INTTOPTR;
+  else {
+    assert(!SrcTy.isPointer() && !DstTy.isPointer() && "no G_ADDRCAST yet");
+    Opcode = TargetOpcode::G_BITCAST;
+  }
 
-  for 
(auto Idx : Indices) - MIB.addImm(Idx); + return buildInstr(Opcode).addDef(Dst).addUse(Src); +} - getMBB().insert(getInsertPt(), MIB); - if (InsertedInstr) - InsertedInstr(MIB); +MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, + uint64_t Index) { +#ifndef NDEBUG + assert(MRI->getType(Src).isValid() && "invalid operand type"); + assert(MRI->getType(Res).isValid() && "invalid operand type"); + assert(Index + MRI->getType(Res).getSizeInBits() <= + MRI->getType(Src).getSizeInBits() && + "extracting off end of register"); +#endif - return MIB; + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) { + assert(Index == 0 && "insertion past the end of a register"); + return buildCast(Res, Src); + } + + return buildInstr(TargetOpcode::G_EXTRACT) + .addDef(Res) + .addUse(Src) + .addImm(Index); } MachineInstrBuilder @@ -316,6 +431,64 @@ MachineIRBuilder::buildSequence(unsigned Res, return MIB; } +MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { + return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res); +} + +MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, + ArrayRef<unsigned> Ops) { + +#ifndef NDEBUG + assert(!Ops.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Ops[0]); + for (auto Reg : Ops) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() == + MRI->getType(Res).getSizeInBits() && + "input operands do not cover output register"); +#endif + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); + MIB.addDef(Res); + for (unsigned i = 0; i < Ops.size(); ++i) + MIB.addUse(Ops[i]); + return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, + unsigned Op) { + +#ifndef NDEBUG + assert(!Res.empty() && "invalid trivial sequence"); + LLT Ty = MRI->getType(Res[0]); + for (auto Reg : Res) + assert(MRI->getType(Reg) == Ty && "type mismatch in input list"); + assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() == + MRI->getType(Op).getSizeInBits() && + "input operands do not cover output register"); +#endif + + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES); + for (unsigned i = 0; i < Res.size(); ++i) + MIB.addDef(Res[i]); + MIB.addUse(Op); + return MIB; +} + +MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, + unsigned Op, unsigned Index) { + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) { + assert(Index == 0 && "insertion past the end of a register"); + return buildCast(Res, Op); + } + + return buildInstr(TargetOpcode::G_INSERT) + .addDef(Res) + .addUse(Src) + .addUse(Op) + .addImm(Index); +} + MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, unsigned Res, bool HasSideEffects) { @@ -395,9 +568,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, if (ResTy.isScalar() || ResTy.isPointer()) assert(MRI->getType(Tst).isScalar() && "type mismatch"); else - assert(MRI->getType(Tst).isVector() && - MRI->getType(Tst).getNumElements() == - MRI->getType(Op0).getNumElements() && + assert((MRI->getType(Tst).isScalar() || + (MRI->getType(Tst).isVector() && + MRI->getType(Tst).getNumElements() == + MRI->getType(Op0).getNumElements())) && "type mismatch"); #endif @@ -408,6 +582,47 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst, .addUse(Op1); } +MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res, + unsigned 
Val, + unsigned Elt, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT EltTy = MRI->getType(Elt); + LLT IdxTy = MRI->getType(Idx); + assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type"); + assert(IdxTy.isScalar() && "invalid operand type"); + assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch"); + assert(ResTy.getElementType() == EltTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Elt) + .addUse(Idx); +} + +MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res, + unsigned Val, + unsigned Idx) { +#ifndef NDEBUG + LLT ResTy = MRI->getType(Res); + LLT ValTy = MRI->getType(Val); + LLT IdxTy = MRI->getType(Idx); + assert(ValTy.isVector() && "invalid operand type"); + assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type"); + assert(IdxTy.isScalar() && "invalid operand type"); + assert(ValTy.getElementType() == ResTy && "type mismatch"); +#endif + + return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT) + .addDef(Res) + .addUse(Val) + .addUse(Idx); +} + void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src, bool IsExtend) { #ifndef NDEBUG diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index cc026ef27296..2eb3cdee694d 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -71,6 +72,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); + MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -202,30 +204,28 @@ uint64_t RegBankSelect::getRepairCost( // TODO: use a dedicated constant for ImpossibleCost. if (Cost != UINT_MAX) return Cost; - assert(!TPC->isGlobalISelAbortEnabled() && - "Legalization not available yet"); // Return the legalization cost of that repairing. 
} - assert(!TPC->isGlobalISelAbortEnabled() && - "Complex repairing not implemented yet"); return UINT_MAX; } -RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( +const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings, SmallVectorImpl<RepairingPlacement> &RepairPts) { assert(!PossibleMappings.empty() && "Do not know how to map this instruction"); - RegisterBankInfo::InstructionMapping *BestMapping = nullptr; + const RegisterBankInfo::InstructionMapping *BestMapping = nullptr; MappingCost Cost = MappingCost::ImpossibleCost(); SmallVector<RepairingPlacement, 4> LocalRepairPts; - for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) { - MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost); + for (const RegisterBankInfo::InstructionMapping *CurMapping : + PossibleMappings) { + MappingCost CurCost = + computeMapping(MI, *CurMapping, LocalRepairPts, &Cost); if (CurCost < Cost) { DEBUG(dbgs() << "New best: " << CurCost << '\n'); Cost = CurCost; - BestMapping = &CurMapping; + BestMapping = CurMapping; RepairPts.clear(); for (RepairingPlacement &RepairPt : LocalRepairPts) RepairPts.emplace_back(std::move(RepairPt)); @@ -235,7 +235,7 @@ RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( // If none of the mapping worked that means they are all impossible. // Thus, pick the first one and set an impossible repairing point. // It will trigger the failed isel mode. - BestMapping = &(*PossibleMappings.begin()); + BestMapping = *PossibleMappings.begin(); RepairPts.emplace_back( RepairingPlacement(MI, 0, *TRI, *this, RepairingPlacement::Impossible)); } else @@ -448,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Sums up the repairing cost of MO at each insertion point. uint64_t RepairCost = getRepairCost(MO, ValMapping); + + // This is an impossible to repair cost. + if (RepairCost == UINT_MAX) + continue; + // Bias used for splitting: 5%. const uint64_t PercentageForBias = 5; uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100; @@ -541,10 +546,10 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { // Remember the repairing placement for all the operands. SmallVector<RepairingPlacement, 4> RepairPts; - RegisterBankInfo::InstructionMapping BestMapping; + const RegisterBankInfo::InstructionMapping *BestMapping; if (OptMode == RegBankSelect::Mode::Fast) { - BestMapping = RBI->getInstrMapping(MI); - MappingCost DefaultCost = computeMapping(MI, BestMapping, RepairPts); + BestMapping = &RBI->getInstrMapping(MI); + MappingCost DefaultCost = computeMapping(MI, *BestMapping, RepairPts); (void)DefaultCost; if (DefaultCost == MappingCost::ImpossibleCost()) return false; @@ -553,16 +558,16 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { RBI->getInstrPossibleMappings(MI); if (PossibleMappings.empty()) return false; - BestMapping = std::move(findBestMapping(MI, PossibleMappings, RepairPts)); + BestMapping = &findBestMapping(MI, PossibleMappings, RepairPts); } // Make sure the mapping is valid for MI. - assert(BestMapping.verify(MI) && "Invalid instruction mapping"); + assert(BestMapping->verify(MI) && "Invalid instruction mapping"); - DEBUG(dbgs() << "Best Mapping: " << BestMapping << '\n'); + DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n'); // After this call, MI may not be valid anymore. // Do not use it. 
- return applyMapping(MI, BestMapping, RepairPts); + return applyMapping(MI, *BestMapping, RepairPts); } bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { @@ -585,18 +590,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { // LegalizerInfo as it's currently in the separate GlobalISel library. const MachineRegisterInfo &MRI = MF.getRegInfo(); if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) { - if (!TPC->isGlobalISelAbortEnabled()) { - MF.getProperties().set( - MachineFunctionProperties::Property::FailedISel); - return false; - } - std::string ErrStorage; - raw_string_ostream Err(ErrStorage); - Err << "Instruction is not legal: " << MI << '\n'; - report_fatal_error(Err.str()); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "instruction is not legal", MI); + return false; } } } @@ -622,9 +621,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { continue; if (!assignInstr(MI)) { - if (TPC->isGlobalISelAbortEnabled()) - report_fatal_error("Unable to map instruction"); - MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect", + "unable to map instruction", MI); return false; } } @@ -968,10 +966,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const { LocalFreq == Cost.LocalFreq; } -void RegBankSelect::MappingCost::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegBankSelect::MappingCost::print(raw_ostream &OS) const { if (*this == ImpossibleCost()) { diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp index 49d676f11da6..83b21e637097 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,10 +19,11 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses) +RegisterBank::RegisterBank( + unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses, unsigned NumRegClasses) : ID(ID), Name(Name), Size(Size) { - ContainedRegClasses.resize(200); + ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } @@ -47,7 +48,7 @@ bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { // Verify that the Size of the register bank is big enough to cover // all the register classes it covers. 
-    assert((getSize() >= SubRC.getSize() * 8) &&
+    assert(getSize() >= TRI.getRegSizeInBits(SubRC) &&
            "Size is not big enough for all the subclasses!");
     assert(covers(SubRC) && "Not all subclasses are covered");
   }
@@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
   return &OtherRB == this;
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
   print(dbgs(), /* IsForDebug */ true, TRI);
 }
+#endif
 
 void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
                          const TargetRegisterInfo *TRI) const {
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index da5ab0b9fb7b..a841902feed1 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -45,6 +45,10 @@ STATISTIC(NumOperandsMappingsCreated,
           "Number of operands mappings dynamically created");
 STATISTIC(NumOperandsMappingsAccessed,
           "Number of operands mappings dynamically accessed");
+STATISTIC(NumInstructionMappingsCreated,
+          "Number of instruction mappings dynamically created");
+STATISTIC(NumInstructionMappingsAccessed,
+          "Number of instruction mappings dynamically accessed");
 
 const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX;
 const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1;
@@ -63,13 +67,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks,
 #endif // NDEBUG
 }
 
-RegisterBankInfo::~RegisterBankInfo() {
-  for (auto It : MapOfPartialMappings)
-    delete It.second;
-  for (auto It : MapOfValueMappings)
-    delete It.second;
-}
-
 bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
 #ifndef NDEBUG
   for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
@@ -133,19 +130,27 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
   return &RC;
 }
 
-RegisterBankInfo::InstructionMapping
+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for mapping because
+/// they don't have actual register constraints. Moreover,
+/// they sometimes have register classes assigned and we can
+/// just use that instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+  return MI.isCopy() || MI.isPHI() ||
+         MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
+const RegisterBankInfo::InstructionMapping &
 RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
   // For copies we want to walk over the operands and try to find one
   // that has a register bank since the instruction itself will not get
   // us any constraint.
-  bool isCopyLike = MI.isCopy() || MI.isPHI();
+  bool IsCopyLike = isCopyLike(MI);
   // For copy like instruction, only the mapping of the definition
   // is important. The rest is not constrained.
-  unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands();
+  unsigned NumOperandsForMapping = IsCopyLike ?
1 : MI.getNumOperands(); - RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1, - /*OperandsMapping*/ nullptr, - NumOperandsForMapping); const MachineFunction &MF = *MI.getParent()->getParent(); const TargetSubtargetInfo &STI = MF.getSubtarget(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); @@ -175,7 +180,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // For copy-like instruction, we want to reuse the register bank // that is already set on Reg, if any, since those instructions do // not have any constraints. - const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr; + const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr; if (!CurRegBank) { // If this is a target specific instruction, we can deduce // the register bank from the encoding constraints. @@ -184,15 +189,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { // All our attempts failed, give up. CompleteMapping = false; - if (!isCopyLike) + if (!IsCopyLike) // MI does not carry enough information to guess the mapping. - return InstructionMapping(); + return getInvalidInstructionMapping(); continue; } } const ValueMapping *ValMapping = &getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank); - if (isCopyLike) { + if (IsCopyLike) { OperandsMapping[0] = ValMapping; CompleteMapping = true; break; @@ -200,13 +205,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const { OperandsMapping[OpIdx] = ValMapping; } - if (isCopyLike && !CompleteMapping) + if (IsCopyLike && !CompleteMapping) // No way to deduce the type from what we have. - return InstructionMapping(); + return getInvalidInstructionMapping(); assert(CompleteMapping && "Setting an uncomplete mapping"); - Mapping.setOperandsMapping(getOperandsMapping(OperandsMapping)); - return Mapping; + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping(OperandsMapping), + NumOperandsForMapping); } /// Hashing function for PartialMapping. @@ -234,8 +241,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length, ++NumPartialMappingsCreated; - const PartialMapping *&PartMapping = MapOfPartialMappings[Hash]; - PartMapping = new PartialMapping{StartIdx, Length, RegBank}; + auto &PartMapping = MapOfPartialMappings[Hash]; + PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank); return *PartMapping; } @@ -268,8 +275,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown, ++NumValueMappingsCreated; - const ValueMapping *&ValMapping = MapOfValueMappings[Hash]; - ValMapping = new ValueMapping{BreakDown, NumBreakDowns}; + auto &ValMapping = MapOfValueMappings[Hash]; + ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns); return *ValMapping; } @@ -282,9 +289,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // The addresses of the value mapping are unique. // Therefore, we can use them directly to hash the operand mapping. hash_code Hash = hash_combine_range(Begin, End); - const auto &It = MapOfOperandsMappings.find(Hash); - if (It != MapOfOperandsMappings.end()) - return It->second; + auto &Res = MapOfOperandsMappings[Hash]; + if (Res) + return Res.get(); ++NumOperandsMappingsCreated; @@ -293,8 +300,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { // mapping, because we use the pointer of the ValueMapping // to hash and we expect them to uniquely identify an instance // of value mapping. 
- ValueMapping *&Res = MapOfOperandsMappings[Hash]; - Res = new ValueMapping[std::distance(Begin, End)]; + Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End)); unsigned Idx = 0; for (Iterator It = Begin; It != End; ++It, ++Idx) { const ValueMapping *ValMap = *It; @@ -302,7 +308,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const { continue; Res[Idx] = *ValMap; } - return Res; + return Res.get(); } const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( @@ -317,9 +323,44 @@ const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping( return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end()); } -RegisterBankInfo::InstructionMapping +static hash_code +hashInstructionMapping(unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) { + return hash_combine(ID, Cost, OperandsMapping, NumOperands); +} + +const RegisterBankInfo::InstructionMapping & +RegisterBankInfo::getInstructionMappingImpl( + bool IsInvalid, unsigned ID, unsigned Cost, + const RegisterBankInfo::ValueMapping *OperandsMapping, + unsigned NumOperands) const { + assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 && + OperandsMapping == nullptr && NumOperands == 0) || + !IsInvalid) && + "Mismatch argument for invalid input"); + ++NumInstructionMappingsAccessed; + + hash_code Hash = + hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands); + const auto &It = MapOfInstructionMappings.find(Hash); + if (It != MapOfInstructionMappings.end()) + return *It->second; + + ++NumInstructionMappingsCreated; + + auto &InstrMapping = MapOfInstructionMappings[Hash]; + if (IsInvalid) + InstrMapping = llvm::make_unique<InstructionMapping>(); + else + InstrMapping = llvm::make_unique<InstructionMapping>( + ID, Cost, OperandsMapping, NumOperands); + return *InstrMapping; +} + +const RegisterBankInfo::InstructionMapping & RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI); + const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) return Mapping; llvm_unreachable("The target must implement this"); @@ -329,14 +370,14 @@ RegisterBankInfo::InstructionMappings RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const { InstructionMappings PossibleMappings; // Put the default mapping first. - PossibleMappings.push_back(getInstrMapping(MI)); + PossibleMappings.push_back(&getInstrMapping(MI)); // Then the alternative mapping, if any. 
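// [Editor's aside -- illustrative sketch, not part of the patch above. The new
// getInstructionMappingImpl() uniques InstructionMapping objects in a map
// keyed by a hash and owned through unique_ptr, so callers can hold a plain
// 'const InstructionMapping &'. A standalone rendition of that hash-consing
// pattern (all names here are invented for illustration):
#include <cstddef>
#include <functional>
#include <memory>
#include <unordered_map>

struct Mapping {
  unsigned ID, Cost;
  Mapping(unsigned ID, unsigned Cost) : ID(ID), Cost(Cost) {}
};

class MappingCache {
  // Owns every mapping ever requested; returned references stay valid for
  // the lifetime of the cache, which is what makes caching by reference safe.
  mutable std::unordered_map<std::size_t, std::unique_ptr<Mapping>> Map;

public:
  const Mapping &get(unsigned ID, unsigned Cost) const {
    // Plays the role of hash_combine in the patch. As in the patch, entries
    // are keyed purely by the hash, so it must cover every field that
    // distinguishes two mappings.
    std::size_t Hash = std::hash<unsigned>()(ID) * 31u + Cost;
    auto &Slot = Map[Hash]; // default-constructs an empty unique_ptr
    if (!Slot)              // create the object only on first request
      Slot = std::make_unique<Mapping>(ID, Cost);
    return *Slot;
  }
};
// End of aside.]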
   InstructionMappings AltMappings = getInstrAlternativeMappings(MI);
-  for (InstructionMapping &AltMapping : AltMappings)
-    PossibleMappings.emplace_back(std::move(AltMapping));
+  for (const InstructionMapping *AltMapping : AltMappings)
+    PossibleMappings.push_back(AltMapping);
 #ifndef NDEBUG
-  for (const InstructionMapping &Mapping : PossibleMappings)
-    assert(Mapping.verify(MI) && "Mapping is invalid");
+  for (const InstructionMapping *Mapping : PossibleMappings)
+    assert(Mapping->verify(MI) && "Mapping is invalid");
 #endif
   return PossibleMappings;
 }
@@ -349,6 +390,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
 
 void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
   MachineInstr &MI = OpdMapper.getMI();
+  MachineRegisterInfo &MRI = OpdMapper.getMRI();
   DEBUG(dbgs() << "Applying default-like mapping\n");
   for (unsigned OpIdx = 0,
                 EndIdx = OpdMapper.getInstrMapping().getNumOperands();
@@ -359,6 +401,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
       DEBUG(dbgs() << " is not a register, nothing to be done\n");
       continue;
     }
+    if (!MO.getReg()) {
+      DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
+      continue;
+    }
+    assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
+               0 &&
+           "Invalid mapping");
     assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==
                1 &&
            "This mapping is too complex for this function");
@@ -368,9 +417,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
       DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
       continue;
     }
-    DEBUG(dbgs() << " changed, replace " << MO.getReg());
-    MO.setReg(*NewRegs.begin());
-    DEBUG(dbgs() << " with " << MO.getReg());
+    unsigned OrigReg = MO.getReg();
+    unsigned NewReg = *NewRegs.begin();
+    DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr));
+    MO.setReg(NewReg);
+    DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr));
+
+    // The OperandsMapper creates plain scalars, so we may have to fix that.
+    // Check if the types match and if not, fix that.
+    LLT OrigTy = MRI.getType(OrigReg);
+    LLT NewTy = MRI.getType(NewReg);
+    if (OrigTy != NewTy) {
+      assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() &&
+             "Types with different sizes cannot be handled by the default "
+             "mapping");
+      DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
+                   << OrigTy);
+      MRI.setType(NewReg, OrigTy);
+    }
+    DEBUG(dbgs() << '\n');
   }
 }
 
@@ -394,16 +459,18 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
     RC = MRI.getRegClass(Reg);
   }
   assert(RC && "Unable to deduce the register class");
-  return RC->getSize() * 8;
+  return TRI.getRegSizeInBits(*RC);
 }
 
 //------------------------------------------------------------------------------
 // Helper classes implementation.
//------------------------------------------------------------------------------ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif bool RegisterBankInfo::PartialMapping::verify() const { assert(RegBank && "Register bank not set"); @@ -451,10 +518,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const { OS << "#BreakDown: " << NumBreakDowns << " "; @@ -472,8 +541,7 @@ bool RegisterBankInfo::InstructionMapping::verify( // Check that all the register operands are properly mapped. // Check the constructor invariant. // For PHI, we only care about mapping the definition. - assert(NumOperands == - ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) && + assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) && "NumOperands must match, see constructor"); assert(MI.getParent() && MI.getParent()->getParent() && "MI must be connected to a MachineFunction"); @@ -503,10 +571,12 @@ bool RegisterBankInfo::InstructionMapping::verify( return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const { OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: "; @@ -576,6 +646,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) { for (unsigned &NewVReg : NewVRegsForOpIdx) { assert(PartMap != ValMapping.end() && "Out-of-bound access"); assert(NewVReg == 0 && "Register has already been created"); + // The new registers are always bound to scalar with the right size. + // The actual type has to be set when the target does the mapping + // of the instruction. + // The rationale is that this generic code cannot guess how the + // target plans to split the input type. 
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length)); MRI.setRegBank(NewVReg, *PartMap->RegBank); ++PartMap; @@ -619,10 +694,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx, return Res; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const { print(dbgs(), true); dbgs() << '\n'; } +#endif void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS, bool ForDebug) const { diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e50091833c26..254bdf10d804 100644 --- a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -11,10 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -43,3 +47,74 @@ unsigned llvm::constrainOperandRegClass( return Reg; } + +bool llvm::isTriviallyDead(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + // If we can move an instruction, we can remove it. Otherwise, it has + // a side-effect of some sort. + bool SawStore = false; + if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore)) + return false; + + // Instructions without side-effects are dead iff they only define dead vregs. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI.use_nodbg_empty(Reg)) + return false; + } + return true; +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + MachineOptimizationRemarkMissed &R) { + MF.getProperties().set(MachineFunctionProperties::Property::FailedISel); + + // Print the function name explicitly if we don't have a debug location (which + // makes the diagnostic less useful) or if we're going to emit a raw error. 
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled()) + R << (" (in function: " + MF.getName() + ")").str(); + + if (TPC.isGlobalISelAbortEnabled()) + report_fatal_error(R.getMsg()); + else + MORE.emit(R); +} + +void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, + MachineOptimizationRemarkEmitter &MORE, + const char *PassName, StringRef Msg, + const MachineInstr &MI) { + MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ", + MI.getDebugLoc(), MI.getParent()); + R << Msg << ": " << ore::MNV("Inst", MI); + reportGISelFailure(MF, TPC, MORE, R); +} + +Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (MI->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + + if (MI->getOperand(1).isImm()) + return MI->getOperand(1).getImm(); + + if (MI->getOperand(1).isCImm() && + MI->getOperand(1).getCImm()->getBitWidth() <= 64) + return MI->getOperand(1).getCImm()->getSExtValue(); + + return None; +} + +const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (TargetOpcode::G_FCONSTANT != MI->getOpcode()) + return nullptr; + return MI->getOperand(1).getFPImm(); +} diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp index 1ea534939948..23812a2a2344 100644 --- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp @@ -192,10 +192,7 @@ namespace { } // end anonymous namespace char GlobalMerge::ID = 0; -INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables", - false, false) -INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables", - false, false) +INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false) bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp index b9f3d86eabd8..1c33f3b6800e 100644 --- a/contrib/llvm/lib/CodeGen/IfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp @@ -39,7 +39,7 @@ using namespace llvm; -#define DEBUG_TYPE "ifcvt" +#define DEBUG_TYPE "if-converter" // Hidden options for help debugging. static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); @@ -316,9 +316,9 @@ namespace { char &llvm::IfConverterID = IfConverter::ID; -INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF))) @@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, return TExit && TExit == FalseBBI.BB; } -/// Shrink the provided inclusive range by one instruction. -/// If the range was one instruction (\p It == \p Begin), It is not modified, -/// but \p Empty is set to true. 
-static inline void shrinkInclusiveRange( - MachineBasicBlock::iterator &Begin, - MachineBasicBlock::iterator &It, - bool &Empty) { - if (It == Begin) - Empty = true; - else - It--; -} - /// Count duplicated instructions and move the iterators to show where they /// are. /// @param TIB True Iterator Begin @@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions( while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count. TIB = skipDebugInstructionsForward(TIB, TIE); - if(TIB == TIE) - break; FIB = skipDebugInstructionsForward(FIB, FIE); - if(FIB == FIE) + if (TIB == TIE || FIB == FIE) break; if (!TIB->isIdenticalTo(*FIB)) break; @@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions( if (TIB == TIE || FIB == FIE) return true; // Now, in preparation for counting duplicate instructions at the ends of the - // blocks, move the end iterators up past any branch instructions. - --TIE; - --FIE; - - // After this point TIB and TIE define an inclusive range, which means that - // TIB == TIE is true when there is one more instruction to consider, not at - // the end. Because we may not be able to go before TIB, we need a flag to - // indicate a completely empty range. - bool TEmpty = false, FEmpty = false; - - // Upon exit TIE and FIE will both point at the last non-shared instruction. - // They need to be moved forward to point past the last non-shared - // instruction if the range they delimit is non-empty. - auto IncrementEndIteratorsOnExit = make_scope_exit([&]() { - if (!TEmpty) - ++TIE; - if (!FEmpty) - ++FIE; - }); + // blocks, switch to reverse_iterators. Note that getReverse() returns an + // iterator that points to the same instruction, unlike std::reverse_iterator. + // We have to do our own shifting so that we get the same range. + MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse()); + MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse()); + const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse()); + const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse()); if (!TBB.succ_empty() || !FBB.succ_empty()) { if (SkipUnconditionalBranches) { - while (!TEmpty && TIE->isUnconditionalBranch()) - shrinkInclusiveRange(TIB, TIE, TEmpty); - while (!FEmpty && FIE->isUnconditionalBranch()) - shrinkInclusiveRange(FIB, FIE, FEmpty); + while (RTIE != RTIB && RTIE->isUnconditionalBranch()) + ++RTIE; + while (RFIE != RFIB && RFIE->isUnconditionalBranch()) + ++RFIE; } } - // If Dups1 includes all of a block, then don't count duplicate - // instructions at the end of the blocks. - if (TEmpty || FEmpty) - return true; - // Count duplicate instructions at the ends of the blocks. - while (!TEmpty && !FEmpty) { + while (RTIE != RTIB && RFIE != RFIB) { // Skip dbg_value instructions. These do not count. - TIE = skipDebugInstructionsBackward(TIE, TIB); - FIE = skipDebugInstructionsBackward(FIE, FIB); - TEmpty = TIE == TIB && TIE->isDebugValue(); - FEmpty = FIE == FIB && FIE->isDebugValue(); - if (TEmpty || FEmpty) + // Note that these are reverse iterators going forward. + RTIE = skipDebugInstructionsForward(RTIE, RTIB); + RFIE = skipDebugInstructionsForward(RFIE, RFIB); + if (RTIE == RTIB || RFIE == RFIB) break; - if (!TIE->isIdenticalTo(*FIE)) + if (!RTIE->isIdenticalTo(*RFIE)) break; // We have to verify that any branch instructions are the same, and then we // don't count them toward the # of duplicate instructions. 
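// [Editor's aside -- illustrative sketch, not part of the patch above. The
// rewritten loop relies on getReverse(), which yields a reverse iterator
// designating the *same* instruction, while std::reverse_iterator designates
// the element *before* the iterator it is built from; hence the std::next()
// shifts when converting the exclusive end iterators TIE/FIE. A standalone
// illustration of that off-by-one:
#include <cassert>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> V = {10, 20, 30};
  auto It = V.begin() + 1; // points at 20
  // A std::reverse_iterator built from It designates the previous element.
  std::reverse_iterator<std::vector<int>::iterator> R(It);
  assert(*R == 10);
  // To get a reverse iterator designating the same element as It (what
  // MachineBasicBlock::iterator::getReverse() does), build from next(It).
  std::reverse_iterator<std::vector<int>::iterator> Same(std::next(It));
  assert(*Same == 20);
  return 0;
}
// End of aside.]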
- if (!TIE->isBranch()) + if (!RTIE->isBranch()) ++Dups2; - shrinkInclusiveRange(TIB, TIE, TEmpty); - shrinkInclusiveRange(FIB, FIE, FEmpty); + ++RTIE; + ++RFIE; } + TIE = std::next(RTIE.getReverse()); + FIE = std::next(RFIE.getReverse()); return true; } @@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions( static void verifySameBranchInstructions( MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { - MachineBasicBlock::iterator B1 = MBB1->begin(); - MachineBasicBlock::iterator B2 = MBB2->begin(); - MachineBasicBlock::iterator E1 = std::prev(MBB1->end()); - MachineBasicBlock::iterator E2 = std::prev(MBB2->end()); - bool Empty1 = false, Empty2 = false; - while (!Empty1 && !Empty2) { - E1 = skipDebugInstructionsBackward(E1, B1); - E2 = skipDebugInstructionsBackward(E2, B2); - Empty1 = E1 == B1 && E1->isDebugValue(); - Empty2 = E2 == B2 && E2->isDebugValue(); - - if (Empty1 && Empty2) + const MachineBasicBlock::reverse_iterator B1 = MBB1->rend(); + const MachineBasicBlock::reverse_iterator B2 = MBB2->rend(); + MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin(); + MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin(); + while (E1 != B1 && E2 != B2) { + skipDebugInstructionsForward(E1, B1); + skipDebugInstructionsForward(E2, B2); + if (E1 == B1 && E2 == B2) break; - if (Empty1) { + if (E1 == B1) { assert(!E2->isBranch() && "Branch mis-match, one block is empty."); break; } - if (Empty2) { + if (E2 == B2) { assert(!E1->isBranch() && "Branch mis-match, one block is empty."); break; } @@ -769,8 +734,8 @@ static void verifySameBranchInstructions( "Branch mis-match, branch instructions don't match."); else break; - shrinkInclusiveRange(B1, E1, Empty1); - shrinkInclusiveRange(B2, E2, Empty2); + ++E1; + ++E2; } } #endif @@ -1353,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) { return false; PI = I++; } - return true; + // Finally see if the last I is indeed a successor to PI. + return PI->isSuccessor(&*I); } /// Invalidate predecessor BB info so it would be re-analyzed to determine if it @@ -2183,7 +2149,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // unknown probabilities into known ones. // FIXME: This usage is too tricky and in the future we would like to // eliminate all unknown probabilities in MBB. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(), FromMBB.succ_end()); @@ -2263,7 +2230,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // Normalize the probabilities of ToBBI.BB's successors with all adjustment // we've done above. - ToBBI.BB->normalizeSuccProbs(); + if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable) + ToBBI.BB->normalizeSuccProbs(); ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end()); FromBBI.Predicate.clear(); diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 9588dfb72058..24e289dd4f1b 100644 --- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -22,6 +22,7 @@ // With the help of a runtime that understands the .fault_maps section, // faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs // a page fault. +// Store and LoadStore are also supported. 
 //
 //===----------------------------------------------------------------------===//
 
@@ -29,6 +30,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/FaultMaps.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass {
   const TargetRegisterInfo *TRI = nullptr;
   AliasAnalysis *AA = nullptr;
   MachineModuleInfo *MMI = nullptr;
+  MachineFrameInfo *MFI = nullptr;
 
   bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
                                  SmallVectorImpl<NullCheck> &NullCheckList);
-  MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
-                                   MachineBasicBlock *HandlerMBB);
+  MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                                    MachineBasicBlock *HandlerMBB);
   void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
 
-  /// Is \p MI a memory operation that can be used to implicitly null check the
-  /// value in \p PointerReg? \p PrevInsts is the set of instruction seen since
+  enum AliasResult {
+    AR_NoAlias,
+    AR_MayAlias,
+    AR_WillAliasEverything
+  };
+  /// Returns AR_NoAlias if the \p MI memory operation does not alias with
+  /// \p PrevMI, AR_MayAlias if they may alias, and AR_WillAliasEverything if
+  /// they may alias and any further memory operation may alias with \p PrevMI.
+  AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI);
+
+  enum SuitabilityResult {
+    SR_Suitable,
+    SR_Unsuitable,
+    SR_Impossible
+  };
+  /// Return SR_Suitable if \p MI is a memory operation that can be used to
+  /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+  /// \p MI cannot be used to null check, and SR_Impossible if there is no
+  /// point continuing the lookup because no further instruction will be
+  /// usable. \p PrevInsts is the set of instructions seen since
  /// the explicit null check on \p PointerReg.
-  bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
-                          ArrayRef<MachineInstr *> PrevInsts);
+  SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts);
 
   /// Return true if \p FaultingMI can be hoisted from after the the
   /// instructions in \p InstsSeenSoFar to before them.  Set \p Dependence to a
   /// non-null value if we also need to (and legally can) hoist a depedency.
-  bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg,
-                        ArrayRef<MachineInstr *> InstsSeenSoFar,
-                        MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+  bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+                    ArrayRef<MachineInstr *> InstsSeenSoFar,
+                    MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
 
 public:
   static char ID;
@@ -193,7 +214,7 @@ public:
 }
 
 bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
-  if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects())
+  if (MI->isCall() || MI->hasUnmodeledSideEffects())
     return false;
   auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
   (void)IsRegMask;
@@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,
 
       unsigned RegB = MOB.getReg();
 
-      if (TRI->regsOverlap(RegA, RegB))
+      if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
         return false;
     }
   }
@@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getRegInfo().getTargetRegisterInfo();
   MMI = &MF.getMMI();
+  MFI = &MF.getFrameInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   SmallVector<NullCheck, 16> NullCheckList;
@@ -283,36 +305,91 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
   return false;
 }
 
-bool ImplicitNullChecks::isSuitableMemoryOp(
-    MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) {
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
+                                        MachineInstr *PrevMI) {
+  // If it is not a memory access, skip the check.
+  if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+    return AR_NoAlias;
+  // Load-Load may alias
+  if (!(MI.mayStore() || PrevMI->mayStore()))
+    return AR_NoAlias;
+  // We lost the info, so conservatively assume they alias. If it was a store,
+  // there is no point continuing, because we won't be able to check against
+  // it further.
+  if (MI.memoperands_empty())
+    return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+  if (PrevMI->memoperands_empty())
+    return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+  for (MachineMemOperand *MMO1 : MI.memoperands()) {
+    // MMO1 should have a value because it comes from an operation we'd like
+    // to use as an implicit null check.
+    assert(MMO1->getValue() && "MMO1 should have a Value!");
+    for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+      if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+        if (PSV->mayAlias(MFI))
+          return AR_MayAlias;
+        continue;
+      }
+      llvm::AliasResult AAResult = AA->alias(
+          MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+                         MMO1->getAAInfo()),
+          MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+                         MMO2->getAAInfo()));
+      if (AAResult != NoAlias)
+        return AR_MayAlias;
+    }
+  }
+  return AR_NoAlias;
+}
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts) {
   int64_t Offset;
   unsigned BaseReg;
 
   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
       BaseReg != PointerReg)
-    return false;
-
-  // We want the load to be issued at a sane offset from PointerReg, so that
-  // if PointerReg is null then the load reliably page faults.
-  if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize))
-    return false;
-
-  // Finally, we need to make sure that the load instruction actually is
-  // loading from PointerReg, and there isn't some re-definition of PointerReg
-  // between the compare and the load.
+    return SR_Unsuitable;
+
+  // We want the mem access to be issued at a sane offset from PointerReg,
+  // so that if PointerReg is null then the access reliably page faults.
+  if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() &&
+        Offset < PageSize))
+    return SR_Unsuitable;
+
+  // Finally, we need to make sure that the access instruction actually is
+  // accessing from PointerReg, and there isn't some re-definition of PointerReg
+  // between the compare and the memory access.
+  // If PointerReg has been redefined before, then there is no point continuing
+  // the lookup, since this condition will fail for any further instruction.
+  SuitabilityResult Suitable = SR_Suitable;
   for (auto *PrevMI : PrevInsts)
-    for (auto &PrevMO : PrevMI->operands())
-      if (PrevMO.isReg() && PrevMO.getReg() &&
+    for (auto &PrevMO : PrevMI->operands()) {
+      if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() &&
           TRI->regsOverlap(PrevMO.getReg(), PointerReg))
-        return false;
-
-  return true;
+        return SR_Impossible;
+
+      // Check whether the current memory access aliases with the previous one.
+      // If we already found that it aliases then there is no need to continue.
+      // But we continue the base pointer check, as it can result in
+      // SR_Impossible.
+      if (Suitable == SR_Suitable) {
+        AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+        if (AR == AR_WillAliasEverything)
+          return SR_Impossible;
+        if (AR == AR_MayAlias)
+          Suitable = SR_Unsuitable;
+      }
+    }
+  return Suitable;
 }
 
-bool ImplicitNullChecks::canHoistLoadInst(
-    MachineInstr *FaultingMI, unsigned PointerReg,
-    ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc,
-    MachineInstr *&Dependence) {
+bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
+                                      unsigned PointerReg,
+                                      ArrayRef<MachineInstr *> InstsSeenSoFar,
+                                      MachineBasicBlock *NullSucc,
+                                      MachineInstr *&Dependence) {
   auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);
   if (!DepResult.CanReorder)
     return false;
@@ -359,7 +436,8 @@ bool ImplicitNullChecks::canHoistLoadInst(
     // The Dependency can't be re-defining the base register -- then we won't
     // get the memory operation on the address we want.  This is already
     // checked in \c IsSuitableMemoryOp.
-    assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) &&
+    assert(!(DependenceMO.isDef() &&
+             TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&
            "Should have been checked before!");
   }
 
@@ -481,9 +559,11 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
       return false;
 
     MachineInstr *Dependence;
-    if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) &&
-        canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc,
-                         Dependence)) {
+    SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar);
+    if (SR == SR_Impossible)
+      return false;
+    if (SR == SR_Suitable &&
+        canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {
       NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
                                  NullSucc, Dependence);
       return true;
@@ -495,36 +575,42 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
   return false;
 }
 
-/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
-/// instruction.  The FAULTING_LOAD_OP instruction does the same load as LoadMI
-/// (defining the same register), and branches to HandlerMBB if the load
-/// faults.  The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
-MachineInstr *
-ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
-                                       MachineBasicBlock *MBB,
-                                       MachineBasicBlock *HandlerMBB) {
+/// Wrap a machine instruction, MI, into a FAULTING machine instruction.
+/// The FAULTING instruction does the same load/store as MI
+/// (defining the same register), and branches to HandlerMBB if the mem access
+/// faults.  The FAULTING instruction is inserted at the end of MBB.
+MachineInstr *ImplicitNullChecks::insertFaultingInstr(
+    MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {
   const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
                                  // all targets.
 
   DebugLoc DL;
-  unsigned NumDefs = LoadMI->getDesc().getNumDefs();
+  unsigned NumDefs = MI->getDesc().getNumDefs();
   assert(NumDefs <= 1 && "other cases unhandled!");
 
   unsigned DefReg = NoRegister;
   if (NumDefs != 0) {
-    DefReg = LoadMI->defs().begin()->getReg();
-    assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 &&
+    DefReg = MI->defs().begin()->getReg();
+    assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&
            "expected exactly one def!");
   }
 
-  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
+  FaultMaps::FaultKind FK;
+  if (MI->mayLoad())
+    FK =
+        MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad;
+  else
+    FK = FaultMaps::FaultingStore;
+
+  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg)
+                 .addImm(FK)
                 .addMBB(HandlerMBB)
-                .addImm(LoadMI->getOpcode());
+                .addImm(MI->getOpcode());
 
-  for (auto &MO : LoadMI->uses())
-    MIB.addOperand(MO);
+  for (auto &MO : MI->uses())
+    MIB.add(MO);
 
-  MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end());
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
   return MIB;
 }
@@ -545,18 +631,18 @@ void ImplicitNullChecks::rewriteNullChecks(
       NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
     }
 
-    // Insert a faulting load where the conditional branch was originally. We
-    // check earlier ensures that this bit of code motion is legal.  We do not
-    // touch the successors list for any basic block since we haven't changed
-    // control flow, we've just made it implicit.
-    MachineInstr *FaultingLoad = insertFaultingLoad(
+    // Insert a faulting instruction where the conditional branch was
+    // originally. The checks done earlier ensure that this bit of code motion
+    // is legal.  We do not touch the successors list for any basic block
+    // since we haven't changed control flow, we've just made it implicit.
+    MachineInstr *FaultingInstr = insertFaultingInstr(
         NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
 
     // Now the values defined by MemOperation, if any, are live-in of
     // the block of MemOperation.
-    // The original load operation may define implicit-defs alongside
-    // the loaded value.
+    // The original operation may define implicit-defs alongside
+    // the value.
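// [Editor's aside -- illustrative sketch, not part of the patch above. The
// tri-state answer that areMemoryOpsAliased() feeds into isSuitableMemoryOp()
// boils down to a small decision table; this standalone function mirrors that
// control flow with booleans standing in for the MachineInstr queries (all
// names are invented for illustration):
enum AliasKind { AK_NoAlias, AK_MayAlias, AK_WillAliasEverything };

AliasKind classifyAlias(bool PrevAccessesMemory, bool PrevStores,
                        bool CurStores, bool PrevHasMemOperands,
                        bool CurHasMemOperands, bool AAReportsAlias) {
  if (!PrevAccessesMemory)
    return AK_NoAlias; // nothing to order against
  if (!CurStores && !PrevStores)
    return AK_NoAlias; // load-vs-load ordering never matters
  if (!CurHasMemOperands) // lost the info: a store here poisons every
    return CurStores ? AK_WillAliasEverything : AK_MayAlias; // later lookup
  if (!PrevHasMemOperands)
    return PrevStores ? AK_WillAliasEverything : AK_MayAlias;
  return AAReportsAlias ? AK_MayAlias : AK_NoAlias; // defer to alias analysis
}
// End of aside.]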
MachineBasicBlock *MBB = NC.getMemOperation()->getParent(); - for (const MachineOperand &MO : FaultingLoad->operands()) { + for (const MachineOperand &MO : FaultingInstr->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -588,8 +674,8 @@ void ImplicitNullChecks::rewriteNullChecks( char ImplicitNullChecks::ID = 0; char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID; -INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp index 3d81184f774a..b7ab404070b1 100644 --- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp @@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI); // We take the DebugLoc from MI, since OrigMI may be attributed to a - // different source location. + // different source location. auto *NewMI = LIS.getInstructionFromIndex(DefIdx); NewMI->setDebugLoc(MI.getDebugLoc()); @@ -686,7 +686,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) { return true; } -#if !defined(NDEBUG) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD // Dump the range of instructions from B to E with their slot indexes. static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E, @@ -887,20 +888,10 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Debug values are not allowed to affect codegen. if (MI->isDebugValue()) { // Modify DBG_VALUE now that the value is in a spill slot. - bool IsIndirect = MI->isIndirectDebugValue(); - uint64_t Offset = IsIndirect ? 
MI->getOperand(1).getImm() : 0; - const MDNode *Var = MI->getDebugVariable(); - const MDNode *Expr = MI->getDebugExpression(); - DebugLoc DL = MI->getDebugLoc(); - DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); MachineBasicBlock *MBB = MI->getParent(); - assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && - "Expected inlined-at fields to agree"); - BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(StackSlot) - .addImm(Offset) - .addMetadata(Var) - .addMetadata(Expr); + DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI); + buildDbgValueForSpill(*MBB, MI, *MI, StackSlot); + MBB->erase(MI); continue; } diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp index ec35b3f6449e..ee4929c91482 100644 --- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,6 +45,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" @@ -68,8 +69,7 @@ class InterleavedAccess : public FunctionPass { public: static char ID; - InterleavedAccess(const TargetMachine *TM = nullptr) - : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) { + InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) { initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); } @@ -84,7 +84,6 @@ public: private: DominatorTree *DT; - const TargetMachine *TM; const TargetLowering *TLI; /// The maximum supported interleave factor. @@ -108,18 +107,16 @@ private: } // end anonymous namespace. char InterleavedAccess::ID = 0; -INITIALIZE_TM_PASS_BEGIN( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_TM_PASS_END( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) -FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) { - return new InterleavedAccess(TM); +FunctionPass *llvm::createInterleavedAccessPass() { + return new InterleavedAccess(); } /// \brief Check if the mask is a DE-interleave mask of the given factor @@ -426,13 +423,15 @@ bool InterleavedAccess::lowerInterleavedStore( } bool InterleavedAccess::runOnFunction(Function &F) { - if (!TM || !LowerInterleavedAccesses) + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC || !LowerInterleavedAccesses) return false; DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + auto &TM = TPC->getTM<TargetMachine>(); + TLI = TM.getSubtargetImpl(F)->getTargetLowering(); MaxFactor = TLI->getMaxSupportedInterleaveFactor(); // Holds dead instructions that will be erased later. 
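// [Editor's aside -- illustrative sketch, not part of the patch above. The
// InterleavedAccess change is one instance of a wider cleanup in this import:
// instead of threading 'const TargetMachine *' through pass constructors, a
// pass now asks TargetPassConfig for it at run time. A minimal pass following
// that recipe might look like the following; it is hedged on the two calls
// actually shown in the diff (getAnalysisIfAvailable and getTM) and is not
// tied to any particular LLVM revision:
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

namespace {
struct ExampleTMUser : public llvm::FunctionPass {
  static char ID;
  ExampleTMUser() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    // Bail out gracefully when run outside a codegen pipeline, where no
    // TargetPassConfig (and hence no TargetMachine) is available.
    auto *TPC = getAnalysisIfAvailable<llvm::TargetPassConfig>();
    if (!TPC)
      return false;
    llvm::TargetMachine &TM = TPC->getTM<llvm::TargetMachine>();
    (void)TM; // query subtarget or lowering info from TM here
    return false;
  }
};
char ExampleTMUser::ID = 0;
} // end anonymous namespace
// End of aside.]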
diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
index afd24067ace7..c6cc909e25d3 100644
--- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) {
                               Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context),
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::memmove:
         M.getOrInsertFunction("memmove",
                               Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context),
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::memset:
         M.getOrInsertFunction("memset",
                               Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context),
                               Type::getInt32Ty(M.getContext()),
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::sqrt:
         EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl");
diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 26794e28020e..be3b258315bb 100644
--- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -42,8 +42,8 @@ static cl::opt<cl::boolOrDefault>
 EnableFastISelOption("fast-isel", cl::Hidden,
   cl::desc("Enable the \"fast\" instruction selector"));
 
-static cl::opt<bool>
-    EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
+static cl::opt<cl::boolOrDefault>
+    EnableGlobalISel("global-isel", cl::Hidden,
                      cl::desc("Enable the \"global\" instruction selector"));
 
 void LLVMTargetMachine::initAsmInfo() {
@@ -85,7 +85,7 @@ void LLVMTargetMachine::initAsmInfo() {
 LLVMTargetMachine::LLVMTargetMachine(const Target &T,
                                      StringRef DataLayoutString,
                                      const Triple &TT, StringRef CPU,
-                                     StringRef FS, TargetOptions Options,
+                                     StringRef FS, const TargetOptions &Options,
                                      Reloc::Model RM, CodeModel::Model CM,
                                      CodeGenOpt::Level OL)
     : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
@@ -109,26 +109,24 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
                         AnalysisID StopAfter,
                         MachineFunctionInitializer *MFInitializer = nullptr) {
 
-  // When in emulated TLS mode, add the LowerEmuTLS pass.
-  if (TM->Options.EmulatedTLS)
-    PM.add(createLowerEmuTLSPass(TM));
-
-  PM.add(createPreISelIntrinsicLoweringPass());
-
-  // Add internal analysis passes from the target machine.
-  PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
-
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
   TargetPassConfig *PassConfig = TM->createPassConfig(PM);
   PassConfig->setStartStopPasses(StartBefore, StartAfter, StopBefore,
                                  StopAfter);
-
   // Set PassConfig options provided by TargetMachine.
   PassConfig->setDisableVerify(DisableVerify);
-
   PM.add(PassConfig);
 
+  // When in emulated TLS mode, add the LowerEmuTLS pass.
+  if (TM->Options.EmulatedTLS)
+    PM.add(createLowerEmuTLSPass());
+
+  PM.add(createPreISelIntrinsicLoweringPass());
+
+  // Add internal analysis passes from the target machine.
+  PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+
   PassConfig->addIRPasses();
 
   PassConfig->addCodeGenPrepare();
@@ -149,7 +147,9 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
       TM->setFastISel(true);
 
     // Ask the target for an isel.
-    if (LLVM_UNLIKELY(EnableGlobalISel)) {
+    // Enable GlobalISel if the target wants to, but allow that to be overridden.
+ if (EnableGlobalISel == cl::BOU_TRUE || (EnableGlobalISel == cl::BOU_UNSET && + PassConfig->isGlobalISelEnabled())) { if (PassConfig->addIRTranslator()) return nullptr; @@ -172,11 +172,12 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, // Pass to reset the MachineFunction if the ISel failed. PM.add(createResetMachineFunctionPass( - PassConfig->reportDiagnosticWhenGlobalISelFallback())); + PassConfig->reportDiagnosticWhenGlobalISelFallback(), + PassConfig->isGlobalISelAbortEnabled())); // Provide a fallback path when we do not want to abort on // not-yet-supported input. - if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) && + if (!PassConfig->isGlobalISelAbortEnabled() && PassConfig->addInstSelector()) return nullptr; diff --git a/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp new file mode 100644 index 000000000000..996d40ca6e1e --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -0,0 +1,97 @@ +///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The +/// difference is that with this pass the block frequencies are not computed +/// when the analysis pass is executed but rather when the BFI result is +/// explicitly requested by the analysis client. +/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "lazy-machine-block-freq" + +INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE, + "Lazy Machine Block Frequency Analysis", true, true) + +char LazyMachineBlockFrequencyInfoPass::ID = 0; + +LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass() + : MachineFunctionPass(ID) { + initializeLazyMachineBlockFrequencyInfoPassPass( + *PassRegistry::getPassRegistry()); +} + +void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS, + const Module *M) const { + getBFI().print(OS, M); +} + +void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +void LazyMachineBlockFrequencyInfoPass::releaseMemory() { + OwnedMBFI.reset(); + OwnedMLI.reset(); + OwnedMDT.reset(); +} + +MachineBlockFrequencyInfo & +LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { + auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>(); + if (MBFI) { + DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n"); + return *MBFI; + } + + auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>(); + auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); + auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n"); + DEBUG(if (MLI) dbgs() << "LoopInfo is available\n"); + + if (!MLI) { + DEBUG(dbgs() << 
"Building LoopInfo on the fly\n"); + // First create a dominator tree. + DEBUG(if (MDT) dbgs() << "DominatorTree is available\n"); + + if (!MDT) { + DEBUG(dbgs() << "Building DominatorTree on the fly\n"); + OwnedMDT = make_unique<MachineDominatorTree>(); + OwnedMDT->getBase().recalculate(*MF); + MDT = OwnedMDT.get(); + } + + // Generate LoopInfo from it. + OwnedMLI = make_unique<MachineLoopInfo>(); + OwnedMLI->getBase().analyze(MDT->getBase()); + MLI = OwnedMLI.get(); + } + + OwnedMBFI = make_unique<MachineBlockFrequencyInfo>(); + OwnedMBFI->calculate(*MF, MBPI, *MLI); + return *OwnedMBFI.get(); +} + +bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction( + MachineFunction &F) { + MF = &F; + return false; +} diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp index 834ed5f06c94..40ee7ea785f0 100644 --- a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp +++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp @@ -14,14 +14,23 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <string> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "lexicalscopes" @@ -38,6 +47,10 @@ void LexicalScopes::reset() { /// initialize - Scan machine function and constuct lexical scope nest. void LexicalScopes::initialize(const MachineFunction &Fn) { + // Don't attempt any lexical scope creation for a NoDebug compile unit. + if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return; reset(); MF = &Fn; SmallVector<InsnRange, 4> MIRanges; @@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) { void LexicalScopes::extractLexicalScopes( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - // Scan each instruction and create scopes. First build working set of scopes. for (const auto &MBB : *MF) { const MachineInstr *RangeBeginMI = nullptr; @@ -74,8 +86,9 @@ void LexicalScopes::extractLexicalScopes( continue; } - // Ignore DBG_VALUE. It does not contribute to any instruction in output. - if (MInsn.isDebugValue()) + // Ignore DBG_VALUE and similar instruction that do not contribute to any + // instruction in the output. + if (MInsn.isMetaInstruction()) continue; if (RangeBeginMI) { @@ -127,6 +140,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) { LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA) { if (IA) { + // Skip scopes inlined from a NoDebug compile unit. + if (Scope->getSubprogram()->getUnit()->getEmissionKind() == + DICompileUnit::NoDebug) + return getOrCreateLexicalScope(IA); // Create an abstract scope for inlined function. getOrCreateAbstractScope(Scope); // Create an inlined scope for inlined function. 
@@ -181,10 +198,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope, else Parent = getOrCreateLexicalScope(InlinedAt); - I = InlinedLexicalScopeMap.emplace(std::piecewise_construct, - std::forward_as_tuple(P), - std::forward_as_tuple(Parent, Scope, - InlinedAt, false)) + I = InlinedLexicalScopeMap + .emplace(std::piecewise_construct, std::forward_as_tuple(P), + std::forward_as_tuple(Parent, Scope, InlinedAt, false)) .first; return &I->second; } @@ -241,7 +257,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) { void LexicalScopes::assignInstructionRanges( SmallVectorImpl<InsnRange> &MIRanges, DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) { - LexicalScope *PrevLexicalScope = nullptr; for (const auto &R : MIRanges) { LexicalScope *S = MI2ScopeMap.lookup(R.first); @@ -299,9 +314,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) { return Result; } -/// dump - Print data structures. -void LexicalScope::dump(unsigned Indent) const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const { raw_ostream &err = dbgs(); err.indent(Indent); err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n"; @@ -316,5 +330,5 @@ void LexicalScope::dump(unsigned Indent) const { for (unsigned i = 0, e = Children.size(); i != e; ++i) if (Children[i] != this) Children[i]->dump(Indent + 2); -#endif } +#endif diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp index c945376560f7..b5e705f6455d 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -24,13 +24,16 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/UniqueVector.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -40,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "live-debug-values" +#define DEBUG_TYPE "livedebugvalues" STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); @@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass { private: const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; + const TargetFrameLowering *TFI; LexicalScopes LS; /// Keeps track of lexical scopes associated with a user value's source @@ -127,11 +131,13 @@ private: if (int RegNo = isDbgValueDescribedByReg(MI)) { Kind = RegisterKind; Loc.RegisterLoc.RegNo = RegNo; - uint64_t Offset = + int64_t Offset = MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0; // We don't support offsets larger than 4GiB here. They are // slated to be replaced with DIExpressions anyway. - if (Offset >= (1ULL << 32)) + // With indirect debug values used for spill locations, Offset + // can be negative. + if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32)) Kind = InvalidKind; else Loc.RegisterLoc.Offset = Offset; @@ -150,7 +156,9 @@ private: /// dominates MBB. 
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } - void dump() const { MI.dump(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { MI.dump(); } +#endif bool operator==(const VarLoc &Other) const { return Var == Other.Var && Loc.Hash == Other.Loc.Hash; @@ -167,6 +175,11 @@ private: typedef UniqueVector<VarLoc> VarLocMap; typedef SparseBitVector<> VarLocSet; typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB; + struct SpillDebugPair { + MachineInstr *SpillInst; + MachineInstr *DebugInst; + }; + typedef SmallVector<SpillDebugPair, 4> SpillMap; /// This holds the working set of currently open ranges. For fast /// access, this is done both as a set of VarLocIDs, and a map of @@ -216,14 +229,21 @@ private: } }; + bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF, + unsigned &Reg); + int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg); + void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs); + void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, SpillMap &Spills); void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges, const VarLocMap &VarLocIDs); bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs); + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills, + bool transferSpills); bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs, const VarLocMap &VarLocIDs, @@ -263,7 +283,7 @@ public: char LiveDebugValues::ID = 0; char &llvm::LiveDebugValuesID = LiveDebugValues::ID; -INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", +INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false, false) /// Default construct and initialize the pass. @@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { // Debug Range Extension Implementation //===----------------------------------------------------------------------===// +#ifndef NDEBUG void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V, const VarLocMap &VarLocIDs, @@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, } Out << "\n"; } +#endif + +/// Given a spill instruction, extract the register and offset used to +/// address the spill location in a target independent way. +int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI, + unsigned &Reg) { + assert(MI.hasOneMemOperand() && + "Spill instruction does not have exactly one memory operand?"); + auto MMOI = MI.memoperands_begin(); + const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue(); + assert(PVal->kind() == PseudoSourceValue::FixedStack && + "Inconsistent memory operand in spill instruction"); + int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex(); + const MachineBasicBlock *MBB = MI.getParent(); + return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg); +} /// End all previous ranges related to @MI and start a new range from @MI /// if it is a DBG_VALUE instr. 
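A note on the widened VarLoc offset handling in the LiveDebugValues hunk above: the offset became signed because a spill location can sit below the frame base, so the 4 GiB range check must reject INT64_MIN before calling std::abs, whose behavior is undefined for that one value (+2^63 does not fit in int64_t). In isolation the guard behaves like this sketch (the helper name is illustrative, not from the patch):

    #include <cstdint>
    #include <cstdlib>

    // Accept only offsets whose magnitude fits in 32 bits, mirroring the
    // VarLoc constructor: INT64_MIN is rejected up front because
    // std::abs(INT64_MIN) would overflow int64_t.
    static bool offsetFitsVarLoc(int64_t Offset) {
      if (Offset == INT64_MIN)
        return false;
      return std::abs(Offset) < (1LL << 32);
    }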
@@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
 unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
 SparseBitVector<> KillSet;
 for (const MachineOperand &MO : MI.operands()) {
+ // Determine whether the operand is a register def. Assume that call
+ // instructions never clobber SP, because some backends (e.g., AArch64)
+ // never list SP in the regmask.
 if (MO.isReg() && MO.isDef() && MO.getReg() &&
- TRI->isPhysicalRegister(MO.getReg())) {
+ TRI->isPhysicalRegister(MO.getReg()) &&
+ !(MI.isCall() && MO.getReg() == SP)) {
 // Remove ranges of all aliased registers.
 for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
 for (unsigned ID : OpenRanges.getVarLocs())
@@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
 OpenRanges.erase(KillSet, VarLocIDs);
 }

+/// Decide if @MI is a spill instruction and return true if it is. We use 2
+/// criteria to make this decision:
+/// - Is this instruction a store to a spill slot?
+/// - Is there a register operand that is both used and killed?
+/// TODO: Store optimization can fold spills into other stores (including
+/// other spills). We do not handle this yet (more than one memory operand).
+bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ const MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ int FI;
+ const MachineMemOperand *MMO;
+
+ // TODO: Handle multiple stores folded into one.
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ // To identify a spill instruction, use the same criteria as in AsmPrinter.
+ if (!((TII->isStoreToStackSlotPostFE(MI, FI) ||
+ TII->hasStoreToStackSlot(MI, MMO, FI)) &&
+ FrameInfo.isSpillSlotObjectIndex(FI)))
+ return false;
+
+ // In a spill instruction generated by the InlineSpiller the spilled register
+ // has its kill flag set. Return false if we don't find such a register.
+ Reg = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse() && MO.isKill()) {
+ Reg = MO.getReg();
+ break;
+ }
+ }
+ return Reg != 0;
+}
+
+/// A spilled register may indicate that we have to end the current range of
+/// a variable and create a new one for the spill location.
+/// We don't want to insert any instructions in transfer(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in @Spills.
+/// It will be inserted into the BB when we're done iterating over the
+/// instructions.
+void LiveDebugValues::transferSpillInst(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ SpillMap &Spills) {
+ unsigned Reg;
+ MachineFunction *MF = MI.getParent()->getParent();
+ if (!isSpillInstruction(MI, MF, Reg))
+ return;
+
+ // Check if the register is the location of a debug value.
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ if (VarLocIDs[ID].isDescribedByReg() == Reg) {
+ DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '('
+ << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
+
+ // Create a DBG_VALUE instruction to describe the Var in its spilled
+ // location, but don't insert it yet to avoid invalidating the
+ // iterator in our caller.
+ unsigned SpillBase; + int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase); + const MachineInstr *DMI = &VarLocIDs[ID].MI; + MachineInstr *SpDMI = + BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0, + DMI->getDebugVariable(), DMI->getDebugExpression()); + SpDMI->getOperand(1).setImm(SpillOffset); + DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; + SpDMI->print(dbgs(), false, TII)); + + // The newly created DBG_VALUE instruction SpDMI must be inserted after + // MI. Keep track of the pairing. + SpillDebugPair MIP = {&MI, SpDMI}; + Spills.push_back(MIP); + + // End all previous ranges of Var. + OpenRanges.erase(VarLocIDs[ID].Var); + + // Add the VarLoc to OpenRanges. + VarLoc VL(*SpDMI, LS); + unsigned SpillLocID = VarLocIDs.insert(VL); + OpenRanges.insert(SpillLocID, VL.Var); + return; + } + } +} + /// Terminate all open ranges at the end of the current basic block. bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges, @@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, /// This routine creates OpenRanges and OutLocs. bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) { + VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, + SpillMap &Spills, bool transferSpills) { bool Changed = false; transferDebugValue(MI, OpenRanges, VarLocIDs); transferRegisterDef(MI, OpenRanges, VarLocIDs); + if (transferSpills) + transferSpillInst(MI, OpenRanges, VarLocIDs, Spills); Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs); return Changed; } @@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { bool OLChanged = false; bool MBBJoined = false; - VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. + VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors. OpenRangesSet OpenRanges; // Ranges that are open until end of bb. - VarLocInMBB OutLocs; // Ranges that exist beyond bb. - VarLocInMBB InLocs; // Ranges that are incoming after joining. + VarLocInMBB OutLocs; // Ranges that exist beyond bb. + VarLocInMBB InLocs; // Ranges that are incoming after joining. + SpillMap Spills; // DBG_VALUEs associated with spills. DenseMap<unsigned int, MachineBasicBlock *> OrderToBB; DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; @@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { Pending; // Initialize every mbb with OutLocs. + // We are not looking at any spill instructions during the initial pass + // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE + // instructions for spills of registers that are known to be user variables + // within the BB in which the spill occurs. for (auto &MBB : MF) for (auto &MI : MBB) - transfer(MI, OpenRanges, OutLocs, VarLocIDs); + transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/false); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization", dbgs())); @@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { if (MBBJoined) { MBBJoined = false; Changed = true; + // Now that we have started to extend ranges across BBs we need to + // examine spill instructions to see whether they spill registers that + // correspond to user variables. 
for (auto &MI : *MBB) - OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs); + OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills, + /*transferSpills=*/true); + + // Add any DBG_VALUE instructions necessitated by spills. + for (auto &SP : Spills) + MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst), + SP.DebugInst); + Spills.clear(); DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after propagating", dbgs())); @@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); TII = MF.getSubtarget().getInstrInfo(); + TFI = MF.getSubtarget().getFrameLowering(); LS.initialize(MF); bool Changed = ExtendRanges(MF); diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp index 0934d8cfeaa1..bbd783367c9e 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "livedebug" +#define DEBUG_TYPE "livedebugvars" static cl::opt<bool> EnableLDV("live-debug-variables", cl::init(true), @@ -54,11 +54,11 @@ EnableLDV("live-debug-variables", cl::init(true), STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted"); char LiveDebugVariables::ID = 0; -INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { @@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, IsIndirect, Loc.getReg(), offset, Variable, Expression); else BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(Loc) + .add(Loc) .addImm(offset) .addMetadata(Variable) .addMetadata(Expression); @@ -1005,7 +1005,7 @@ bool LiveDebugVariables::doInitialization(Module &M) { return Pass::doInitialization(M); } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LiveDebugVariables::dump() { if (pImpl) static_cast<LDVImpl*>(pImpl)->print(dbgs()); diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp index 623af492fcd4..9ef9f238fdce 100644 --- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp +++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp @@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() { SubRanges = nullptr; } +void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator, + LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) { + + LaneBitmask ToApply = LaneMask; + for (SubRange &SR : subranges()) { + LaneBitmask SRMask = SR.LaneMask; + LaneBitmask Matching = SRMask & LaneMask; + if (Matching.none()) + continue; + + SubRange *MatchingRange; + if (SRMask == Matching) { + // The subrange fits (it does not cover bits outside \p LaneMask). + MatchingRange = &SR; + } else { + // We have to split the subrange into a matching and non-matching part. + // Reduce lanemask of existing lane to non-matching part. 
+ SR.LaneMask = SRMask & ~Matching; + // Create a new subrange for the matching part + MatchingRange = createSubRangeFrom(Allocator, Matching, SR); + } + Apply(*MatchingRange); + ToApply &= ~Matching; + } + // Create a new subrange if there are uncovered bits left. + if (ToApply.any()) { + SubRange *NewRange = createSubRange(Allocator, ToApply); + Apply(*NewRange); + } +} + unsigned LiveInterval::getSize() const { unsigned Sum = 0; for (const Segment &S : segments) @@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { // When they exist, Spills.back().start <= LastStart, // and WriteI[-1].start <= LastStart. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LiveRangeUpdater::print(raw_ostream &OS) const { if (!isDirty()) { if (LR) @@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const { print(errs()); } +#endif // Determine if A and B should be coalesced. static inline bool coalescable(const LiveRange::Segment &A, diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp index 70d34838b237..0c05dbeacba0 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===// +//===- LiveIntervalAnalysis.cpp - Live Interval Analysis ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,35 +7,52 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LiveInterval analysis pass which is used -// by the Linear Scan Register allocator. This pass linearizes the -// basic blocks of the function in DFS order and computes live intervals for -// each virtual and physical register. +/// \file This file implements the LiveInterval analysis pass which is used +/// by the Linear Scan Register allocator. This pass linearizes the +/// basic blocks of the function in DFS order and computes live intervals for +/// each virtual and physical register. 
// //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "LiveRangeCalc.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> -#include <cmath> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "regalloc" @@ -59,11 +76,13 @@ static bool EnablePrecomputePhysRegs = false; #endif // NDEBUG namespace llvm { + cl::opt<bool> UseSegmentSetForPhysRegs( "use-segment-set-for-physregs", cl::Hidden, cl::init(true), cl::desc( "Use segment set for the computation of the live ranges of physregs.")); -} + +} // end namespace llvm void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -78,8 +97,7 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), - DomTree(nullptr), LRCalc(nullptr) { +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } @@ -96,16 +114,14 @@ void LiveIntervals::releaseMemory() { RegMaskBits.clear(); RegMaskBlocks.clear(); - for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - delete RegUnitRanges[i]; + for (LiveRange *LR : RegUnitRanges) + delete LR; RegUnitRanges.clear(); // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. VNInfoAllocator.Reset(); } -/// runOnMachineFunction - calculates LiveIntervals -/// bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MF = &fn; MRI = &MF->getRegInfo(); @@ -135,14 +151,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { return true; } -/// print - Implement the dump method. void LiveIntervals::print(raw_ostream &OS, const Module* ) const { OS << "********** INTERVALS **********\n"; // Dump the regunits. 
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i) - if (LiveRange *LR = RegUnitRanges[i]) - OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n'; + for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit) + if (LiveRange *LR = RegUnitRanges[Unit]) + OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n'; // Dump the virtregs. for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { @@ -152,8 +167,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const { } OS << "RegMasks:"; - for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i) - OS << ' ' << RegMaskSlots[i]; + for (SlotIndex Idx : RegMaskSlots) + OS << ' ' << Idx; OS << '\n'; printInstrs(OS); @@ -165,20 +180,17 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void LiveIntervals::dumpInstrs() const { +LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { printInstrs(dbgs()); } #endif LiveInterval* LiveIntervals::createInterval(unsigned reg) { - float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? - llvm::huge_valf : 0.0F; + float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F; return new LiveInterval(reg, Weight); } - -/// computeVirtRegInterval - Compute the live interval of a virtual register, -/// based on defs and uses. +/// Compute the live interval of a virtual register, based on defs and uses. void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); @@ -200,7 +212,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. - for (MachineBasicBlock &MBB : *MF) { + for (const MachineBasicBlock &MBB : *MF) { std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); @@ -210,7 +222,7 @@ void LiveIntervals::computeRegMasks() { RegMaskBits.push_back(Mask); } - for (MachineInstr &MI : MBB) { + for (const MachineInstr &MI : MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; @@ -245,9 +257,9 @@ void LiveIntervals::computeRegMasks() { // interference. // -/// computeRegUnitInterval - Compute the live range of a register unit, based -/// on the uses and defs of aliasing registers. The range should be empty, -/// or contain only dead phi-defs from ABI blocks. +/// Compute the live range of a register unit, based on the uses and defs of +/// aliasing registers. The range should be empty, or contain only dead +/// phi-defs from ABI blocks. void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -257,22 +269,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { // may share super-registers. That's OK because createDeadDefs() is // idempotent. It is very rare for a register unit to have multiple roots, so // uniquing super-registers is probably not worthwhile. 
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - if (!MRI->reg_empty(*Supers)) - LRCalc->createDeadDefs(LR, *Supers); + bool IsReserved = true; + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->createDeadDefs(LR, Reg); + // A register unit is considered reserved if all its roots and all their + // super registers are reserved. + if (!MRI->isReserved(Reg)) + IsReserved = false; } } // Now extend LR to reach all uses. // Ignore uses of reserved registers. We only track defs of those. - for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) { - for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true); - Supers.isValid(); ++Supers) { - unsigned Reg = *Supers; - if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg)) - LRCalc->extendToUses(LR, Reg); + if (!IsReserved) { + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { + for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); + Super.isValid(); ++Super) { + unsigned Reg = *Super; + if (!MRI->reg_empty(Reg)) + LRCalc->extendToUses(LR, Reg); + } } } @@ -281,11 +301,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { LR.flushSegmentSet(); } - -/// computeLiveInRegUnits - Precompute the live ranges of any register units -/// that are live-in to an ABI block somewhere. Register values can appear -/// without a corresponding def when entering the entry block or a landing pad. -/// +/// Precompute the live ranges of any register units that are live-in to an ABI +/// block somewhere. Register values can appear without a corresponding def when +/// entering the entry block or a landing pad. void LiveIntervals::computeLiveInRegUnits() { RegUnitRanges.resize(TRI->getNumRegUnits()); DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); @@ -294,18 +312,15 @@ void LiveIntervals::computeLiveInRegUnits() { SmallVector<unsigned, 8> NewRanges; // Check all basic blocks for live-ins. - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - const MachineBasicBlock *MBB = &*MFI; - + for (const MachineBasicBlock &MBB : *MF) { // We only care about ABI blocks: Entry + landing pads. - if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) + if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. - SlotIndex Begin = Indexes->getMBBStartIdx(MBB); - DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); - for (const auto &LI : MBB->liveins()) { + SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); + DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber()); + for (const auto &LI : MBB.liveins()) { for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; @@ -324,16 +339,13 @@ void LiveIntervals::computeLiveInRegUnits() { DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. 
- for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) { - unsigned Unit = NewRanges[i]; + for (unsigned Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); - } } - static void createSegmentsForValues(LiveRange &LR, - iterator_range<LiveInterval::vni_iterator> VNIs) { - for (auto VNI : VNIs) { + iterator_range<LiveInterval::vni_iterator> VNIs) { + for (VNInfo *VNI : VNIs) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -341,7 +353,7 @@ static void createSegmentsForValues(LiveRange &LR, } } -typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList; +using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo*>, 16>; static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, ShrinkToUsesWorkList &WorkList, @@ -349,7 +361,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, // Keep track of the PHIs that are in use. SmallPtrSet<VNInfo*, 8> UsedPHIs; // Blocks that have already been added to WorkList as live-out. - SmallPtrSet<MachineBasicBlock*, 16> LiveOut; + SmallPtrSet<const MachineBasicBlock*, 16> LiveOut; // Extend intervals to reach all uses in WorkList. while (!WorkList.empty()) { @@ -368,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, !UsedPHIs.insert(VNI).second) continue; // The PHI is live, make sure the predecessors are live-out. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -384,7 +396,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); // Make sure VNI is live-out from the predecessors. - for (auto &Pred : MBB->predecessors()) { + for (const MachineBasicBlock *Pred : MBB->predecessors()) { if (!LiveOut.insert(Pred).second) continue; SlotIndex Stop = Indexes.getMBBEndIdx(Pred); @@ -415,22 +427,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, ShrinkToUsesWorkList WorkList; // Visit all instructions reading li->reg. - for (MachineRegisterInfo::reg_instr_iterator - I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end(); - I != E; ) { - MachineInstr *UseMI = &*(I++); - if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg)) + unsigned Reg = li->reg; + for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) { + if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg)) continue; - SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot(); + SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot(); LiveQueryResult LRQ = li->Query(Idx); VNInfo *VNI = LRQ.valueIn(); if (!VNI) { // This shouldn't happen: readsVirtualRegister returns true, but there is // no live value. It is likely caused by a target getting <undef> flags // wrong. 
- DEBUG(dbgs() << Idx << '\t' << *UseMI + DEBUG(dbgs() << Idx << '\t' << UseMI << "Warning: Instr claims to read non-existent value in " - << *li << '\n'); + << *li << '\n'); continue; } // Special case: An early-clobber tied operand reads and writes the @@ -458,7 +468,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { bool MayHaveSplitComponents = false; - for (auto VNI : LI.valnos) { + for (VNInfo *VNI : LI.valnos) { if (VNI->isUnused()) continue; SlotIndex Def = VNI->def; @@ -548,7 +558,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) { SR.segments.swap(NewLR.segments); // Remove dead PHI value numbers - for (auto VNI : SR.valnos) { + for (VNInfo *VNI : SR.valnos) { if (VNI->isUnused()) continue; const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def); @@ -571,8 +581,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR, ArrayRef<SlotIndex> Undefs) { assert(LRCalc && "LRCalc not initialized."); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); - for (unsigned i = 0, e = Indices.size(); i != e; ++i) - LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs); + for (SlotIndex Idx : Indices) + LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs); } void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, @@ -599,13 +609,11 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, // Find all blocks that are reachable from KillMBB without leaving VNI's live // range. It is possible that KillMBB itself is reachable, so start a DFS // from each successor. - typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy; + using VisitedTy = df_iterator_default_set<MachineBasicBlock*,9>; VisitedTy Visited; - for (MachineBasicBlock::succ_iterator - SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end(); - SuccI != SuccE; ++SuccI) { + for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator<MachineBasicBlock*, VisitedTy> - I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited); + I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited); I != E;) { MachineBasicBlock *MBB = *I; @@ -657,9 +665,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. RU.clear(); - for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid(); - ++Units) { - const LiveRange &RURange = getRegUnit(*Units); + for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + ++Unit) { + const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) continue; RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); @@ -802,9 +810,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { // Conservatively return true instead of scanning huge predecessor lists. 
if (PHIMBB->pred_size() > 100) return true; - for (MachineBasicBlock::const_pred_iterator - PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI) - if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI))) + for (const MachineBasicBlock *Pred : PHIMBB->predecessors()) + if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred))) return true; } return false; @@ -831,7 +838,6 @@ LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) { return S; } - //===----------------------------------------------------------------------===// // Register mask functions //===----------------------------------------------------------------------===// @@ -864,7 +870,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, return false; bool Found = false; - for (;;) { + while (true) { assert(*SlotI >= LiveI->start); // Loop over all slots overlapping this segment. while (*SlotI < LiveI->end) { @@ -895,7 +901,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, // IntervalUpdate class. //===----------------------------------------------------------------------===// -// HMEditor is a toolkit used by handleMove to trim or extend live intervals. +/// Toolkit used by handleMove to trim or extend live intervals. class LiveIntervals::HMEditor { private: LiveIntervals& LIS; @@ -1241,10 +1247,12 @@ private: LiveRange::iterator NewIdxIn = NewIdxOut; assert(NewIdxIn == LR.find(NewIdx.getBaseIndex())); const SlotIndex SplitPos = NewIdxDef; + OldIdxVNI = OldIdxIn->valno; // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut. + OldIdxOut->valno->def = OldIdxIn->start; *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end, - OldIdxIn->valno); + OldIdxOut->valno); // OldIdxIn and OldIdxVNI are now undef and can be overridden. // We Slide [NewIdxIn, OldIdxIn) down one position. // |- X0/NewIdxIn -| ... 
|- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -| @@ -1514,8 +1522,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, } } - for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) { - unsigned Reg = OrigRegs[i]; + for (unsigned Reg : OrigRegs) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; @@ -1524,16 +1531,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, if (!LI.hasAtLeastOneValue()) continue; - for (LiveInterval::SubRange &S : LI.subranges()) { + for (LiveInterval::SubRange &S : LI.subranges()) repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask); - } + repairOldRegInRange(Begin, End, endIdx, LI, Reg); } } void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { - if (LiveRange *LR = getCachedRegUnit(*Units)) + for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { + if (LiveRange *LR = getCachedRegUnit(*Unit)) if (VNInfo *VNI = LR->getVNInfoAt(Pos)) LR->removeValNo(VNI); } diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp index fc2f233f6d68..b4aa0dc326a5 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===// +//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===// // // The LLVM Compiler Infrastructure // @@ -13,19 +13,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalUnion.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SparseBitVector.h" -#include "llvm/Support/Debug.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" -#include <algorithm> +#include <cassert> +#include <cstdlib> using namespace llvm; #define DEBUG_TYPE "regalloc" - // Merge a LiveInterval's segments. Guarantee no overlaps. void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { if (Range.empty()) @@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) { LiveRange::const_iterator RegEnd = Range.end(); SegmentIter SegPos = Segments.find(RegPos->start); - for (;;) { + while (true) { assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval"); SegPos.erase(); if (!SegPos.valid()) @@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { CheckedFirstInterference = true; // Quickly skip interference check for empty sets. - if (VirtReg->empty() || LiveUnion->empty()) { + if (LR->empty() || LiveUnion->empty()) { SeenAllInterferences = true; return 0; } - // In most cases, the union will start before VirtReg. - VirtRegI = VirtReg->begin(); + // In most cases, the union will start before LR. + LRI = LR->begin(); LiveUnionI.setMap(LiveUnion->getMap()); - LiveUnionI.find(VirtRegI->start); + LiveUnionI.find(LRI->start); } - LiveInterval::iterator VirtRegEnd = VirtReg->end(); + LiveRange::const_iterator LREnd = LR->end(); LiveInterval *RecentReg = nullptr; while (LiveUnionI.valid()) { - assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg"); + assert(LRI != LREnd && "Reached end of LR"); // Check for overlapping interference. 
- while (VirtRegI->start < LiveUnionI.stop() && - VirtRegI->end > LiveUnionI.start()) { + while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) { // This is an overlap, record the interfering register. LiveInterval *VReg = LiveUnionI.value(); if (VReg != RecentReg && !isSeenInterference(VReg)) { @@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { } // The iterators are now not overlapping, LiveUnionI has been advanced - // beyond VirtRegI. - assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap"); + // beyond LRI. + assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap"); // Advance the iterator that ends first. - VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start()); - if (VirtRegI == VirtRegEnd) + LRI = LR->advanceTo(LRI, LiveUnionI.start()); + if (LRI == LREnd) break; // Detect overlap, handle above. - if (VirtRegI->start < LiveUnionI.stop()) + if (LRI->start < LiveUnionI.stop()) continue; // Still not overlapping. Catch up LiveUnionI. - LiveUnionI.advanceTo(VirtRegI->start); + LiveUnionI.advanceTo(LRI->start); } SeenAllInterferences = true; return InterferingVRegs.size(); diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp index dcc41c1718a6..0dc1079b2ad4 100644 --- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -53,7 +53,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { continue; removeReg(Reg); } else if (O->isRegMask()) - removeRegsInMask(*O, nullptr); + removeRegsInMask(*O); } // Add uses to the set. @@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const { OS << "\n"; } -/// Dumps the currently live registers to the debug output. -LLVM_DUMP_METHOD void LivePhysRegs::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LivePhysRegs::dump() const { dbgs() << " " << *this; -#endif } +#endif bool LivePhysRegs::available(const MachineRegisterInfo &MRI, unsigned Reg) const { @@ -143,63 +142,85 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI, /// Add live-in registers of basic block \p MBB to \p LiveRegs. void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) { - MCSubRegIndexIterator S(LI.PhysReg, TRI); - if (LI.LaneMask.all() || (LI.LaneMask.any() && !S.isValid())) { - addReg(LI.PhysReg); + unsigned Reg = LI.PhysReg; + LaneBitmask Mask = LI.LaneMask; + MCSubRegIndexIterator S(Reg, TRI); + assert(Mask.any() && "Invalid livein mask"); + if (Mask.all() || !S.isValid()) { + addReg(Reg); continue; } for (; S.isValid(); ++S) { unsigned SI = S.getSubRegIndex(); - if ((LI.LaneMask & TRI->getSubRegIndexLaneMask(SI)).any()) + if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any()) addReg(S.getSubReg()); } } } -/// Add pristine registers to the given \p LiveRegs. This function removes -/// actually saved callee save registers when \p InPrologueEpilogue is false. -static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, - const MachineFrameInfo &MFI, - const TargetRegisterInfo &TRI) { - for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) +/// Adds all callee saved registers to \p LiveRegs. 
+static void addCalleeSavedRegs(LivePhysRegs &LiveRegs, + const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) LiveRegs.addReg(*CSR); +} + +/// Adds pristine registers to the given \p LiveRegs. Pristine registers are +/// callee saved registers that are unused in the function. +static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveRegs, MF); + /// Remove the ones that are not saved/restored; they are pristine. for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveRegs.removeReg(Info.getReg()); } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); + if (!MBB.succ_empty()) { + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers that are saved and + // restored (somewhere); This does not include callee saved registers that + // are unused and hence not saved and restored; they are called pristine. + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + addReg(Info.getReg()); + } + } } void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { - const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - if (MBB.isReturnBlock()) { - // The return block has no successors whose live-ins we could merge - // below. So instead we add the callee saved registers manually. - for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) - addReg(*I); - } else { - addPristines(*this, MF, MFI, *TRI); - } + if (!MBB.succ_empty()) { + const MachineFunction &MF = *MBB.getParent(); + addPristines(*this, MF); + addLiveOutsNoPristines(MBB); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. 
+ const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); } - - addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addPristines(*this, MF, MFI, *TRI); + addPristines(*this, MF); addBlockLiveIns(MBB); } -void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +void llvm::computeLiveIns(LivePhysRegs &LiveRegs, + const MachineRegisterInfo &MRI, MachineBasicBlock &MBB) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); assert(MBB.livein_empty()); LiveRegs.init(TRI); LiveRegs.addLiveOutsNoPristines(MBB); @@ -207,10 +228,12 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, LiveRegs.stepBackward(MI); for (unsigned Reg : LiveRegs) { + if (MRI.isReserved(Reg)) + continue; // Skip the register if we are about to add one of its super registers. bool ContainsSuperReg = false; for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) { - if (LiveRegs.contains(*SReg)) { + if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) { ContainsSuperReg = true; break; } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp index 012837608628..398066bf8903 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -75,34 +75,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) { LI.createSubRangeFrom(*Alloc, ClassMask, LI); } - LaneBitmask Mask = SubMask; - for (LiveInterval::SubRange &S : LI.subranges()) { - // A Mask for subregs common to the existing subrange and current def. - LaneBitmask Common = S.LaneMask & Mask; - if (Common.none()) - continue; - LiveInterval::SubRange *CommonRange; - // A Mask for subregs covered by the subrange but not the current def. - LaneBitmask RM = S.LaneMask & ~Mask; - if (RM.any()) { - // Split the subrange S into two parts: one covered by the current - // def (CommonRange), and the one not affected by it (updated S). - S.LaneMask = RM; - CommonRange = LI.createSubRangeFrom(*Alloc, Common, S); - } else { - assert(Common == S.LaneMask); - CommonRange = &S; - } + LI.refineSubRanges(*Alloc, SubMask, + [&MO, this](LiveInterval::SubRange &SR) { if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *CommonRange, MO); - Mask &= ~Common; - } - // Create a new SubRange for subregs we did not cover yet. - if (Mask.any()) { - LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask); - if (MO.isDef()) - createDeadDef(*Indexes, *Alloc, *NewRange, MO); - } + createDeadDef(*Indexes, *Alloc, SR, MO); + }); } // Create the def in the main liverange. 
We do not have to do this if @@ -289,8 +266,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, if (UndefOnEntry[BN]) return false; - auto MarkDefined = - [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool { + auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool { for (MachineBasicBlock *S : B.successors()) DefOnEntry[S->getNumber()] = true; DefOnEntry[BN] = true; @@ -311,7 +287,12 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, return MarkDefined(B); SlotIndex Begin, End; std::tie(Begin, End) = Indexes->getMBBRange(&B); - LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End); + // Treat End as not belonging to B. + // If LR has a segment S that starts at the next block, i.e. [End, ...), + // std::upper_bound will return the segment following S. Instead, + // S should be treated as the first segment that does not overlap B. + LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), + End.getPrevSlot()); if (UB != LR.begin()) { LiveRange::Segment &Seg = *std::prev(UB); if (Seg.end > Begin) { diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp index 7f1c69c0b4a2..92cca1a54951 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } LiveInterval &LI = LIS.createEmptyInterval(VReg); + if (Parent && !Parent->isSpillable()) + LI.markNotSpillable(); // Create empty subranges if the OldReg's interval has them. Do not create // the main range here---it will be constructed later after the subranges // have been finalized. @@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) { if (VRM) { VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); } + // FIXME: Getting the interval here actually computes it. + // In theory, this may not be what we want, but in practice + // the createEmptyIntervalFrom API is used when this is not + // the case. Generally speaking we just want to annotate the + // LiveInterval when it gets created but we cannot do that at + // the moment. 
+  if (Parent && !Parent->isSpillable())
+    LIS.getInterval(VReg).markNotSpillable();
   return VReg;
 }
 
@@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
   if (VRM)
     VRM->grow();
 
-  if (Parent && !Parent->isSpillable())
-    LIS.getInterval(VReg).markNotSpillable();
-
   NewRegs.push_back(VReg);
 }
 
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 7a51386aa9ca..882de1a3fad9 100644
--- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -1,4 +1,4 @@
-//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//===- LiveRegMatrix.cpp - Track register interference --------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -11,15 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "RegisterCoalescer.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Pass.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
 
 using namespace llvm;
 
@@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
 INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
                     "Live Register Matrix", false, false)
 
-LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
-  UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}
 
 void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
@@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
   return Result;
 }
 
-LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
                                                unsigned RegUnit) {
   LiveIntervalUnion::Query &Q = Queries[RegUnit];
-  Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+  Q.init(UserTag, LR, Matrix[RegUnit]);
   return Q;
 }
 
@@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
     return IK_RegUnit;
 
   // Check the matrix for virtual register interference.
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
-    if (query(VirtReg, *Units).checkInterference())
-      return IK_VirtReg;
+  bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+                                  [&](unsigned Unit, const LiveRange &LR) {
+                                    return query(LR, Unit).checkInterference();
+                                  });
+  if (Interference)
+    return IK_VirtReg;
 
   return IK_Free;
 }
diff --git a/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 000000000000..dff555f49565
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,126 @@
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveRegUnits set.
+//
+//===----------------------------------------------------------------------===//
+
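For orientation, the new set is meant to be driven in the same backward-scan
style as LivePhysRegs, but at register-unit granularity so aliasing sub- and
super-registers are handled uniformly. A minimal sketch of that pattern
follows; the init() call and the reverse iteration are assumptions based on
the matching LivePhysRegs-style interface in LiveRegUnits.h, which is not
shown in this diff:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    // Hypothetical helper: compute the register units live on entry to MBB
    // by seeding the set with MBB's live-outs and stepping backwards across
    // every instruction in the block.
    static void computeLiveInUnits(const llvm::MachineBasicBlock &MBB,
                                   const llvm::TargetRegisterInfo &TRI,
                                   llvm::LiveRegUnits &Units) {
      Units.init(TRI);        // assumed: clears the set and binds it to TRI
      Units.addLiveOuts(MBB); // live-outs, including pristine CSRs (below)
      for (const llvm::MachineInstr &MI :
           llvm::make_range(MBB.instr_rbegin(), MBB.instr_rend()))
        Units.stepBackward(MI); // drop defs/regmask clobbers, then add uses
    }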
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
+  for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+    for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+      if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+        Units.reset(U);
+    }
+  }
+}
+
+void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) {
+  for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+    for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+      if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+        Units.set(U);
+    }
+  }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI) {
+  // Remove defined registers and regmask kills from the set.
+  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      if (!O->isDef())
+        continue;
+      unsigned Reg = O->getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+        continue;
+      removeReg(Reg);
+    } else if (O->isRegMask())
+      removeRegsNotPreserved(O->getRegMask());
+  }
+
+  // Add uses to the set.
+  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+    if (!O->isReg() || !O->readsReg())
+      continue;
+    unsigned Reg = O->getReg();
+    if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    addReg(Reg);
+  }
+}
+
+void LiveRegUnits::accumulateBackward(const MachineInstr &MI) {
+  // Add defs, uses and regmask clobbers to the set.
+  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      unsigned Reg = O->getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+        continue;
+      if (!O->isDef() && !O->readsReg())
+        continue;
+      addReg(Reg);
+    } else if (O->isRegMask())
+      addRegsInMask(O->getRegMask());
+  }
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveUnits.
+static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
+  for (const auto &LI : MBB.liveins())
+    LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask);
+}
+
+static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
+  // To get the live-outs we simply merge the live-ins of all successors.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    addLiveIns(LiveUnits, *Succ);
+}
+
+/// Remove the callee-saved registers that this function actually saves and
+/// restores in its prologue and epilogue (those recorded in \p MFI), leaving
+/// only the pristine callee-saved registers in \p LiveUnits.
+static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF, + const MachineFrameInfo &MFI, + const TargetRegisterInfo &TRI) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + LiveUnits.removeReg(Info.getReg()); +} + +void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + if (!MBB.isReturnBlock()) + removeSavedRegs(*this, MF, MFI, *TRI); + } + ::addLiveOuts(*this, MBB); +} + +void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + if (&MBB != &MF.front()) + removeSavedRegs(*this, MF, MFI, *TRI); + } + ::addLiveIns(*this, MBB); +} diff --git a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp index dbf1f96102d1..b51f8b0aa6bb 100644 --- a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp @@ -25,10 +25,10 @@ using namespace llvm; #define DEBUG_TYPE "livestacks" char LiveStacks::ID = 0; -INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks", +INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(LiveStacks, "livestacks", +INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) char &llvm::LiveStacksID = LiveStacks::ID; diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp index 269b990a3149..a9aec926115a 100644 --- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp @@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { return nullptr; } -LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << " Alive in blocks: "; for (SparseBitVector<>::iterator I = AliveBlocks.begin(), E = AliveBlocks.end(); I != E; ++I) @@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const { dbgs() << "\n #" << i << ": " << *Kills[i]; dbgs() << "\n"; } -#endif } +#endif /// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg. 
LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) { @@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *SuccBB) { const unsigned NumNew = BB->getNumber(); - SmallSet<unsigned, 16> Defs, Kills; + DenseSet<unsigned> Defs, Kills; MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end(); for (; BBI != BBE && BBI->isPHI(); ++BBI) { diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index e189fb0dd89d..17cab0ae910e 100644 --- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -103,10 +103,10 @@ namespace { char LocalStackSlotPass::ID = 0; char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID; -INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) diff --git a/contrib/llvm/lib/CodeGen/LowLevelType.cpp b/contrib/llvm/lib/CodeGen/LowLevelType.cpp index d74b7306e0f4..1c682e72fa49 100644 --- a/contrib/llvm/lib/CodeGen/LowLevelType.cpp +++ b/contrib/llvm/lib/CodeGen/LowLevelType.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===// +//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -18,54 +18,21 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -LLT::LLT(Type &Ty, const DataLayout &DL) { +LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { if (auto VTy = dyn_cast<VectorType>(&Ty)) { - SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits(); - ElementsOrAddrSpace = VTy->getNumElements(); - Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; + auto NumElements = VTy->getNumElements(); + LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL); + if (NumElements == 1) + return ScalarTy; + return LLT::vector(NumElements, ScalarTy); } else if (auto PTy = dyn_cast<PointerType>(&Ty)) { - Kind = Pointer; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = PTy->getAddressSpace(); + return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty)); } else if (Ty.isSized()) { // Aggregates are no different from real scalars as far as GlobalISel is // concerned. - Kind = Scalar; - SizeInBits = DL.getTypeSizeInBits(&Ty); - ElementsOrAddrSpace = 1; + auto SizeInBits = DL.getTypeSizeInBits(&Ty); assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; + return LLT::scalar(SizeInBits); } -} - -LLT::LLT(MVT VT) { - if (VT.isVector()) { - SizeInBits = VT.getVectorElementType().getSizeInBits(); - ElementsOrAddrSpace = VT.getVectorNumElements(); - Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; - } else if (VT.isValid()) { - // Aggregates are no different from real scalars as far as GlobalISel is - // concerned. 
- Kind = Scalar; - SizeInBits = VT.getSizeInBits(); - ElementsOrAddrSpace = 1; - assert(SizeInBits != 0 && "invalid zero-sized type"); - } else { - Kind = Invalid; - SizeInBits = ElementsOrAddrSpace = 0; - } -} - -void LLT::print(raw_ostream &OS) const { - if (isVector()) - OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">"; - else if (isPointer()) - OS << "p" << getAddressSpace(); - else if (isValid()) { - assert(isScalar() && "unexpected type"); - OS << "s" << getScalarSizeInBits(); - } else - llvm_unreachable("trying to print an invalid type"); + return LLT(); } diff --git a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp index 6966c8ca4a5f..0fc48d4e0b6b 100644 --- a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp +++ b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" @@ -28,14 +29,12 @@ using namespace llvm; namespace { class LowerEmuTLS : public ModulePass { - const TargetMachine *TM; public: static char ID; // Pass identification, replacement for typeid - explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { } - explicit LowerEmuTLS(const TargetMachine *TM) - : ModulePass(ID), TM(TM) { + LowerEmuTLS() : ModulePass(ID) { initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry()); } + bool runOnModule(Module &M) override; private: bool addEmuTlsVar(Module &M, const GlobalVariable *GV); @@ -54,19 +53,22 @@ private: char LowerEmuTLS::ID = 0; -INITIALIZE_PASS(LowerEmuTLS, "loweremutls", - "Add __emutls_[vt]. variables for emultated TLS model", - false, false) +INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE, + "Add __emutls_[vt]. 
variables for emulated TLS model", false,
+                false)
 
-ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) {
-  return new LowerEmuTLS(TM);
-}
+ModulePass *llvm::createLowerEmuTLSPass() { return new LowerEmuTLS(); }
 
 bool LowerEmuTLS::runOnModule(Module &M) {
   if (skipModule(M))
     return false;
 
-  if (!TM || !TM->Options.EmulatedTLS)
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  auto &TM = TPC->getTM<TargetMachine>();
+  if (!TM.Options.EmulatedTLS)
     return false;
 
   bool Changed = false;
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index c8bed0890dd6..1d36ff4e1458 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -12,11 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "MIParser.h"
+
 #include "MILexer.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -41,8 +43,11 @@
 using namespace llvm;
 
 PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
-    SourceMgr &SM, const SlotMapping &IRSlots)
-  : MF(MF), SM(&SM), IRSlots(IRSlots) {
+    SourceMgr &SM, const SlotMapping &IRSlots,
+    const Name2RegClassMap &Names2RegClasses,
+    const Name2RegBankMap &Names2RegBanks)
+  : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses),
+    Names2RegBanks(Names2RegBanks) {
 }
 
 VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) {
@@ -131,7 +136,8 @@ public:
 
   bool
   parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
-  bool parseBasicBlock(MachineBasicBlock &MBB);
+  bool parseBasicBlock(MachineBasicBlock &MBB,
+                       MachineBasicBlock *&AddFalthroughFrom);
   bool parseBasicBlockLiveins(MachineBasicBlock &MBB);
   bool parseBasicBlockSuccessors(MachineBasicBlock &MBB);
 
@@ -139,6 +145,7 @@ public:
   bool parseVirtualRegister(VRegInfo *&Info);
   bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);
   bool parseRegisterFlag(unsigned &Flags);
+  bool parseRegisterClassOrBank(VRegInfo &RegInfo);
   bool parseSubRegisterIndex(unsigned &SubReg);
   bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
   bool parseRegisterOperand(MachineOperand &Dest,
@@ -172,6 +179,7 @@ public:
   bool parseIntrinsicOperand(MachineOperand &Dest);
   bool parsePredicateOperand(MachineOperand &Dest);
   bool parseTargetIndexOperand(MachineOperand &Dest);
+  bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
   bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
   bool parseMachineOperand(MachineOperand &Dest,
                            Optional<unsigned> &TiedDefIdx);
@@ -184,6 +192,7 @@ public:
   bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
   bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
   bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+  bool parseOptionalAtomicOrdering(AtomicOrdering &Order);
   bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
 
 private:
@@ -512,7 +521,8 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
   return false;
 }
 
-bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
+bool MIParser::parseBasicBlock(MachineBasicBlock &MBB,
+                               MachineBasicBlock *&AddFalthroughFrom) {
   // Skip the definition.
assert(Token.is(MIToken::MachineBasicBlockLabel)); lex(); @@ -532,10 +542,12 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { // // is equivalent to // liveins: %edi, %esi + bool ExplicitSuccesors = false; while (true) { if (Token.is(MIToken::kw_successors)) { if (parseBasicBlockSuccessors(MBB)) return true; + ExplicitSuccesors = true; } else if (Token.is(MIToken::kw_liveins)) { if (parseBasicBlockLiveins(MBB)) return true; @@ -551,10 +563,9 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { // Parse the instructions. bool IsInBundle = false; MachineInstr *PrevMI = nullptr; - while (true) { - if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)) - return false; - else if (consumeIfPresent(MIToken::Newline)) + while (!Token.is(MIToken::MachineBasicBlockLabel) && + !Token.is(MIToken::Eof)) { + if (consumeIfPresent(MIToken::Newline)) continue; if (consumeIfPresent(MIToken::rbrace)) { // The first parsing pass should verify that all closing '}' have an @@ -586,6 +597,22 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { assert(Token.isNewlineOrEOF() && "MI is not fully parsed"); lex(); } + + // Construct successor list by searching for basic block machine operands. + if (!ExplicitSuccesors) { + SmallVector<MachineBasicBlock*,4> Successors; + bool IsFallthrough; + guessSuccessors(MBB, Successors, IsFallthrough); + for (MachineBasicBlock *Succ : Successors) + MBB.addSuccessor(Succ); + + if (IsFallthrough) { + AddFalthroughFrom = &MBB; + } else { + MBB.normalizeSuccProbs(); + } + } + return false; } @@ -599,11 +626,18 @@ bool MIParser::parseBasicBlocks() { // The first parsing pass should have verified that this token is a MBB label // in the 'parseBasicBlockDefinitions' method. assert(Token.is(MIToken::MachineBasicBlockLabel)); + MachineBasicBlock *AddFalthroughFrom = nullptr; do { MachineBasicBlock *MBB = nullptr; if (parseMBBReference(MBB)) return true; - if (parseBasicBlock(*MBB)) + if (AddFalthroughFrom) { + if (!AddFalthroughFrom->isSuccessor(MBB)) + AddFalthroughFrom->addSuccessor(MBB); + AddFalthroughFrom->normalizeSuccProbs(); + AddFalthroughFrom = nullptr; + } + if (parseBasicBlock(*MBB, AddFalthroughFrom)) return true; // The method 'parseBasicBlock' should parse the whole block until the next // block or the end of file. @@ -878,6 +912,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) { } } +bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) { + if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore)) + return error("expected '_', register class, or register bank name"); + StringRef::iterator Loc = Token.location(); + StringRef Name = Token.stringValue(); + + // Was it a register class? + auto RCNameI = PFS.Names2RegClasses.find(Name); + if (RCNameI != PFS.Names2RegClasses.end()) { + lex(); + const TargetRegisterClass &RC = *RCNameI->getValue(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::NORMAL: + RegInfo.Kind = VRegInfo::NORMAL; + if (RegInfo.Explicit && RegInfo.D.RC != &RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + return error(Loc, Twine("conflicting register classes, previously: ") + + Twine(TRI.getRegClassName(RegInfo.D.RC))); + } + RegInfo.D.RC = &RC; + RegInfo.Explicit = true; + return false; + + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + return error(Loc, "register class specification on generic register"); + } + llvm_unreachable("Unexpected register kind"); + } + + // Should be a register bank or a generic register. 
+ const RegisterBank *RegBank = nullptr; + if (Name != "_") { + auto RBNameI = PFS.Names2RegBanks.find(Name); + if (RBNameI == PFS.Names2RegBanks.end()) + return error(Loc, "expected '_', register class, or register bank name"); + RegBank = RBNameI->getValue(); + } + + lex(); + + switch (RegInfo.Kind) { + case VRegInfo::UNKNOWN: + case VRegInfo::GENERIC: + case VRegInfo::REGBANK: + RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC; + if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank) + return error(Loc, "conflicting generic register banks"); + RegInfo.D.RegBank = RegBank; + RegInfo.Explicit = true; + return false; + + case VRegInfo::NORMAL: + return error(Loc, "register bank specification on normal register"); + } + llvm_unreachable("Unexpected register kind"); +} + bool MIParser::parseRegisterFlag(unsigned &Flags) { const unsigned OldFlags = Flags; switch (Token.kind()) { @@ -1004,6 +1098,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, if (!TargetRegisterInfo::isVirtualRegister(Reg)) return error("subregister index expects a virtual register"); } + if (Token.is(MIToken::colon)) { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return error("register class specification expects a virtual register"); + lex(); + if (parseRegisterClassOrBank(*RegInfo)) + return true; + } MachineRegisterInfo &MRI = MF.getRegInfo(); if ((Flags & RegState::Define) == 0) { if (consumeIfPresent(MIToken::lparen)) { @@ -1598,6 +1699,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { return false; } +bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) { + assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + + uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); + while (true) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg; + if (parseNamedRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + // TODO: Report an error if the same register is used more than once. + if (Token.isNot(MIToken::comma)) + break; + lex(); + } + + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateRegMask(Mask); + return false; +} + bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { assert(Token.is(MIToken::kw_liveout)); const auto *TRI = MF.getSubtarget().getRegisterInfo(); @@ -1695,8 +1825,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, Dest = MachineOperand::CreateRegMask(RegMask); lex(); break; - } - LLVM_FALLTHROUGH; + } else + return parseCustomRegisterMaskOperand(Dest); default: // FIXME: Parse the MCSymbol machine operand. 
return error("expected a machine operand"); @@ -1969,6 +2099,28 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { return false; } +bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) { + Order = AtomicOrdering::NotAtomic; + if (Token.isNot(MIToken::Identifier)) + return false; + + Order = StringSwitch<AtomicOrdering>(Token.stringValue()) + .Case("unordered", AtomicOrdering::Unordered) + .Case("monotonic", AtomicOrdering::Monotonic) + .Case("acquire", AtomicOrdering::Acquire) + .Case("release", AtomicOrdering::Release) + .Case("acq_rel", AtomicOrdering::AcquireRelease) + .Case("seq_cst", AtomicOrdering::SequentiallyConsistent) + .Default(AtomicOrdering::NotAtomic); + + if (Order != AtomicOrdering::NotAtomic) { + lex(); + return false; + } + + return error("expected an atomic scope, ordering or a size integer literal"); +} + bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (expectAndConsume(MIToken::lparen)) return true; @@ -1986,6 +2138,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { Flags |= MachineMemOperand::MOStore; lex(); + // Optional "singlethread" scope. + SynchronizationScope Scope = SynchronizationScope::CrossThread; + if (Token.is(MIToken::Identifier) && Token.stringValue() == "singlethread") { + Scope = SynchronizationScope::SingleThread; + lex(); + } + + // Up to two atomic orderings (cmpxchg provides guarantees on failure). + AtomicOrdering Order, FailureOrder; + if (parseOptionalAtomicOrdering(Order)) + return true; + + if (parseOptionalAtomicOrdering(FailureOrder)) + return true; + if (Token.isNot(MIToken::IntegerLiteral)) return error("expected the size integer literal after memory operation"); uint64_t Size; @@ -2040,8 +2207,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { } if (expectAndConsume(MIToken::rparen)) return true; - Dest = - MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); + Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range, + Scope, Order, FailureOrder); return false; } diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h index 93a4d84ba62f..9b3879cf8377 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h @@ -45,11 +45,16 @@ struct VRegInfo { unsigned PreferredReg = 0; }; +typedef StringMap<const TargetRegisterClass*> Name2RegClassMap; +typedef StringMap<const RegisterBank*> Name2RegBankMap; + struct PerFunctionMIParsingState { BumpPtrAllocator Allocator; MachineFunction &MF; SourceMgr *SM; const SlotMapping &IRSlots; + const Name2RegClassMap &Names2RegClasses; + const Name2RegBankMap &Names2RegBanks; DenseMap<unsigned, MachineBasicBlock *> MBBSlots; DenseMap<unsigned, VRegInfo*> VRegInfos; @@ -59,7 +64,9 @@ struct PerFunctionMIParsingState { DenseMap<unsigned, unsigned> JumpTableSlots; PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, - const SlotMapping &IRSlots); + const SlotMapping &IRSlots, + const Name2RegClassMap &Names2RegClasses, + const Name2RegBankMap &Names2RegBanks); VRegInfo &getVRegInfo(unsigned VReg); }; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 3dff1147631b..bd04acd049db 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -55,9 +55,9 @@ class MIRParserImpl { StringMap<std::unique_ptr<yaml::MachineFunction>> Functions; 
 SlotMapping IRSlots;
   /// Maps from register class names to register classes.
-  StringMap<const TargetRegisterClass *> Names2RegClasses;
+  Name2RegClassMap Names2RegClasses;
   /// Maps from register bank names to register banks.
-  StringMap<const RegisterBank *> Names2RegBanks;
+  Name2RegBankMap Names2RegBanks;
 
 public:
   MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
@@ -325,11 +325,15 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
     return error(Twine("no machine function information for function '") +
                  MF.getName() + "' in the MIR file");
   // TODO: Recreate the machine function.
+  initNames2RegClasses(MF);
+  initNames2RegBanks(MF);
   const yaml::MachineFunction &YamlMF = *It->getValue();
   if (YamlMF.Alignment)
     MF.setAlignment(YamlMF.Alignment);
   MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
 
+  if (YamlMF.NoVRegs)
+    MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
   if (YamlMF.Legalized)
     MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
   if (YamlMF.RegBankSelected)
@@ -338,7 +342,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
   if (YamlMF.Selected)
     MF.getProperties().set(MachineFunctionProperties::Property::Selected);
 
-  PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+  PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses,
+                                Names2RegBanks);
   if (parseRegisterInfo(PFS, YamlMF))
     return true;
   if (!YamlMF.Constants.empty()) {
@@ -362,9 +367,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
   }
   PFS.SM = &SM;
 
-  if (MF.empty())
-    return error(Twine("machine function '") + Twine(MF.getName()) +
-                 "' requires at least one machine basic block in its body");
   // Initialize the frame information after creating all the MBBs so that the
   // MBB references in the frame information can be resolved.
   if (initializeFrameInfo(PFS, YamlMF))
@@ -462,17 +464,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
     RegInfo.addLiveIn(Reg, VReg);
   }
 
-  // Parse the callee saved register mask.
-  BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size());
-  if (!YamlMF.CalleeSavedRegisters)
-    return false;
-  for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
-    unsigned Reg = 0;
-    if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
-      return error(Error, RegSource.SourceRange);
-    CalleeSavedRegisterMask[Reg] = true;
+  // Parse the callee saved registers (registers that will
+  // be preserved for the caller).
+ if (YamlMF.CalleeSavedRegisters) { + SmallVector<MCPhysReg, 16> CalleeSavedRegisters; + for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + unsigned Reg = 0; + if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error)) + return error(Error, RegSource.SourceRange); + CalleeSavedRegisters.push_back(Reg); + } + RegInfo.setCalleeSavedRegs(CalleeSavedRegisters); } - RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); + return false; } @@ -505,14 +509,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS, } // Compute MachineRegisterInfo::UsedPhysRegMask - if (!YamlMF.CalleeSavedRegisters) { - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); - } + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + MRI.addPhysRegsUsedFromRegMask(MO.getRegMask()); } } } @@ -539,7 +541,8 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.ensureMaxAlignment(YamlMFI.MaxAlignment); MFI.setAdjustsStack(YamlMFI.AdjustsStack); MFI.setHasCalls(YamlMFI.HasCalls); - MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize); + if (YamlMFI.MaxCallFrameSize != ~0u) + MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize); MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment); MFI.setHasVAStart(YamlMFI.HasVAStart); MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc); @@ -818,7 +821,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) { const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, StringRef Name) { - initNames2RegClasses(MF); auto RegClassInfo = Names2RegClasses.find(Name); if (RegClassInfo == Names2RegClasses.end()) return nullptr; @@ -827,7 +829,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF, const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF, StringRef Name) { - initNames2RegBanks(MF); auto RegBankInfo = Names2RegBanks.find(Name); if (RegBankInfo == Names2RegBanks.end()) return nullptr; diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp index db87092177ca..6f6a67d81b0f 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "MIRPrinter.h" +#include "llvm/CodeGen/MIRPrinter.h" + #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" @@ -34,6 +35,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Format.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Options.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -42,6 +44,9 @@ using namespace llvm; +static cl::opt<bool> SimplifyMIR("simplify-mir", + cl::desc("Leave out unnecessary information when printing MIR")); + namespace { /// This structure describes how to print out stack object references. 
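Both the custom-regmask parser above (parseCustomRegisterMaskOperand) and the
printer below (printCustomRegMask) rely on the same layout: a register mask is
an array of uint32_t words holding one bit per physical register, with
register Reg stored at bit Reg % 32 of word Reg / 32, and a set bit meaning
the register is preserved. A minimal sketch of that convention (the helper
names are hypothetical, not part of the patch):

    #include <cstdint>

    // Mark physical register Reg as preserved in a mask allocated with one
    // bit per register (e.g. by MachineFunction::allocateRegisterMask).
    static inline void setPreservedReg(uint32_t *Mask, unsigned Reg) {
      Mask[Reg / 32] |= 1u << (Reg % 32);
    }

    // Test Reg's bit; printCustomRegMask prints exactly those registers for
    // which this returns true.
    static inline bool isPreservedReg(const uint32_t *Mask, unsigned Reg) {
      return (Mask[Reg / 32] & (1u << (Reg % 32))) != 0;
    }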
@@ -105,6 +110,9 @@ class MIPrinter { const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds; const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping; + bool canPredictBranchProbabilities(const MachineBasicBlock &MBB) const; + bool canPredictSuccessors(const MachineBasicBlock &MBB) const; + public: MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST, const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds, @@ -175,6 +183,8 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); + YamlMF.NoVRegs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs); YamlMF.Legalized = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Legalized); YamlMF.RegBankSelected = MF.getProperties().hasProperty( @@ -205,6 +215,25 @@ void MIRPrinter::print(const MachineFunction &MF) { Out << YamlMF; } +static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + assert(RegMask && "Can't print an empty register mask"); + OS << StringRef("CustomRegMask("); + + bool IsRegInRegMaskFound = false; + for (int I = 0, E = TRI->getNumRegs(); I < E; I++) { + // Check whether the register is asserted in regmask. + if (RegMask[I / 32] & (1u << (I % 32))) { + if (IsRegInRegMaskFound) + OS << ','; + printReg(I, OS, TRI); + IsRegInRegMaskFound = true; + } + } + + OS << ')'; +} + void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo, const TargetRegisterInfo *TRI) { @@ -239,20 +268,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, printReg(I->second, LiveIn.VirtualRegister, TRI); MF.LiveIns.push_back(LiveIn); } - // The used physical register mask is printed as an inverted callee saved - // register mask. - const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); - if (UsedPhysRegMask.none()) - return; - std::vector<yaml::FlowStringValue> CalleeSavedRegisters; - for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { - if (!UsedPhysRegMask[I]) { + + // Prints the callee saved registers. + if (RegInfo.isUpdatedCSRsInitialized()) { + const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs(); + std::vector<yaml::FlowStringValue> CalleeSavedRegisters; + for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) { yaml::FlowStringValue Reg; - printReg(I, Reg, TRI); + printReg(*I, Reg, TRI); CalleeSavedRegisters.push_back(Reg); } + MF.CalleeSavedRegisters = CalleeSavedRegisters; } - MF.CalleeSavedRegisters = CalleeSavedRegisters; } void MIRPrinter::convert(ModuleSlotTracker &MST, @@ -267,7 +294,8 @@ void MIRPrinter::convert(ModuleSlotTracker &MST, YamlMFI.MaxAlignment = MFI.getMaxAlignment(); YamlMFI.AdjustsStack = MFI.adjustsStack(); YamlMFI.HasCalls = MFI.hasCalls(); - YamlMFI.MaxCallFrameSize = MFI.getMaxCallFrameSize(); + YamlMFI.MaxCallFrameSize = MFI.isMaxCallFrameSizeComputed() + ? 
MFI.getMaxCallFrameSize() : ~0u; YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment(); YamlMFI.HasVAStart = MFI.hasVAStart(); YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc(); @@ -434,6 +462,63 @@ void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) { RegisterMaskIds.insert(std::make_pair(Mask, I++)); } +void llvm::guessSuccessors(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineBasicBlock*> &Result, + bool &IsFallthrough) { + SmallPtrSet<MachineBasicBlock*,8> Seen; + + for (const MachineInstr &MI : MBB) { + if (MI.isPHI()) + continue; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isMBB()) + continue; + MachineBasicBlock *Succ = MO.getMBB(); + auto RP = Seen.insert(Succ); + if (RP.second) + Result.push_back(Succ); + } + } + MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr(); + IsFallthrough = I == MBB.end() || !I->isBarrier(); +} + +bool +MIPrinter::canPredictBranchProbabilities(const MachineBasicBlock &MBB) const { + if (MBB.succ_size() <= 1) + return true; + if (!MBB.hasSuccessorProbabilities()) + return true; + + SmallVector<BranchProbability,8> Normalized(MBB.Probs.begin(), + MBB.Probs.end()); + BranchProbability::normalizeProbabilities(Normalized.begin(), + Normalized.end()); + SmallVector<BranchProbability,8> Equal(Normalized.size()); + BranchProbability::normalizeProbabilities(Equal.begin(), Equal.end()); + + return std::equal(Normalized.begin(), Normalized.end(), Equal.begin()); +} + +bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const { + SmallVector<MachineBasicBlock*,8> GuessedSuccs; + bool GuessedFallthrough; + guessSuccessors(MBB, GuessedSuccs, GuessedFallthrough); + if (GuessedFallthrough) { + const MachineFunction &MF = *MBB.getParent(); + MachineFunction::const_iterator NextI = std::next(MBB.getIterator()); + if (NextI != MF.end()) { + MachineBasicBlock *Next = const_cast<MachineBasicBlock*>(&*NextI); + if (!is_contained(GuessedSuccs, Next)) + GuessedSuccs.push_back(Next); + } + } + if (GuessedSuccs.size() != MBB.succ_size()) + return false; + return std::equal(MBB.succ_begin(), MBB.succ_end(), GuessedSuccs.begin()); +} + + void MIPrinter::print(const MachineBasicBlock &MBB) { assert(MBB.getNumber() >= 0 && "Invalid MBB number"); OS << "bb." 
<< MBB.getNumber(); @@ -472,13 +557,15 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { bool HasLineAttributes = false; // Print the successors - if (!MBB.succ_empty()) { + bool canPredictProbs = canPredictBranchProbabilities(MBB); + if (!MBB.succ_empty() && (!SimplifyMIR || !canPredictProbs || + !canPredictSuccessors(MBB))) { OS.indent(2) << "successors: "; for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorProbabilities()) + if (!SimplifyMIR || !canPredictProbs) OS << '(' << format("0x%08" PRIx32, MBB.getSuccProbability(I).getNumerator()) << ')'; @@ -860,7 +947,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, if (RegMaskInfo != RegisterMaskIds.end()) OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower(); else - llvm_unreachable("Can't print this machine register mask yet."); + printCustomRegMask(Op.getRegMask(), OS, TRI); break; } case MachineOperand::MO_RegisterLiveOut: { @@ -926,6 +1013,15 @@ void MIPrinter::print(const MachineMemOperand &Op) { assert(Op.isStore() && "Non load machine operand must be a store"); OS << "store "; } + + if (Op.getSynchScope() == SynchronizationScope::SingleThread) + OS << "singlethread "; + + if (Op.getOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getOrdering()) << ' '; + if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic) + OS << toIRString(Op.getFailureOrdering()) << ' '; + OS << Op.getSize(); if (const Value *Val = Op.getValue()) { OS << (Op.isLoad() ? " from " : " into "); diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.h b/contrib/llvm/lib/CodeGen/MIRPrinter.h deleted file mode 100644 index 16aa9038b6b2..000000000000 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.h +++ /dev/null @@ -1,33 +0,0 @@ -//===- MIRPrinter.h - MIR serialization format printer --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the functions that print out the LLVM IR and the machine -// functions using the MIR serialization format. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_CODEGEN_MIRPRINTER_H -#define LLVM_LIB_CODEGEN_MIRPRINTER_H - -namespace llvm { - -class MachineFunction; -class Module; -class raw_ostream; - -/// Print LLVM IR using the MIR serialization format to the given output stream. -void printMIR(raw_ostream &OS, const Module &M); - -/// Print a machine function using the MIR serialization format to the given -/// output stream. 
-void printMIR(raw_ostream &OS, const MachineFunction &MF); - -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp index c690bcfad567..671cf1eddc2d 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "MIRPrinter.h" +#include "llvm/CodeGen/MIRPrinter.h" + #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MIRYamlMapping.h" diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp index 3869f976854d..06112723497b 100644 --- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition())) + while (I != E && (I->isPHI() || I->isPosition() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels // inside the bundle. @@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { MachineBasicBlock::iterator MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) { + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); + iterator E = end(); - while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue())) + while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() || + TII->isBasicBlockPrologue(*I))) ++I; // FIXME: This needs to change if we wish to bundle labels / dbg_values // inside the bundle. @@ -225,7 +232,7 @@ StringRef MachineBasicBlock::getName() const { if (const BasicBlock *LBB = getBasicBlock()) return LBB->getName(); else - return "(null)"; + return StringRef("", 0); } /// Return a hopefully unique identifier for this block. @@ -417,7 +424,7 @@ void MachineBasicBlock::updateTerminator() { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - DebugLoc DL; // FIXME: this is nowhere + DebugLoc DL = findBranchDebugLoc(); bool B = TII->analyzeBranch(*this, TBB, FBB, Cond); (void) B; assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -485,7 +492,7 @@ void MachineBasicBlock::updateTerminator() { // FIXME: This does not seem like a reasonable pattern to support, but it // has been seen in the wild coming out of degenerate ARM test cases. TII->removeBranch(*this); - + // Finally update the unconditional successor to be reached via a branch if // it would not be reached by fallthrough. 
if (!isLayoutSuccessor(TBB)) @@ -681,16 +688,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { return std::next(I) == MachineFunction::const_iterator(MBB); } -bool MachineBasicBlock::canFallThrough() { +MachineBasicBlock *MachineBasicBlock::getFallThrough() { MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; // If FallthroughBlock is off the end of the function, it can't fall through. if (Fallthrough == getParent()->end()) - return false; + return nullptr; // If FallthroughBlock isn't a successor, no fallthrough is possible. if (!isSuccessor(&*Fallthrough)) - return false; + return nullptr; // Analyze the branches, if any, at the end of the block. MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -702,25 +709,31 @@ bool MachineBasicBlock::canFallThrough() { // is possible. The isPredicated check is needed because this code can be // called during IfConversion, where an instruction which is normally a // Barrier is predicated and thus no longer an actual control barrier. - return empty() || !back().isBarrier() || TII->isPredicated(back()); + return (empty() || !back().isBarrier() || TII->isPredicated(back())) + ? &*Fallthrough + : nullptr; } // If there is no branch, control always falls through. - if (!TBB) return true; + if (!TBB) return &*Fallthrough; // If there is some explicit branch to the fallthrough block, it can obviously // reach, even though the branch should get folded to fall through implicitly. if (MachineFunction::iterator(TBB) == Fallthrough || MachineFunction::iterator(FBB) == Fallthrough) - return true; + return &*Fallthrough; // If it's an unconditional branch to some block not the fall through, it // doesn't fall through. - if (Cond.empty()) return false; + if (Cond.empty()) return nullptr; // Otherwise, if it is conditional and has no explicit false block, it falls // through. - return FBB == nullptr; + return (FBB == nullptr) ? &*Fallthrough : nullptr; +} + +bool MachineBasicBlock::canFallThrough() { + return getFallThrough() != nullptr; } MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, @@ -1144,6 +1157,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return {}; } +/// Find and return the merged DebugLoc of the branch instructions of the block. +/// Return UnknownLoc if there is none. +DebugLoc +MachineBasicBlock::findBranchDebugLoc() { + DebugLoc DL; + auto TI = getFirstTerminator(); + while (TI != end() && !TI->isBranch()) + ++TI; + + if (TI != end()) { + DL = TI->getDebugLoc(); + for (++TI ; TI != end() ; ++TI) + if (TI->isBranch()) + DL = DILocation::getMergedLocation(DL, TI->getDebugLoc()); + } + return DL; +} + /// Return probability of the edge from this block to MBB. 
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 7d5124d30a04..4d1ec11df46c 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -26,9 +26,8 @@
 
 using namespace llvm;
 
-#define DEBUG_TYPE "block-freq"
+#define DEBUG_TYPE "machine-block-freq"
 
-#ifndef NDEBUG
 static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
     "view-machine-block-freq-propagation-dags", cl::Hidden,
     cl::desc(
@@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
                    "integer fractional block frequency representation."),
         clEnumValN(GVDT_Count, "count", "display a graph using the real "
                                         "profile count if available.")));
+// Similar to the option above, but used to control BFI display only after
+// the MBP pass.
+cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
+    "view-block-layout-with-bfi", cl::Hidden,
+    cl::desc(
+        "Pop up a window to show a dag displaying MBP layout and associated "
+        "block frequencies of the CFG."),
+    cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+               clEnumValN(GVDT_Fraction, "fraction",
+                          "display a graph using the "
+                          "fractional block frequency representation."),
+               clEnumValN(GVDT_Integer, "integer",
+                          "display a graph using the raw "
+                          "integer fractional block frequency representation."),
+               clEnumValN(GVDT_Count, "count",
+                          "display a graph using the real "
+                          "profile count if available.")));
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
 extern cl::opt<std::string> ViewBlockFreqFuncName;
+// Command line option to specify hot frequency threshold.
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-hot-freq-perc=
 extern cl::opt<unsigned> ViewHotFreqPercent;
 
+static GVDAGType getGVDT() {
+  if (ViewBlockLayoutWithBFI != GVDT_None)
+    return ViewBlockLayoutWithBFI;
+
+  return ViewMachineBlockFreqPropagationDAG;
+}
+
 namespace llvm {
 
 template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
@@ -80,12 +106,32 @@ template <>
 struct DOTGraphTraits<MachineBlockFrequencyInfo *>
     : public MBFIDOTGraphTraitsBase {
   explicit DOTGraphTraits(bool isSimple = false)
-      : MBFIDOTGraphTraitsBase(isSimple) {}
+      : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {}
+
+  const MachineFunction *CurFunc;
+  DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
 
   std::string getNodeLabel(const MachineBasicBlock *Node,
                            const MachineBlockFrequencyInfo *Graph) {
-    return MBFIDOTGraphTraitsBase::getNodeLabel(
-        Node, Graph, ViewMachineBlockFreqPropagationDAG);
+
+    int layout_order = -1;
+    // Attach additional ordering information if 'isSimple' is false.
+ if (!isSimple()) { + const MachineFunction *F = Node->getParent(); + if (!CurFunc || F != CurFunc) { + if (CurFunc) + LayoutOrderMap.clear(); + + CurFunc = F; + int O = 0; + for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) { + LayoutOrderMap[&*MBI] = O; + } + } + layout_order = LayoutOrderMap[Node]; + } + return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(), + layout_order); } std::string getNodeAttributes(const MachineBasicBlock *Node, @@ -102,13 +148,12 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *> }; } // end namespace llvm -#endif -INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) char MachineBlockFrequencyInfo::ID = 0; @@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { - MachineBranchProbabilityInfo &MBPI = - getAnalysis<MachineBranchProbabilityInfo>(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); +void MachineBlockFrequencyInfo::calculate( + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI) { if (!MBFI) MBFI.reset(new ImplType); MBFI->calculate(F, MBPI, MLI); -#ifndef NDEBUG if (ViewMachineBlockFreqPropagationDAG != GVDT_None && (ViewBlockFreqFuncName.empty() || F.getName().equals(ViewBlockFreqFuncName))) { - view(); + view("MachineBlockFrequencyDAGS." + F.getName()); } -#endif +} + +bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { + MachineBranchProbabilityInfo &MBPI = + getAnalysis<MachineBranchProbabilityInfo>(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + calculate(F, MBPI, MLI); return false; } @@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); } /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. -void MachineBlockFrequencyInfo::view() const { -// This code is only for debugging. -#ifndef NDEBUG - ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), - "MachineBlockFrequencyDAGs"); -#else - errs() << "MachineBlockFrequencyInfo::view is only available in debug builds " - "on systems with Graphviz or gv!\n"; -#endif // NDEBUG +void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const { + // This code is only for debugging. 
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple); } BlockFrequency diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 40e3840e6b0b..c1ca8e8e83b4 100644 --- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -32,14 +32,15 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" @@ -49,6 +50,8 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <functional> +#include <utility> using namespace llvm; #define DEBUG_TYPE "block-placement" @@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias( // Definition: // - Outlining: placement of a basic block outside the chain or hot path. -static cl::opt<bool> OutlineOptionalBranches( - "outline-optional-branches", - cl::desc("Outlining optional branches will place blocks that are optional " - "branches, i.e. branches with a common post dominator, outside " - "the hot path or chain"), - cl::init(false), cl::Hidden); - -static cl::opt<unsigned> OutlineOptionalThreshold( - "outline-optional-threshold", - cl::desc("Don't outline optional branches that are a single block with an " - "instruction count below this threshold"), - cl::init(4), cl::Hidden); - static cl::opt<unsigned> LoopToColdBlockRatio( "loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " @@ -136,20 +126,55 @@ BranchFoldPlacement("branch-fold-placement", cl::init(true), cl::Hidden); // Heuristic for tail duplication. -static cl::opt<unsigned> TailDuplicatePlacementThreshold( +static cl::opt<unsigned> TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden); +// Heuristic for aggressive tail duplication. +static cl::opt<unsigned> TailDupPlacementAggressiveThreshold( + "tail-dup-placement-aggressive-threshold", + cl::desc("Instruction cutoff for aggressive tail duplication during " + "layout. Used at -O3. Tail merging during layout is forced to " + "have a threshold that won't conflict."), cl::init(3), + cl::Hidden); + +// Heuristic for tail duplication. +static cl::opt<unsigned> TailDupPlacementPenalty( + "tail-dup-placement-penalty", + cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " + "Copying can increase fallthrough, but it also increases icache " + "pressure. This parameter controls the penalty to account for that. " + "Percent as integer."), + cl::init(2), + cl::Hidden); + +// Heuristic for triangle chains. 
+static cl::opt<unsigned> TriangleChainCount(
+    "triangle-chain-count",
+    cl::desc("Number of triangle-shaped CFGs that need to be in a row for the "
+             "triangle tail duplication heuristic to kick in. 0 to disable."),
+    cl::init(2),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
 
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
 namespace {
 class BlockChain;
 /// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
 }
 
 namespace {
@@ -193,12 +218,15 @@ public:
   /// \brief Iterator over blocks within the chain.
   typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
 
   /// \brief Beginning of blocks within the chain.
   iterator begin() { return Blocks.begin(); }
+  const_iterator begin() const { return Blocks.begin(); }
 
   /// \brief End of blocks within the chain.
   iterator end() { return Blocks.end(); }
+  const_iterator end() const { return Blocks.end(); }
 
   bool remove(MachineBasicBlock* BB) {
     for(iterator i = begin(); i != end(); ++i) {
@@ -217,25 +245,26 @@ public:
   /// updating the block -> chain mapping. It does not free or tear down the
   /// old chain, but the old chain's block list is no longer valid.
   void merge(MachineBasicBlock *BB, BlockChain *Chain) {
-    assert(BB);
-    assert(!Blocks.empty());
+    assert(BB && "Can't merge a null block.");
+    assert(!Blocks.empty() && "Can't merge into an empty chain.");
 
     // Fast path in case we don't have a chain already.
     if (!Chain) {
-      assert(!BlockToChain[BB]);
+      assert(!BlockToChain[BB] &&
+             "Passed chain is null, but BB has entry in BlockToChain.");
       Blocks.push_back(BB);
       BlockToChain[BB] = this;
       return;
     }
 
-    assert(BB == *Chain->begin());
+    assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
     assert(Chain->begin() != Chain->end());
 
     // Update the incoming blocks to point to this chain, and add them to the
     // chain structure.
     for (MachineBasicBlock *ChainBB : *Chain) {
       Blocks.push_back(ChainBB);
-      assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain");
+      assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
       BlockToChain[ChainBB] = this;
     }
   }
@@ -264,12 +293,28 @@ public:
 namespace {
 class MachineBlockPlacement : public MachineFunctionPass {
   /// \brief A typedef for a block filter set.
-  typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+  typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// Pair struct containing basic block and taildup profitability
+  struct BlockAndTailDupResult {
+    MachineBasicBlock *BB;
+    bool ShouldTailDup;
+  };
+
+  /// Struct containing an edge's weight and its source and destination.
+  struct WeightedEdge {
+    BlockFrequency Weight;
+    MachineBasicBlock *Src;
+    MachineBasicBlock *Dest;
+  };
 
   /// \brief work lists of blocks that are ready to be laid out
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
 
+  /// Edges that have already been computed as optimal.
+ DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges; + /// \brief Machine Function MachineFunction *F; @@ -294,7 +339,7 @@ class MachineBlockPlacement : public MachineFunctionPass { const TargetLoweringBase *TLI; /// \brief A handle to the post dominator tree. - MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; /// \brief Duplicator used to duplicate tails during placement. /// @@ -303,10 +348,6 @@ class MachineBlockPlacement : public MachineFunctionPass { /// must be done inline. TailDuplicator TailDup; - /// \brief A set of blocks that are unavoidably execute, i.e. they dominate - /// all terminators of the MachineFunction. - SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks; - /// \brief Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -322,7 +363,7 @@ class MachineBlockPlacement : public MachineFunctionPass { /// BlockChain it participates in, if any. We use it to, among other things, /// allow implicitly defining edges between chains as the existing edges /// between basic blocks. - DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; + DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain; #ifndef NDEBUG /// The set of basic blocks that have terminators that cannot be fully @@ -334,75 +375,107 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. - void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markChainSuccessors( + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); /// Decrease the UnscheduledPredecessors count for a single block, and /// if the count goes to 0, add them to the appropriate work list. 
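The chain bookkeeping these members implement reduces to a small amount of reference counting. A toy, self-contained restatement (std containers instead of DenseMap/SmallVector; the names are illustrative, not the pass's API):

```cpp
#include <unordered_map>
#include <vector>

struct Block {                      // stand-in for MachineBasicBlock
  std::vector<Block *> Succs;
};

struct Chain {                      // stand-in for BlockChain
  std::vector<Block *> Blocks;
  unsigned UnscheduledPredecessors = 0;
};

// Placing a chain releases one unscheduled-predecessor reference on every
// successor chain; a chain that drops to zero becomes a legal candidate
// that cannot break the CFG when laid out next.
void markChainPlaced(Chain &C,
                     std::unordered_map<Block *, Chain *> &BlockToChain,
                     std::vector<Chain *> &ReadyWorkList) {
  for (Block *B : C.Blocks)
    for (Block *S : B->Succs) {
      Chain *SC = BlockToChain.at(S);
      if (SC == &C)
        continue; // intra-chain edge, never counted
      if (--SC->UnscheduledPredecessors == 0)
        ReadyWorkList.push_back(SC);
    }
}
```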
void markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *BB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter = nullptr); - BranchProbability - collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, - const BlockFilterSet *BlockFilter, - SmallVector<MachineBasicBlock *, 4> &Successors); - bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &Chain, - const BlockFilterSet *BlockFilter, - BranchProbability SuccProb, - BranchProbability HotProb); + collectViableSuccessors( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector<MachineBasicBlock *, 4> &Successors); + bool shouldPredBlockBeOutlined( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter, + BranchProbability SuccProb, BranchProbability HotProb); bool repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, - MachineBasicBlock *LoopHeaderBB, + const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt); - bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred, - const BlockChain &Chain, - BlockFilterSet *BlockFilter, - MachineFunction::iterator &PrevUnplacedBlockIt, - bool &DuplicatedToPred); - bool - hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, - BlockChain &SuccChain, BranchProbability SuccProb, - BranchProbability RealSuccProb, BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock * - selectBestCandidateBlock(BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &WorkList); - MachineBasicBlock * - getFirstUnplacedBlock(const BlockChain &PlacedChain, - MachineFunction::iterator &PrevUnplacedBlockIt, - const BlockFilterSet *BlockFilter); + bool maybeTailDuplicateBlock( + MachineBasicBlock *BB, MachineBasicBlock *LPred, + BlockChain &Chain, BlockFilterSet *BlockFilter, + MachineFunction::iterator &PrevUnplacedBlockIt, + bool &DuplicatedToPred); + bool hasBetterLayoutPredecessor( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + MachineBasicBlock *selectBestCandidateBlock( + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList); + MachineBasicBlock *getFirstUnplacedBlock( + const BlockChain &PlacedChain, + MachineFunction::iterator &PrevUnplacedBlockIt, + const BlockFilterSet *BlockFilter); /// \brief Add a basic block to the work list if it is appropriate. /// /// If the optional parameter BlockFilter is provided, only MBB /// present in the set will be added to the worklist. If nullptr /// is provided, no filtering occurs. 
- void fillWorkLists(MachineBasicBlock *MBB, + void fillWorkLists(const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter); - void buildChain(MachineBasicBlock *BB, BlockChain &Chain, + void buildChain(const MachineBasicBlock *BB, BlockChain &Chain, BlockFilterSet *BlockFilter = nullptr); - MachineBasicBlock *findBestLoopTop(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit(MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - BlockFilterSet collectLoopBlockSet(MachineLoop &L); - void buildLoopChains(MachineLoop &L); - void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, - const BlockFilterSet &LoopBlockSet); - void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, - const BlockFilterSet &LoopBlockSet); - void collectMustExecuteBBs(); + MachineBasicBlock *findBestLoopTop( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit( + const MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(const MachineLoop &L); + void buildLoopChains(const MachineLoop &L); + void rotateLoop( + BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile( + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Returns true if a block should be tail-duplicated to increase fallthrough + /// opportunities. + bool shouldTailDuplicate(MachineBasicBlock *BB); + /// Check the edge frequencies to see if tail duplication will increase + /// fallthroughs. + bool isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability AdjustedSumProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Check for a trellis layout. + bool isTrellis(const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Get the best successor given a trellis layout. + BlockAndTailDupResult getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + /// Get the best pair of non-conflicting edges. + static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges); + /// Returns true if a block can tail duplicate into all unplaced + /// predecessors. Filters based on loop. + bool canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter); + /// Find chains of triangles to tail-duplicate where a global analysis works, + /// but a local analysis would not find them. 
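Read together, the new declarations describe a three-stage successor choice. A skeleton of that flow, with hypothetical hooks lookupComputedEdge / isTrellisTop / bestTrellisEdge / greedyBestEdge standing in for ComputedEdges, isTrellis, getBestTrellisSuccessor, and the greedy scan (declarations only; not LLVM's real signatures):

```cpp
#include <optional>

struct Block; // stand-in for MachineBasicBlock

struct Result {
  Block *BB;          // chosen layout successor (or nullptr)
  bool ShouldTailDup; // lay it out only if it is also tail-duplicated
};

// Hypothetical hooks; in the pass these are ComputedEdges, isTrellis,
// getBestTrellisSuccessor, and the greedy probability scan.
std::optional<Result> lookupComputedEdge(Block *BB);
bool isTrellisTop(Block *BB);
Result bestTrellisEdge(Block *BB);
Result greedyBestEdge(Block *BB);

Result selectBestSuccessorSketch(Block *BB) {
  // 1. Reuse an edge proved optimal by an earlier trellis/triangle analysis.
  if (auto Cached = lookupComputedEdge(BB))
    return *Cached;
  // 2. For a trellis, choose the globally best non-conflicting edge pair.
  if (isTrellisTop(BB))
    return bestTrellisEdge(BB);
  // 3. Otherwise fall back to the greedy scan, which may propose a
  //    CFG-breaking successor repaired by tail duplication.
  return greedyBestEdge(BB);
}
```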
+ void precomputeTriangleChains(); public: static char ID; // Pass identification, replacement for typeid @@ -415,7 +488,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineBlockFrequencyInfo>(); - AU.addRequired<MachineDominatorTree>(); + if (TailDupPlacement) + AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -425,20 +499,20 @@ public: char MachineBlockPlacement::ID = 0; char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID; -INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) #ifndef NDEBUG /// \brief Helper to print the name of a MBB. /// /// Only used by debug logging. -static std::string getBlockName(MachineBasicBlock *BB) { +static std::string getBlockName(const MachineBasicBlock *BB) { std::string Result; raw_string_ostream OS(Result); OS << "BB#" << BB->getNumber(); @@ -455,7 +529,7 @@ static std::string getBlockName(MachineBasicBlock *BB) { /// having one fewer active predecessor. It also adds any successors of this /// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( - BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, + const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. @@ -471,8 +545,8 @@ void MachineBlockPlacement::markChainSuccessors( /// and was duplicated into the chain end, we need to redo markBlockSuccessors /// for just that block. void MachineBlockPlacement::markBlockSuccessors( - BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter) { + const BlockChain &Chain, const MachineBasicBlock *MBB, + const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { // Add any successors for which this is the only un-placed in-loop // predecessor to the worklist as a viable candidate for CFG-neutral // placement. No subsequent placement of this block will violate the CFG @@ -504,7 +578,8 @@ void MachineBlockPlacement::markBlockSuccessors( /// the total branch probability of edges from \p BB to those /// blocks. BranchProbability MachineBlockPlacement::collectViableSuccessors( - MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, SmallVector<MachineBasicBlock *, 4> &Successors) { // Adjust edge probabilities by excluding edges pointing to blocks that is // either not in BlockFilter or is already in the current chain. 
Consider the @@ -561,46 +636,573 @@ getAdjustedProbability(BranchProbability OrigProb, return SuccProb; } -/// When the option OutlineOptionalBranches is on, this method -/// checks if the fallthrough candidate block \p Succ (of block -/// \p BB) also has other unscheduled predecessor blocks which -/// are also successors of \p BB (forming triangular shape CFG). -/// If none of such predecessors are small, it returns true. -/// The caller can choose to select \p Succ as the layout successors -/// so that \p Succ's predecessors (optional branches) can be -/// outlined. -/// FIXME: fold this with more general layout cost analysis. -bool MachineBlockPlacement::shouldPredBlockBeOutlined( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, - const BlockFilterSet *BlockFilter, BranchProbability SuccProb, - BranchProbability HotProb) { - if (!OutlineOptionalBranches) +/// Check if \p BB has exactly the successors in \p Successors. +static bool +hasSameSuccessors(MachineBasicBlock &BB, + SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { + if (BB.succ_size() != Successors.size()) + return false; + // We don't want to count self-loops + if (Successors.count(&BB)) + return false; + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// Check if a block should be tail duplicated to increase fallthrough +/// opportunities. +/// \p BB Block to check. +bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { + // Blocks with single successors don't create additional fallthrough + // opportunities. Don't duplicate them. TODO: When conditional exits are + // analyzable, allow them to be duplicated. + bool IsSimple = TailDup.isSimpleBB(BB); + + if (BB->succ_size() == 1) + return false; + return TailDup.shouldTailDuplicate(IsSimple, *BB); +} + +/// Compare 2 BlockFrequency's with a small penalty for \p A. +/// In order to be conservative, we apply a X% penalty to account for +/// increased icache pressure and static heuristics. For small frequencies +/// we use only the numerators to improve accuracy. For simplicity, we assume the +/// penalty is less than 100% +/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. +static bool greaterWithBias(BlockFrequency A, BlockFrequency B, + uint64_t EntryFreq) { + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + BlockFrequency Gain = A - B; + return (Gain / ThresholdProb).getFrequency() >= EntryFreq; +} + +/// Check the edge frequencies to see if tail duplication will increase +/// fallthroughs. It only makes sense to call this function when +/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is +/// always locally profitable if we would have picked \p Succ without +/// considering duplication. +bool MachineBlockPlacement::isProfitableToTailDup( + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + BranchProbability QProb, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // We need to do a probability calculation to make sure this is profitable. + // First: does succ have a successor that post-dominates? This affects the + // calculation. 
The 2 relevant cases are: + // BB BB + // | \Qout | \Qout + // P| C |P C + // = C' = C' + // | /Qin | /Qin + // | / | / + // Succ Succ + // / \ | \ V + // U/ =V |U \ + // / \ = D + // D E | / + // | / + // |/ + // PDom + // '=' : Branch taken for that CFG edge + // In the second case, Placing Succ while duplicating it into C prevents the + // fallthrough of Succ into either D or PDom, because they now have C as an + // unplaced predecessor + + // Start by figuring out which case we fall into + MachineBasicBlock *PDom = nullptr; + SmallVector<MachineBasicBlock *, 4> SuccSuccs; + // Only scan the relevant successors + auto AdjustedSuccSumProb = + collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs); + BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ); + auto BBFreq = MBFI->getBlockFreq(BB); + auto SuccFreq = MBFI->getBlockFreq(Succ); + BlockFrequency P = BBFreq * PProb; + BlockFrequency Qout = BBFreq * QProb; + uint64_t EntryFreq = MBFI->getEntryFreq(); + // If there are no more successors, it is profitable to copy, as it strictly + // increases fallthrough. + if (SuccSuccs.size() == 0) + return greaterWithBias(P, Qout, EntryFreq); + + auto BestSuccSucc = BranchProbability::getZero(); + // Find the PDom or the best Succ if no PDom exists. + for (MachineBasicBlock *SuccSucc : SuccSuccs) { + auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc); + if (Prob > BestSuccSucc) + BestSuccSucc = Prob; + if (PDom == nullptr) + if (MPDT->dominates(SuccSucc, Succ)) { + PDom = SuccSucc; + break; + } + } + // For the comparisons, we need to know Succ's best incoming edge that isn't + // from BB. + auto SuccBestPred = BlockFrequency(0); + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + if (SuccPred == Succ || SuccPred == BB + || BlockToChain[SuccPred] == &Chain + || (BlockFilter && !BlockFilter->count(SuccPred))) + continue; + auto Freq = MBFI->getBlockFreq(SuccPred) + * MBPI->getEdgeProbability(SuccPred, Succ); + if (Freq > SuccBestPred) + SuccBestPred = Freq; + } + // Qin is Succ's best unplaced incoming edge that isn't BB + BlockFrequency Qin = SuccBestPred; + // If it doesn't have a post-dominating successor, here is the calculation: + // BB BB + // | \Qout | \ + // P| C | = + // = C' | C + // | /Qin | | + // | / | C' (+Succ) + // Succ Succ /| + // / \ | \/ | + // U/ =V | == | + // / \ | / \| + // D E D E + // '=' : Branch taken for that CFG edge + // Cost in the first case is: P + V + // For this calculation, we always assume P > Qout. If Qout > P + // The result of this function will be ignored at the caller. 
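Before the case analysis continues below, a worked instance of the biased comparison may help. With the default 2% penalty, greaterWithBias(A, B, EntryFreq) asks whether the gain (A - B) covers 2% of the entry frequency (made-up numbers):

```cpp
// ThresholdProb = 2/100, so:  (A - B) / (2/100) >= EntryFreq
//                       <=>   (A - B) * 50      >= EntryFreq
//
// EntryFreq = 1000:
//   A = 130, B = 100 -> Gain = 30; 30 * 50 = 1500 >= 1000 -> profitable
//   A = 115, B = 100 -> Gain = 15; 15 * 50 =  750 <  1000 -> rejected
```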
+ // Let F = SuccFreq - Qin + // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V + + if (PDom == nullptr || !Succ->isSuccessor(PDom)) { + BranchProbability UProb = BestSuccSucc; + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency F = SuccFreq - Qin; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency QinU = std::min(Qin, F) * UProb; + BlockFrequency BaseCost = P + V; + BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb; + return greaterWithBias(BaseCost, DupCost, EntryFreq); + } + BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom); + BranchProbability VProb = AdjustedSuccSumProb - UProb; + BlockFrequency U = SuccFreq * UProb; + BlockFrequency V = SuccFreq * VProb; + BlockFrequency F = SuccFreq - Qin; + // If there is a post-dominating successor, here is the calculation: + // BB BB BB BB + // | \Qout | \ | \Qout | \ + // |P C | = |P C | = + // = C' |P C = C' |P C + // | /Qin | | | /Qin | | + // | / | C' (+Succ) | / | C' (+Succ) + // Succ Succ /| Succ Succ /| + // | \ V | \/ | | \ V | \/ | + // |U \ |U /\ =? |U = |U /\ | + // = D = = =?| | D | = =| + // | / |/ D | / |/ D + // | / | / | = | / + // |/ | / |/ | = + // Dom Dom Dom Dom + // '=' : Branch taken for that CFG edge + // The cost for taken branches in the first case is P + U + // Let F = SuccFreq - Qin + // The cost in the second case (assuming independence), given the layout: + // BB, Succ, (C+Succ), D, Dom or the layout: + // BB, Succ, D, Dom, (C+Succ) + // is Qout + max(F, Qin) * U + min(F, Qin) + // compare P + U vs Qout + P * U + Qin. + // + // The 3rd and 4th cases cover when Dom would be chosen to follow Succ. + // + // For the 3rd case, the cost is P + 2 * V + // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V + // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V + if (UProb > AdjustedSuccSumProb / 2 && + !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb, + Chain, BlockFilter)) + // Cases 3 & 4 + return greaterWithBias( + (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb), + EntryFreq); + // Cases 1 & 2 + return greaterWithBias((P + U), + (Qout + std::min(Qin, F) * AdjustedSuccSumProb + + std::max(Qin, F) * UProb), + EntryFreq); +} + +/// Check for a trellis layout. \p BB is the upper part of a trellis if its +/// successors form the lower part of a trellis. A successor set S forms the +/// lower part of a trellis if all of the predecessors of S are either in S or +/// have all of S as successors. We ignore trellises where BB doesn't have 2 +/// successors because for fewer than 2, it's trivial, and for 3 or greater they +/// are very uncommon and complex to compute optimally. Allowing edges within S +/// is not strictly a trellis, but the same algorithm works, so we allow it. +bool MachineBlockPlacement::isTrellis( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + // Technically BB could form a trellis with branching factor higher than 2. + // But that's extremely uncommon. + if (BB->succ_size() != 2 || ViableSuccs.size() != 2) return false; - // If we outline optional branches, look whether Succ is unavoidable, i.e. - // dominates all terminators of the MachineFunction. If it does, other - // successors must be optional. Don't do this for cold branches. 
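Collapsing the diagrams above into formulas (same symbols as the comments, with F = SuccFreq - Qin, S = AdjustedSuccSumProb, and "by at least the bias" meaning the left side must exceed the right by 2% of the entry frequency), duplication of Succ wins when:

```latex
\[
\begin{aligned}
\text{no PDom:}    \quad & P + V \;>\; Q_{out} + \min(Q_{in},F)\,U + \max(Q_{in},F)\,V \\
\text{cases 1, 2:} \quad & P + U \;>\; Q_{out} + \min(Q_{in},F)\,S + \max(Q_{in},F)\,U \\
\text{cases 3, 4:} \quad & P + V \;>\; Q_{out} + \max(Q_{in},F)\,V + \min(Q_{in},F)\,U
\end{aligned}
\]
```

This is only a restatement of the comparisons the code performs via greaterWithBias, not a new cost model.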
- if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { - for (MachineBasicBlock *Pred : Succ->predecessors()) { - // Check whether there is an unplaced optional branch. - if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + + SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(), + BB->succ_end()); + // To avoid reviewing the same predecessors twice. + SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds; + + for (MachineBasicBlock *Succ : ViableSuccs) { + int PredCount = 0; + for (auto SuccPred : Succ->predecessors()) { + // Allow triangle successors, but don't count them. + if (Successors.count(SuccPred)) { + // Make sure that it is actually a triangle. + for (MachineBasicBlock *CheckSucc : SuccPred->successors()) + if (!Successors.count(CheckSucc)) + return false; + continue; + } + const BlockChain *PredChain = BlockToChain[SuccPred]; + if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) || + PredChain == &Chain || PredChain == BlockToChain[Succ]) continue; - // Check whether the optional branch has exactly one BB. - if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) + ++PredCount; + // Perform the successor check only once. + if (!SeenPreds.insert(SuccPred).second) continue; - // Check whether the optional branch is small. - if (Pred->size() < OutlineOptionalThreshold) + if (!hasSameSuccessors(*SuccPred, Successors)) return false; } - return true; - } else + // If one of the successors has only BB as a predecessor, it is not a + // trellis. + if (PredCount < 1) + return false; + } + return true; +} + +/// Pick the highest total weight pair of edges that can both be laid out. +/// The edges in \p Edges[0] are assumed to have a different destination than +/// the edges in \p Edges[1]. Simple counting shows that the best pair is either +/// the individual highest weight edges to the 2 different destinations, or in +/// case of a conflict, one of them should be replaced with a 2nd best edge. +std::pair<MachineBlockPlacement::WeightedEdge, + MachineBlockPlacement::WeightedEdge> +MachineBlockPlacement::getBestNonConflictingEdges( + const MachineBasicBlock *BB, + MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> + Edges) { + // Sort the edges, and then for each successor, find the best incoming + // predecessor. If the best incoming predecessors aren't the same, + // then that is clearly the best layout. If there is a conflict, one of the + // successors will have to fallthrough from the second best predecessor. We + // compare which combination is better overall. + + // Sort for highest frequency. + auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; }; + + std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp); + std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp); + auto BestA = Edges[0].begin(); + auto BestB = Edges[1].begin(); + // Arrange for the correct answer to be in BestA and BestB + // If the 2 best edges don't conflict, the answer is already there. + if (BestA->Src == BestB->Src) { + // Compare the total fallthrough of (Best + Second Best) for both pairs + auto SecondBestA = std::next(BestA); + auto SecondBestB = std::next(BestB); + BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight; + BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight; + if (BestAScore < BestBScore) + BestA = SecondBestA; + else + BestB = SecondBestB; + } + // Arrange for the BB edge to be in BestA if it exists. 
+ if (BestB->Src == BB) + std::swap(BestA, BestB); + return std::make_pair(*BestA, *BestB); +} + +/// Get the best successor from \p BB based on \p BB being part of a trellis. +/// We only handle trellises with 2 successors, so the algorithm is +/// straightforward: Find the best pair of edges that don't conflict. We find +/// the best incoming edge for each successor in the trellis. If those conflict, +/// we consider which of them should be replaced with the second best. +/// Upon return the two best edges will be in \p BestEdges. If one of the edges +/// comes from \p BB, it will be in \p BestEdges[0] +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::getBestTrellisSuccessor( + const MachineBasicBlock *BB, + const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, + BranchProbability AdjustedSumProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + + BlockAndTailDupResult Result = {nullptr, false}; + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + + // We assume size 2 because it's common. For general n, we would have to do + // the Hungarian algorithm, but it's not worth the complexity because more + // than 2 successors is fairly uncommon, and a trellis even more so. + if (Successors.size() != 2 || ViableSuccs.size() != 2) + return Result; + + // Collect the edge frequencies of all edges that form the trellis. + SmallVector<WeightedEdge, 8> Edges[2]; + int SuccIndex = 0; + for (auto Succ : ViableSuccs) { + for (MachineBasicBlock *SuccPred : Succ->predecessors()) { + // Skip any placed predecessors that are not BB + if (SuccPred != BB) + if ((BlockFilter && !BlockFilter->count(SuccPred)) || + BlockToChain[SuccPred] == &Chain || + BlockToChain[SuccPred] == BlockToChain[Succ]) + continue; + BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) * + MBPI->getEdgeProbability(SuccPred, Succ); + Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ}); + } + ++SuccIndex; + } + + // Pick the best combination of 2 edges from all the edges in the trellis. + WeightedEdge BestA, BestB; + std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); + + if (BestA.Src != BB) { + // If we have a trellis, and BB doesn't have the best fallthrough edges, + // we shouldn't choose any successor. We've already looked and there's a + // better fallthrough edge for all the successors. + DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n"); + return Result; + } + + // Did we pick the triangle edge? If tail-duplication is profitable, do + // that instead. Otherwise merge the triangle edge now while we know it is + // optimal. + if (BestA.Dest == BestB.Src) { + // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2 + // would be better. + MachineBasicBlock *Succ1 = BestA.Dest; + MachineBasicBlock *Succ2 = BestB.Dest; + // Check to see if tail-duplication would be profitable. + if (TailDupPlacement && shouldTailDuplicate(Succ2) && + canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) && + isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1), + Chain, BlockFilter)) { + DEBUG(BranchProbability Succ2Prob = getAdjustedProbability( + MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(Succ2) + << ", probability: " << Succ2Prob << " (Tail Duplicate)\n"); + Result.BB = Succ2; + Result.ShouldTailDup = true; + return Result; + } + } + // We have already computed the optimal edge for the other side of the + // trellis. 
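The store on the next line and the matching lookup in selectBestSuccessor further down form a small cache protocol: record an edge once it is proved optimal, consume it at most once, and re-validate before trusting it. In isolation (std::unordered_map and hypothetical types standing in for the pass's DenseMap):

```cpp
#include <optional>
#include <unordered_map>

struct Block;
struct BestEdge { Block *Dest; bool ShouldTailDup; };

std::unordered_map<Block *, BestEdge> ComputedEdges; // toy cache

// Producer: record the edge the global analysis proved optimal.
void recordOptimalEdge(Block *Src, Block *Dest) {
  ComputedEdges[Src] = {Dest, /*ShouldTailDup=*/false};
}

// Consumer: take the cached edge exactly once. The caller still checks that
// the edge is applicable, because placement may have changed since it was
// recorded.
std::optional<BestEdge> takeComputedEdge(Block *Src) {
  auto It = ComputedEdges.find(Src);
  if (It == ComputedEdges.end())
    return std::nullopt;
  BestEdge E = It->second;
  ComputedEdges.erase(It);
  return E;
}
```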
+ ComputedEdges[BestB.Src] = { BestB.Dest, false }; + + auto TrellisSucc = BestA.Dest; + DEBUG(BranchProbability SuccProb = getAdjustedProbability( + MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb); + dbgs() << " Selected: " << getBlockName(TrellisSucc) + << ", probability: " << SuccProb << " (Trellis)\n"); + Result.BB = TrellisSucc; + return Result; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. +bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( + const MachineBasicBlock *BB, MachineBasicBlock *Succ, + const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + if (!shouldTailDuplicate(Succ)) return false; + + // For CFG checking. + SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), + BB->succ_end()); + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors can be + // tail-duplicated into. + // Skip any blocks that are already placed or not in this loop. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) + // This will result in a trellis after tail duplication, so we don't + // need to copy Succ into this predecessor. In the presence + // of a trellis tail duplication can continue to be profitable. + // For example: + // A A + // |\ |\ + // | \ | \ + // | C | C+BB + // | / | | + // |/ | | + // BB => BB | + // |\ |\/| + // | \ |/\| + // | D | D + // | / | / + // |/ |/ + // Succ Succ + // + // After BB was duplicated into C, the layout looks like the one on the + // right. BB and C now have the same successors. When considering + // whether Succ can be duplicated into all its unplaced predecessors, we + // ignore C. + // We can do this because C already has a profitable fallthrough, namely + // D. TODO(iteratee): ignore sufficiently cold predecessors for + // duplication and for this test. + // + // This allows trellises to be laid out in 2 separate chains + // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic + // because it allows the creation of 2 fallthrough paths with links + // between them, and we correctly identify the best layout for these + // CFGs. We want to extend trellises that the user created in addition + // to trellises created by tail-duplication, so we just look for the + // CFG. + continue; + return false; + } + } + return true; +} + +/// Find chains of triangles where we believe it would be profitable to +/// tail-duplicate them all, but a local analysis would not find them. +/// There are 3 ways this can be profitable: +/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with +/// longer chains) +/// 2) The chains are statically correlated. Branch probabilities have a very +/// U-shaped distribution. +/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] +/// If the branches in a chain are likely to be from the same side of the +/// distribution as their predecessor, but are independent at runtime, this +/// transformation is profitable. (Because the cost of being wrong is a small +/// fixed cost, unlike the standard triangle layout where the cost of being +/// wrong scales with the # of triangles.) +/// 3) The chains are dynamically correlated. 
If the probability that a previous
+/// branch was taken positively influences whether the next branch will be
+/// taken.
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+  struct TriangleChain {
+    std::vector<MachineBasicBlock *> Edges;
+    TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+        : Edges({src, dst}) {}
+
+    void append(MachineBasicBlock *dst) {
+      assert(getKey()->isSuccessor(dst) &&
+             "Attempting to append a block that is not a successor.");
+      Edges.push_back(dst);
+    }
+
+    unsigned count() const { return Edges.size() - 1; }
+
+    MachineBasicBlock *getKey() const {
+      return Edges.back();
+    }
+  };
+
+  if (TriangleChainCount == 0)
+    return;
+
+  DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+  // Map from last block to the chain that contains it. This allows us to
+  // extend chains as we find new triangles.
+  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+  for (MachineBasicBlock &BB : *F) {
+    // If BB doesn't have 2 successors, it doesn't start a triangle.
+    if (BB.succ_size() != 2)
+      continue;
+    MachineBasicBlock *PDom = nullptr;
+    for (MachineBasicBlock *Succ : BB.successors()) {
+      if (!MPDT->dominates(Succ, &BB))
+        continue;
+      PDom = Succ;
+      break;
+    }
+    // If BB doesn't have a post-dominating successor, it doesn't form a
+    // triangle.
+    if (PDom == nullptr)
+      continue;
+    // If PDom has a hint that it is low probability, skip this triangle.
+    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+      continue;
+    // If PDom isn't eligible for duplication, this isn't the kind of triangle
+    // we're looking for.
+    if (!shouldTailDuplicate(PDom))
+      continue;
+    bool CanTailDuplicate = true;
+    // If PDom can't tail-duplicate into its non-BB predecessors, then this
+    // isn't the kind of triangle we're looking for.
+    for (MachineBasicBlock* Pred : PDom->predecessors()) {
+      if (Pred == &BB)
+        continue;
+      if (!TailDup.canTailDuplicate(PDom, Pred)) {
+        CanTailDuplicate = false;
+        break;
+      }
+    }
+    // If we can't tail-duplicate PDom to its predecessors, then skip this
+    // triangle.
+    if (!CanTailDuplicate)
+      continue;
+
+    // Now we have an interesting triangle. Insert it if it's not part of an
+    // existing chain.
+    // Note: This cannot be replaced with a call to insert() or emplace()
+    // because the find key is BB, but the insert/emplace key is PDom.
+    auto Found = TriangleChainMap.find(&BB);
+    // If it is, remove the chain from the map, grow it, and put it back in the
+    // map with the end as the new key.
+    if (Found != TriangleChainMap.end()) {
+      TriangleChain Chain = std::move(Found->second);
+      TriangleChainMap.erase(Found);
+      Chain.append(PDom);
+      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+    } else {
+      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+      assert(InsertResult.second && "Block seen twice.");
+      (void)InsertResult;
+    }
+  }
+
+  // Iterating over a DenseMap is safe here, because the only thing in the body
+  // of the loop is inserting into another DenseMap (ComputedEdges).
+  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
+  for (auto &ChainPair : TriangleChainMap) {
+    TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that due to branch correlation, duplicating 2 or
+    // more triangles is profitable, despite the calculations assuming
+    // independence.
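The find/erase/append/reinsert sequence in the loop above is the usual way to grow a chain that is keyed by its last element; in isolation (std::unordered_map and a hypothetical chain type standing in for the real ones):

```cpp
#include <unordered_map>
#include <utility>
#include <vector>

struct Block;
struct TriChain { std::vector<Block *> Edges; };

// Extend the chain ending at From with the edge From -> To. Because the map
// key is always a chain's last block, growing a chain means removing it,
// appending, and reinserting under the new tail.
void extendChain(std::unordered_map<Block *, TriChain> &Chains,
                 Block *From, Block *To) {
  auto It = Chains.find(From);
  if (It != Chains.end()) {
    TriChain C = std::move(It->second);
    Chains.erase(It);
    C.Edges.push_back(To);
    Chains.emplace(To, std::move(C));
  } else {
    Chains.emplace(To, TriChain{{From, To}});
  }
}
```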
+ if (Chain.count() < TriangleChainCount) + continue; + MachineBasicBlock *dst = Chain.Edges.back(); + Chain.Edges.pop_back(); + for (MachineBasicBlock *src : reverse(Chain.Edges)) { + DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" << + getBlockName(dst) << " as pre-computed based on triangles.\n"); + + auto InsertResult = ComputedEdges.insert({src, {dst, true}}); + assert(InsertResult.second && "Block seen twice."); + (void)InsertResult; + + dst = src; + } + } } // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( - MachineBasicBlock *BB) { + const MachineBasicBlock *BB) { if (!BB->getParent()->getFunction()->getEntryCount()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { @@ -609,11 +1211,11 @@ static BranchProbability getLayoutSuccessorProbThreshold( if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { /* See case 1 below for the cost analysis. For BB->Succ to * be taken with smaller cost, the following needs to hold: - * Prob(BB->Succ) > 2* Prob(BB->Pred) - * So the threshold T - * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, - * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified - * branch bias, we have + * Prob(BB->Succ) > 2 * Prob(BB->Pred) + * So the threshold T in the calculation below + * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred) + * So T / (1 - T) = 2, Yielding T = 2/3 + * Also adding user specified branch bias, we have * T = (2/3)*(ProfileLikelyProb/50) * = (2*ProfileLikelyProb)/150) */ @@ -625,10 +1227,17 @@ static BranchProbability getLayoutSuccessorProbThreshold( /// Checks to see if the layout candidate block \p Succ has a better layout /// predecessor than \c BB. If yes, returns true. +/// \p SuccProb: The probability adjusted for only remaining blocks. +/// Only used for logging +/// \p RealSuccProb: The un-adjusted probability. +/// \p Chain: The chain that BB belongs to and Succ is being considered for. +/// \p BlockFilter: if non-null, the set of blocks that make up the loop being +/// considered bool MachineBlockPlacement::hasBetterLayoutPredecessor( - MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, - BranchProbability SuccProb, BranchProbability RealSuccProb, - BlockChain &Chain, const BlockFilterSet *BlockFilter) { + const MachineBasicBlock *BB, const MachineBasicBlock *Succ, + const BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { // There isn't a better layout when there are no unscheduled predecessors. if (SuccChain.UnscheduledPredecessors == 0) @@ -734,11 +1343,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( // | Pred----| | S1---- // | | | | // --(S1 or S2) ---Pred-- + // | + // S2 // // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2) // + min(freq(Pred->S1), freq(Pred->S2)) // Non-topo-order cost: - // In the worst case, S2 will not get laid out after Pred. // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2). // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2)) // is 0. 
Then the non topo layout is better when @@ -756,13 +1366,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( for (MachineBasicBlock *Pred : Succ->predecessors()) { if (Pred == Succ || BlockToChain[Pred] == &SuccChain || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) + BlockToChain[Pred] == &Chain || + // This check is redundant except for look ahead. This function is + // called for lookahead by isProfitableToTailDup when BB hasn't been + // placed yet. + (Pred == BB)) continue; // Do backward checking. // For all cases above, we need a backward checking to filter out edges that - // are not 'strongly' biased. With profile data available, the check is - // mostly redundant for case 2 (when threshold prob is set at 50%) unless S - // has more than two successors. + // are not 'strongly' biased. // BB Pred // \ / // Succ @@ -798,14 +1410,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( /// breaking CFG structure, but cave and break such structures in the case of /// very hot successor edges. /// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * -MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter) { +/// \returns The best successor block found, or null if none are viable, along +/// with a boolean indicating if tail duplication is necessary. +MachineBlockPlacement::BlockAndTailDupResult +MachineBlockPlacement::selectBestSuccessor( + const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - MachineBasicBlock *BestSucc = nullptr; + BlockAndTailDupResult BestSucc = { nullptr, false }; auto BestProb = BranchProbability::getZero(); SmallVector<MachineBasicBlock *, 4> Successors; @@ -813,22 +1426,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, collectViableSuccessors(BB, Chain, BlockFilter, Successors); DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); + + // if we already precomputed the best successor for BB, return that if still + // applicable. + auto FoundEdge = ComputedEdges.find(BB); + if (FoundEdge != ComputedEdges.end()) { + MachineBasicBlock *Succ = FoundEdge->second.BB; + ComputedEdges.erase(FoundEdge); + BlockChain *SuccChain = BlockToChain[Succ]; + if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) && + SuccChain != &Chain && Succ == *SuccChain->begin()) + return FoundEdge->second; + } + + // if BB is part of a trellis, Use the trellis to determine the optimal + // fallthrough edges + if (isTrellis(BB, Successors, Chain, BlockFilter)) + return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain, + BlockFilter); + + // For blocks with CFG violations, we may be able to lay them out anyway with + // tail-duplication. We keep this vector so we can perform the probability + // calculations the minimum number of times. + SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> + DupCandidates; for (MachineBasicBlock *Succ : Successors) { auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); - // This heuristic is off by default. 
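The getLayoutSuccessorProbThreshold comment above compresses a short derivation; spelled out for the two-successor triangle case, with p = Prob(BB->Succ) and Prob(BB->Succ) + Prob(BB->Pred) = 1:

```latex
\[
p > 2\,(1 - p) \;\Longleftrightarrow\; 3p > 2 \;\Longleftrightarrow\; p > \tfrac{2}{3},
\qquad
T = \frac{2}{3} \cdot \frac{\text{ProfileLikelyProb}}{50}
  = \frac{2 \cdot \text{ProfileLikelyProb}}{150}.
\]
```

With the default ProfileLikelyProb of 51 this puts the threshold just above 2/3, matching the code's (2*ProfileLikelyProb)/150 expression.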
- if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, - HotProb)) - return Succ; - BlockChain &SuccChain = *BlockToChain[Succ]; // Skip the edge \c BB->Succ if block \c Succ has a better layout // predecessor that yields lower global cost. if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, - Chain, BlockFilter)) + Chain, BlockFilter)) { + // If tail duplication would make Succ profitable, place it. + if (TailDupPlacement && shouldTailDuplicate(Succ)) + DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); continue; + } DEBUG( dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " @@ -836,17 +1472,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) { + if (BestSucc.BB && BestProb >= SuccProb) { DEBUG(dbgs() << " Not the best candidate, continuing\n"); continue; } DEBUG(dbgs() << " Setting it as best candidate\n"); - BestSucc = Succ; + BestSucc.BB = Succ; BestProb = SuccProb; } - if (BestSucc) - DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n"); + // Handle the tail duplication candidates in order of decreasing probability. + // Stop at the first one that is profitable. Also stop if they are less + // profitable than BestSucc. Position is important because we preserve it and + // prefer first best match. Here we aren't comparing in order, so we capture + // the position instead. + if (DupCandidates.size() != 0) { + auto cmp = + [](const std::tuple<BranchProbability, MachineBasicBlock *> &a, + const std::tuple<BranchProbability, MachineBasicBlock *> &b) { + return std::get<0>(a) > std::get<0>(b); + }; + std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp); + } + for(auto &Tup : DupCandidates) { + BranchProbability DupProb; + MachineBasicBlock *Succ; + std::tie(DupProb, Succ) = Tup; + if (DupProb < BestProb) + break; + if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) + && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { + DEBUG( + dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " + << DupProb + << " (Tail Duplicate)\n"); + BestSucc.BB = Succ; + BestSucc.ShouldTailDup = true; + break; + } + } + + if (BestSucc.BB) + DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n"); return BestSucc; } @@ -862,7 +1529,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, /// /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( - BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { + const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { // Once we need to walk the worklist looking for a candidate, cleanup the // worklist of already placed entries. 
// FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -881,13 +1548,15 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( MachineBasicBlock *BestBlock = nullptr; BlockFrequency BestFreq; for (MachineBasicBlock *MBB : WorkList) { - assert(MBB->isEHPad() == IsEHPad); + assert(MBB->isEHPad() == IsEHPad && + "EHPad mismatch between block and work list."); BlockChain &SuccChain = *BlockToChain[MBB]; if (&SuccChain == &Chain) continue; - assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block"); + assert(SuccChain.UnscheduledPredecessors == 0 && + "Found CFG-violating block"); BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB); DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "; @@ -948,16 +1617,19 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( } void MachineBlockPlacement::fillWorkLists( - MachineBasicBlock *MBB, + const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { BlockChain &Chain = *BlockToChain[MBB]; if (!UpdatedPreds.insert(&Chain).second) return; - assert(Chain.UnscheduledPredecessors == 0); + assert( + Chain.UnscheduledPredecessors == 0 && + "Attempting to place block with unscheduled predecessors in worklist."); for (MachineBasicBlock *ChainBB : Chain) { - assert(BlockToChain[ChainBB] == &Chain); + assert(BlockToChain[ChainBB] == &Chain && + "Block in chain doesn't match BlockToChain map."); for (MachineBasicBlock *Pred : ChainBB->predecessors()) { if (BlockFilter && !BlockFilter->count(Pred)) continue; @@ -970,23 +1642,23 @@ void MachineBlockPlacement::fillWorkLists( if (Chain.UnscheduledPredecessors != 0) return; - MBB = *Chain.begin(); - if (MBB->isEHPad()) - EHPadWorkList.push_back(MBB); + MachineBasicBlock *BB = *Chain.begin(); + if (BB->isEHPad()) + EHPadWorkList.push_back(BB); else - BlockWorkList.push_back(MBB); + BlockWorkList.push_back(BB); } void MachineBlockPlacement::buildChain( - MachineBasicBlock *BB, BlockChain &Chain, + const MachineBasicBlock *HeadBB, BlockChain &Chain, BlockFilterSet *BlockFilter) { - assert(BB && "BB must not be null.\n"); - assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); + assert(HeadBB && "BB must not be null.\n"); + assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); - MachineBasicBlock *LoopHeaderBB = BB; + const MachineBasicBlock *LoopHeaderBB = HeadBB; markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); - BB = *std::prev(Chain.end()); + MachineBasicBlock *BB = *std::prev(Chain.end()); for (;;) { assert(BB && "null block found at end of chain in loop."); assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); @@ -995,7 +1667,11 @@ void MachineBlockPlacement::buildChain( // Look for the best viable successor if there is one to place immediately // after this block. - MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + auto Result = selectBestSuccessor(BB, Chain, BlockFilter); + MachineBasicBlock* BestSucc = Result.BB; + bool ShouldTailDup = Result.ShouldTailDup; + if (TailDupPlacement) + ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc)); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1016,7 +1692,7 @@ void MachineBlockPlacement::buildChain( // Placement may have changed tail duplication opportunities. // Check for that now. 
- if (TailDupPlacement && BestSucc) { + if (TailDupPlacement && BestSucc && ShouldTailDup) { // If the chosen successor was duplicated into all its predecessors, // don't bother laying it out, just go round the loop again with BB as // the chain end. @@ -1052,7 +1728,7 @@ void MachineBlockPlacement::buildChain( /// unconditional jump (for the backedge) rotating it in front of the loop /// header is always profitable. MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineLoop &L, +MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // Placing the latch block before the header may introduce an extra branch // that skips this block the first time the loop is executed, which we want @@ -1116,7 +1792,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * -MachineBlockPlacement::findBestLoopExit(MachineLoop &L, +MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // We don't want to layout the loop linearly in all cases. If the loop header // is just a normal basic block in the loop, we want to look for what block @@ -1235,7 +1911,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L, /// branches. For example, if the loop has fallthrough into its header and out /// of its bottom already, don't rotate it. void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, - MachineBasicBlock *ExitingBB, + const MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet) { if (!ExitingBB) return; @@ -1285,7 +1961,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, /// Therefore, the cost for a given rotation is the sum of costs listed above. /// We select the best rotation with the smallest cost. void MachineBlockPlacement::rotateLoopWithProfile( - BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet) { auto HeaderBB = L.getHeader(); auto HeaderIter = find(LoopChain, HeaderBB); auto RotationPos = LoopChain.end(); @@ -1422,7 +2099,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( /// When profile data is available, exclude cold blocks from the returned set; /// otherwise, collect all blocks in the loop. MachineBlockPlacement::BlockFilterSet -MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { +MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { BlockFilterSet LoopBlockSet; // Filter cold blocks off from LoopBlockSet when profile data is available. @@ -1459,14 +2136,16 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { /// as much as possible. We can then stitch the chains together in a way which /// both preserves the topological structure and minimizes taken conditional /// branches. -void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { +void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { // First recurse through any nested loops, building chains for those inner // loops. 
- for (MachineLoop *InnerLoop : L) + for (const MachineLoop *InnerLoop : L) buildLoopChains(*InnerLoop); - assert(BlockWorkList.empty()); - assert(EHPadWorkList.empty()); + assert(BlockWorkList.empty() && + "BlockWorkList not empty when starting to build loop chains."); + assert(EHPadWorkList.empty() && + "EHPadWorkList not empty when starting to build loop chains."); BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); // Check if we have profile data for this function. If yes, we will rotate @@ -1496,10 +2175,11 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { // walk the blocks, and use a set to prevent visiting a particular chain // twice. SmallPtrSet<BlockChain *, 4> UpdatedPreds; - assert(LoopChain.UnscheduledPredecessors == 0); + assert(LoopChain.UnscheduledPredecessors == 0 && + "LoopChain should not have unscheduled predecessors."); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet); buildChain(LoopTop, LoopChain, &LoopBlockSet); @@ -1533,7 +2213,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { if (!LoopBlockSet.empty()) { BadLoop = true; - for (MachineBasicBlock *LoopBB : LoopBlockSet) + for (const MachineBasicBlock *LoopBB : LoopBlockSet) dbgs() << "Loop contains blocks never placed into a chain!\n" << " Loop header: " << getBlockName(*L.block_begin()) << "\n" << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" @@ -1546,31 +2226,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { EHPadWorkList.clear(); } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : *F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. - UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : *F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } -} - void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. @@ -1605,16 +2260,15 @@ void MachineBlockPlacement::buildCFGChains() { } } - // Turned on with OutlineOptionalBranches option - collectMustExecuteBBs(); - // Build any loop-based chains. PreferredLoopExit = nullptr; for (MachineLoop *L : *MLI) buildLoopChains(*L); - assert(BlockWorkList.empty()); - assert(EHPadWorkList.empty()); + assert(BlockWorkList.empty() && + "BlockWorkList should be empty before building final chain."); + assert(EHPadWorkList.empty() && + "EHPadWorkList should be empty before building final chain."); SmallPtrSet<BlockChain *, 4> UpdatedPreds; for (MachineBasicBlock &MBB : *F) @@ -1839,7 +2493,7 @@ void MachineBlockPlacement::alignBlocks() { /// @return true if \p BB was removed. 
bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *&LPred,
-    MachineBasicBlock *LoopHeaderBB,
+    const MachineBasicBlock *LoopHeaderBB,
     BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt) {
   bool Removed, DuplicatedToLPred;
@@ -1901,21 +2555,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
 /// \return - True if the block was duplicated into all preds and removed.
 bool MachineBlockPlacement::maybeTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *LPred,
-    const BlockChain &Chain, BlockFilterSet *BlockFilter,
+    BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt,
     bool &DuplicatedToLPred) {
-  DuplicatedToLPred = false;
+  if (!shouldTailDuplicate(BB))
+    return false;
+
   DEBUG(dbgs() << "Redoing tail duplication for Succ#"
                << BB->getNumber() << "\n");
-  bool IsSimple = TailDup.isSimpleBB(BB);
-  // Blocks with single successors don't create additional fallthrough
-  // opportunities. Don't duplicate them. TODO: When conditional exits are
-  // analyzable, allow them to be duplicated.
-  if (!IsSimple && BB->succ_size() == 1)
-    return false;
-  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
-    return false;
+
   // This has to be a callback because none of it can be done after
   // BB is deleted.
   bool Removed = false;
@@ -1967,6 +2616,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
       llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  bool IsSimple = TailDup.isSimpleBB(BB);
   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
                                  &RemovalCallbackRef);
 
@@ -2006,25 +2656,46 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   MLI = &getAnalysis<MachineLoopInfo>();
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MPDT = nullptr;
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;
 
+  assert(BlockToChain.empty() &&
+         "BlockToChain map should be empty before starting placement.");
+  assert(ComputedEdges.empty() &&
+         "Computed Edge map should be empty before starting placement.");
+
+  unsigned TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication.
+    // This increases size pressure, so we only do it at O3.
+    // Do this unless only the regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 || + TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + } + if (TailDupPlacement) { - unsigned TailDupSize = TailDuplicatePlacementThreshold; + MPDT = &getAnalysis<MachinePostDominatorTree>(); if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + precomputeTriangleChains(); } - assert(BlockToChain.empty()); - buildCFGChains(); // Changing the layout can create new tail merging opportunities. - TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structured CFG. bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && @@ -2032,7 +2703,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1; + unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); @@ -2041,8 +2712,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); - // Must redo the dominator tree if blocks were changed. - MDT->runOnMachineFunction(MF); + ComputedEdges.clear(); + // Must redo the post-dominator tree if blocks were changed. + if (MPDT) + MPDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } @@ -2052,6 +2725,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { alignBlocks(); BlockToChain.clear(); + ComputedEdges.clear(); ChainAllocator.DestroyAll(); if (AlignAllBlock) @@ -2067,6 +2741,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MBI->setAlignment(AlignAllNonFallThruBlocks); } } + if (ViewBlockLayoutWithBFI != GVDT_None && + (ViewBlockFreqFuncName.empty() || + F->getFunction()->getName().equals(ViewBlockFreqFuncName))) { + MBFI->view("MBP." + MF.getName(), false); + } + // We always return true as we have no way to track whether the final order // differs from the original order. 
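The TailDupSize selection above combines three inputs: the two cl::opt thresholds (distinguished by whether each was set explicitly, via getNumOccurrences()), the optimization level, and optForSize(). A minimal standalone sketch of that precedence rule follows; the helper name is hypothetical, plain booleans stand in for the flag and opt-level queries, and the optForSize() clamp, which the patch applies only when TailDupPlacement is enabled, is folded in for brevity:

#include <cassert>

// Hypothetical distillation of the TailDupSize precedence above; the two
// *Set flags stand in for cl::opt getNumOccurrences() != 0 checks.
static unsigned pickTailDupThreshold(unsigned Regular, unsigned Aggressive,
                                     bool RegularSet, bool AggressiveSet,
                                     bool AtO3, bool OptForSize) {
  unsigned Size = Regular;
  // If only the aggressive threshold is explicitly set, use it.
  if (AggressiveSet && !RegularSet)
    Size = Aggressive;
  // At -O3, take the aggressive threshold unless only the regular flag is set.
  if (AtO3 && (!RegularSet || AggressiveSet))
    Size = Aggressive;
  // Optimizing for size clamps the threshold to a single instruction.
  if (OptForSize)
    Size = 1;
  return Size;
}

int main() {
  assert(pickTailDupThreshold(2, 4, false, false, true, false) == 4); // -O3, defaults
  assert(pickTailDupThreshold(2, 4, true, false, true, false) == 2);  // regular flag wins
  assert(pickTailDupThreshold(2, 4, false, false, true, true) == 1);  // optForSize
  return 0;
}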
diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
index 0766f465456c..34f6bbd59e9b 100644
--- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
@@ -108,12 +108,12 @@ namespace {
 char MachineCSE::ID = 0;
 char &llvm::MachineCSEID = MachineCSE::ID;
-INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse",
-                      "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
+                      "Machine Common Subexpression Elimination", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MachineCSE, "machine-cse",
-                    "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
+                    "Machine Common Subexpression Elimination", false, false)

 /// The source register of a COPY machine instruction can be propagated to all
 /// its users, and this propagation could increase the probability of finding
@@ -180,8 +180,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
    I = skipDebugInstructionsForward(I, E);

    if (I == E)
-      // Reached end of block, register is obviously dead.
-      return true;
+      // Reached end of block, we don't know if register is dead or not.
+      return false;

    bool SeenDef = false;
    for (const MachineOperand &MO : I->operands()) {
diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
index 5beed5f5dd08..c176de16b593 100644
--- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
-// instructions does not lengthen the critical path or the resource depth.
+// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"
@@ -86,11 +86,11 @@ private:
 char MachineCombiner::ID = 0;
 char &llvm::MachineCombinerID = MachineCombiner::ID;
-INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
+INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
+INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
  // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
-    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";);
+    DEBUG(dbgs() << "NEW INSTR ";
+          InstrPtr->print(dbgs(), /*SkipOpers=*/false, /*SkipDebugLoc=*/false, TII);
+          dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) @@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { return false; } +static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, + SmallVector<MachineInstr *, 16> InsInstrs, + SmallVector<MachineInstr *, 16> DelInstrs, + MachineTraceMetrics *Traces) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + ++NumInstCombined; + Traces->invalidate(MBB); + Traces->verifyAnalysis(); +} + /// Substitute a slow code sequence with a faster one by /// evaluating instruction combining pattern. /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction @@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { DenseMap<unsigned, unsigned> InstrIdxForVirtReg; if (!MinInstr) MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); Traces->verifyAnalysis(); TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, InstrIdxForVirtReg); @@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) || - (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - DelInstrs, InstrIdxForVirtReg, P) && - preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { - for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - - Changed = true; - ++NumInstCombined; - - Traces->invalidate(MBB); - Traces->verifyAnalysis(); + if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); // Eagerly stop after the first pattern fires. + Changed = true; break; } else { + // Calculating the trace metrics may be expensive, + // so only do this when necessary. + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, + InstrIdxForVirtReg, P) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + // Eagerly stop after the first pattern fires. + Changed = true; + break; + } // Cleanup instructions of the alternative code sequence. There is no // use for them. 
MachineFunction *MF = MBB->getParent(); diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 5de6dec29fb9..f83b5481e0a5 100644 --- a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "codegen-cp" +#define DEBUG_TYPE "machine-cp" STATISTIC(NumDeletes, "Number of dead copies deleted"); @@ -79,7 +79,7 @@ namespace { char MachineCopyPropagation::ID = 0; char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; -INITIALIZE_PASS(MachineCopyPropagation, "machine-cp", +INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) /// Remove any entry in \p Map where the register is a subregister or equal to @@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { if (MO.isDef()) { Defs.push_back(Reg); - } else { + continue; + } else if (MO.readsReg()) ReadRegister(Reg); - } - // Treat undef use like defs for copy propagation but not for - // dead copy. We would need to do a liveness check to be sure the copy - // is dead for undef uses. - // The backends are allowed to do whatever they want with undef value - // and we cannot be sure this register will not be rewritten to break - // some false dependencies for the hardware for instance. - if (MO.isUndef()) - Defs.push_back(Reg); } // The instruction has a register mask operand which means that it clobbers diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp index 303a6a9263be..e3a6c51c47ad 100644 --- a/contrib/llvm/lib/CodeGen/MachineDominators.cpp +++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp @@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); + DT.reset(new DominatorTreeBase<MachineBasicBlock>(false)); DT->recalculate(F); - return false; } MachineDominatorTree::MachineDominatorTree() : MachineFunctionPass(ID) { initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); - DT = new DominatorTreeBase<MachineBasicBlock>(false); -} - -MachineDominatorTree::~MachineDominatorTree() { - delete DT; } void MachineDominatorTree::releaseMemory() { - DT->releaseMemory(); + CriticalEdgesToSplit.clear(); + DT.reset(nullptr); } void MachineDominatorTree::verifyAnalysis() const { - if (VerifyMachineDomInfo) + if (DT && VerifyMachineDomInfo) verifyDomTree(); } void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { - DT->print(OS); + if (DT) + DT->print(OS); } void MachineDominatorTree::applySplitCriticalEdges() const { @@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const { } void MachineDominatorTree::verifyDomTree() const { + if (!DT) + return; MachineFunction &F = *getRoot()->getParent(); - MachineDominatorTree OtherDT; - OtherDT.DT->recalculate(F); - if (compare(OtherDT)) { + DominatorTreeBase<MachineBasicBlock> OtherDT(false); + OtherDT.recalculate(F); + if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() || + DT->compare(OtherDT)) { errs() << "MachineDominatorTree is not up to date!\nComputed:\n"; - print(errs(), nullptr); + DT->print(errs()); errs() << "\nActual:\n"; - OtherDT.print(errs(), nullptr); + OtherDT.print(errs()); abort(); } } diff --git 
a/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp new file mode 100644 index 000000000000..73d778ff3023 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -0,0 +1,244 @@ +//===-- MachineFrameInfo.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file Implements MachineFrameInfo that manages the stack frame. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFrameInfo.h" + +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> + +#define DEBUG_TYPE "codegen" + +using namespace llvm; + +void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { + if (!StackRealignable) + assert(Align <= StackAlignment && + "For targets without stack realignment, Align is out of limit!"); + if (MaxAlignment < Align) MaxAlignment = Align; +} + +/// Clamp the alignment if requested and emit a warning. +static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, + unsigned StackAlign) { + if (!ShouldClamp || Align <= StackAlign) + return Align; + DEBUG(dbgs() << "Warning: requested alignment " << Align + << " exceeds the stack alignment " << StackAlign + << " when stack realignment is off" << '\n'); + return StackAlign; +} + +int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, + bool isSS, const AllocaInst *Alloca) { + assert(Size != 0 && "Cannot allocate zero size stack objects!"); + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, + !isSS)); + int Index = (int)Objects.size() - NumFixedObjects - 1; + assert(Index >= 0 && "Bad frame index!"); + ensureMaxAlignment(Alignment); + return Index; +} + +int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, + unsigned Alignment) { + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + CreateStackObject(Size, Alignment, true); + int Index = (int)Objects.size() - NumFixedObjects - 1; + ensureMaxAlignment(Alignment); + return Index; +} + +int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, + const AllocaInst *Alloca) { + HasVarSizedObjects = true; + Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); + Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); + ensureMaxAlignment(Alignment); + return (int)Objects.size()-NumFixedObjects-1; +} + +int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, + bool Immutable, bool isAliased) { + assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); + // The alignment of the frame index can be determined from its offset from + // the incoming frame position. If the frame object is at offset 32 and + // the stack is guaranteed to be 16-byte aligned, then we know that the + // object is 16-byte aligned. 
Note that unlike the non-fixed case, if the + // stack needs realignment, we can't assume that the stack will in fact be + // aligned. + unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/ false, + /*Alloca*/ nullptr, isAliased)); + return -++NumFixedObjects; +} + +int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, + int64_t SPOffset, + bool Immutable) { + unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); + Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/ true, + /*Alloca*/ nullptr, + /*isAliased*/ false)); + return -++NumFixedObjects; +} + +BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + BitVector BV(TRI->getNumRegs()); + + // Before CSI is calculated, no registers are considered pristine. They can be + // freely used and PEI will make sure they are saved. + if (!isCalleeSavedInfoValid()) + return BV; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; + ++CSR) + BV.set(*CSR); + + // Saved CSRs are not pristine. + for (auto &I : getCalleeSavedInfo()) + for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) + BV.reset(*S); + + return BV; +} + +unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + unsigned MaxAlign = getMaxAlignment(); + int Offset = 0; + + // This code is very, very similar to PEI::calculateFrameObjectOffsets(). + // It really should be refactored to share code. Until then, changes + // should keep in mind that there's tight coupling between the two. + + for (int i = getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) { + if (isDeadObjectIndex(i)) + continue; + Offset += getObjectSize(i); + unsigned Align = getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + + MaxAlign = std::max(Align, MaxAlign); + } + + if (adjustsStack() && TFI->hasReservedCallFrame(MF)) + Offset += getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (adjustsStack() || hasVarSizedObjects() || + (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0)) + StackAlign = TFI->getStackAlignment(); + else + StackAlign = TFI->getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. 
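+  // For example, with Offset = 52 and a final StackAlign of 16, AlignMask is
+  // 15 and (52 + 15) & ~15 rounds the estimated frame size up to 64.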
+ StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + + return (unsigned)Offset; +} + +void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + assert(FrameSetupOpcode != ~0u && FrameDestroyOpcode != ~0u && + "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known"); + + MaxCallFrameSize = 0; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) { + unsigned Size = TII.getFrameSize(MI); + MaxCallFrameSize = std::max(MaxCallFrameSize, Size); + AdjustsStack = true; + } else if (MI.isInlineAsm()) { + // Some inline asm's need a stack frame, as indicated by operand 1. + unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm(); + if (ExtraInfo & InlineAsm::Extra_IsAlignStack) + AdjustsStack = true; + } + } + } +} + +void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ + if (Objects.empty()) return; + + const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); + int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0); + + OS << "Frame Objects:\n"; + + for (unsigned i = 0, e = Objects.size(); i != e; ++i) { + const StackObject &SO = Objects[i]; + OS << " fi#" << (int)(i-NumFixedObjects) << ": "; + if (SO.Size == ~0ULL) { + OS << "dead\n"; + continue; + } + if (SO.Size == 0) + OS << "variable sized"; + else + OS << "size=" << SO.Size; + OS << ", align=" << SO.Alignment; + + if (i < NumFixedObjects) + OS << ", fixed"; + if (i < NumFixedObjects || SO.SPOffset != -1) { + int64_t Off = SO.SPOffset - ValOffset; + OS << ", at location [SP"; + if (Off > 0) + OS << "+" << Off; + else if (Off < 0) + OS << Off; + OS << "]"; + } + OS << "\n"; + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const { + print(MF, dbgs()); +} +#endif diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp index c1d5ea96cd17..ac4ccb81b884 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp @@ -169,6 +169,7 @@ void MachineFunction::clear() { InstructionRecycler.clear(Allocator); OperandRecycler.clear(Allocator); BasicBlockRecycler.clear(Allocator); + VariableDbgInfos.clear(); if (RegInfo) { RegInfo->~MachineRegisterInfo(); Allocator.Deallocate(RegInfo); @@ -756,212 +757,6 @@ void llvm::addLandingPadInfo(const LandingPadInst &I, MachineBasicBlock &MBB) { /// \} //===----------------------------------------------------------------------===// -// MachineFrameInfo implementation -//===----------------------------------------------------------------------===// - -/// Make sure the function is at least Align bytes aligned. -void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { - if (!StackRealignable) - assert(Align <= StackAlignment && - "For targets without stack realignment, Align is out of limit!"); - if (MaxAlignment < Align) MaxAlignment = Align; -} - -/// Clamp the alignment if requested and emit a warning. 
-static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, - unsigned StackAlign) { - if (!ShouldClamp || Align <= StackAlign) - return Align; - DEBUG(dbgs() << "Warning: requested alignment " << Align - << " exceeds the stack alignment " << StackAlign - << " when stack realignment is off" << '\n'); - return StackAlign; -} - -/// Create a new statically sized stack object, returning a nonnegative -/// identifier to represent it. -int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, - bool isSS, const AllocaInst *Alloca) { - assert(Size != 0 && "Cannot allocate zero size stack objects!"); - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, - !isSS)); - int Index = (int)Objects.size() - NumFixedObjects - 1; - assert(Index >= 0 && "Bad frame index!"); - ensureMaxAlignment(Alignment); - return Index; -} - -/// Create a new statically sized stack object that represents a spill slot, -/// returning a nonnegative identifier to represent it. -int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, - unsigned Alignment) { - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - CreateStackObject(Size, Alignment, true); - int Index = (int)Objects.size() - NumFixedObjects - 1; - ensureMaxAlignment(Alignment); - return Index; -} - -/// Notify the MachineFrameInfo object that a variable sized object has been -/// created. This must be created whenever a variable sized object is created, -/// whether or not the index returned is actually used. -int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, - const AllocaInst *Alloca) { - HasVarSizedObjects = true; - Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment); - Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); - ensureMaxAlignment(Alignment); - return (int)Objects.size()-NumFixedObjects-1; -} - -/// Create a new object at a fixed location on the stack. -/// All fixed objects should be created before other objects are created for -/// efficiency. By default, fixed objects are immutable. This returns an -/// index with a negative value. -int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable, bool isAliased) { - assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); - // The alignment of the frame index can be determined from its offset from - // the incoming frame position. If the frame object is at offset 32 and - // the stack is guaranteed to be 16-byte aligned, then we know that the - // object is 16-byte aligned. Note that unlike the non-fixed case, if the - // stack needs realignment, we can't assume that the stack will in fact be - // aligned. - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ false, - /*Alloca*/ nullptr, isAliased)); - return -++NumFixedObjects; -} - -/// Create a spill slot at a fixed location on the stack. -/// Returns an index with a negative value. -int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, - int64_t SPOffset, - bool Immutable) { - unsigned Align = MinAlign(SPOffset, ForcedRealign ? 
1 : StackAlignment); - Align = clampStackAlignment(!StackRealignable, Align, StackAlignment); - Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, - /*isSS*/ true, - /*Alloca*/ nullptr, - /*isAliased*/ false)); - return -++NumFixedObjects; -} - -BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - BitVector BV(TRI->getNumRegs()); - - // Before CSI is calculated, no registers are considered pristine. They can be - // freely used and PEI will make sure they are saved. - if (!isCalleeSavedInfoValid()) - return BV; - - for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) - BV.set(*CSR); - - // Saved CSRs are not pristine. - for (auto &I : getCalleeSavedInfo()) - for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) - BV.reset(*S); - - return BV; -} - -unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - unsigned MaxAlign = getMaxAlignment(); - int Offset = 0; - - // This code is very, very similar to PEI::calculateFrameObjectOffsets(). - // It really should be refactored to share code. Until then, changes - // should keep in mind that there's tight coupling between the two. - - for (int i = getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -getObjectOffset(i); - if (FixedOff > Offset) Offset = FixedOff; - } - for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) { - if (isDeadObjectIndex(i)) - continue; - Offset += getObjectSize(i); - unsigned Align = getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset+Align-1)/Align*Align; - - MaxAlign = std::max(Align, MaxAlign); - } - - if (adjustsStack() && TFI->hasReservedCallFrame(MF)) - Offset += getMaxCallFrameSize(); - - // Round up the size to a multiple of the alignment. If the function has - // any calls or alloca's, align to the target's StackAlignment value to - // ensure that the callee's frame or the alloca data is suitably aligned; - // otherwise, for leaf functions, align to the TransientStackAlignment - // value. - unsigned StackAlign; - if (adjustsStack() || hasVarSizedObjects() || - (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0)) - StackAlign = TFI->getStackAlignment(); - else - StackAlign = TFI->getTransientStackAlignment(); - - // If the frame pointer is eliminated, all frame offsets will be relative to - // SP not FP. Align to MaxAlign so this works. - StackAlign = std::max(StackAlign, MaxAlign); - unsigned AlignMask = StackAlign - 1; - Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); - - return (unsigned)Offset; -} - -void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ - if (Objects.empty()) return; - - const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); - int ValOffset = (FI ? 
FI->getOffsetOfLocalArea() : 0); - - OS << "Frame Objects:\n"; - - for (unsigned i = 0, e = Objects.size(); i != e; ++i) { - const StackObject &SO = Objects[i]; - OS << " fi#" << (int)(i-NumFixedObjects) << ": "; - if (SO.Size == ~0ULL) { - OS << "dead\n"; - continue; - } - if (SO.Size == 0) - OS << "variable sized"; - else - OS << "size=" << SO.Size; - OS << ", align=" << SO.Alignment; - - if (i < NumFixedObjects) - OS << ", fixed"; - if (i < NumFixedObjects || SO.SPOffset != -1) { - int64_t Off = SO.SPOffset - ValOffset; - OS << ", at location [SP"; - if (Off > 0) - OS << "+" << Off; - else if (Off < 0) - OS << Off; - OS << "]"; - } - OS << "\n"; - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MachineFrameInfo::dump(const MachineFunction &MF) const { - print(MF, dbgs()); -} -#endif - -//===----------------------------------------------------------------------===// // MachineJumpTableInfo implementation //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp index 2f2e3b3d8e9f..d665201a5d17 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp @@ -262,8 +262,21 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getBlockAddress() == Other.getBlockAddress() && getOffset() == Other.getOffset(); case MachineOperand::MO_RegisterMask: - case MachineOperand::MO_RegisterLiveOut: - return getRegMask() == Other.getRegMask(); + case MachineOperand::MO_RegisterLiveOut: { + // Shallow compare of the two RegMasks + const uint32_t *RegMask = getRegMask(); + const uint32_t *OtherRegMask = Other.getRegMask(); + if (RegMask == OtherRegMask) + return true; + + // Calculate the size of the RegMask + const MachineFunction *MF = getParent()->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + + // Deep compare of the two RegMasks + return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask); + } case MachineOperand::MO_MCSymbol: return getMCSymbol() == Other.getMCSymbol(); case MachineOperand::MO_CFIIndex: @@ -403,6 +416,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, bool Unused; APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused); OS << "half " << APF.convertToFloat(); + } else if (getFPImm()->getType()->isFP128Ty()) { + APFloat APF = getFPImm()->getValueAPF(); + SmallString<16> Str; + getFPImm()->getValueAPF().toString(Str); + OS << "quad " << Str; } else { OS << getFPImm()->getValueAPF().convertToDouble(); } @@ -491,6 +509,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred") << CmpInst::getPredicateName(Pred) << '>'; + break; } } if (unsigned TF = getTargetFlags()) @@ -1571,6 +1590,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const { return true; } +bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, + bool UseTBAA) { + const MachineFunction *MF = getParent()->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + // If neither instruction stores to memory, they can't alias in any + // meaningful way, even if they read from the same address. 
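+  // (Two loads may touch the same bytes, but with no store involved there is
+  // no dependence that reordering could violate.)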
+ if (!mayStore() && !Other.mayStore()) + return false; + + // Let the target decide if memory accesses cannot possibly overlap. + if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA)) + return false; + + if (!AA) + return true; + + // FIXME: Need to handle multiple memory operands to support all targets. + if (!hasOneMemOperand() || !Other.hasOneMemOperand()) + return true; + + MachineMemOperand *MMOa = *memoperands_begin(); + MachineMemOperand *MMOb = *Other.memoperands_begin(); + + if (!MMOa->getValue() || !MMOb->getValue()) + return true; + + // The following interface to AA is fashioned after DAGCombiner::isAlias + // and operates with MachineMemOperand offset with some important + // assumptions: + // - LLVM fundamentally assumes flat address spaces. + // - MachineOperand offset can *only* result from legalization and + // cannot affect queries other than the trivial case of overlap + // checking. + // - These offsets never wrap and never step outside + // of allocated objects. + // - There should never be any negative offsets here. + // + // FIXME: Modify API to hide this math from "user" + // FIXME: Even before we go to AA we can reason locally about some + // memory objects. It can save compile time, and possibly catch some + // corner cases not currently covered. + + assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + + int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); + int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; + int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; + + AliasResult AAResult = + AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, + UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), + MemoryLocation(MMOb->getValue(), Overlapb, + UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); + + return (AAResult != NoAlias); +} + /// hasOrderedMemoryRef - Return true if this instruction may have an ordered /// or volatile memory reference, or if the information describing the memory /// reference is not available. Return false if it is known to have no ordered @@ -1692,14 +1770,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF, } } -LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineInstr::dump() const { dbgs() << " "; - print(dbgs(), false /* SkipOpers */, TII); -#endif + print(dbgs()); } +#endif -void MachineInstr::print(raw_ostream &OS, bool SkipOpers, +void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc, const TargetInstrInfo *TII) const { const Module *M = nullptr; if (const MachineBasicBlock *MBB = getParent()) @@ -1707,11 +1785,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers, M = MF->getFunction()->getParent(); ModuleSlotTracker MST(M); - print(OS, MST, SkipOpers, TII); + print(OS, MST, SkipOpers, SkipDebugLoc, TII); } void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, - bool SkipOpers, const TargetInstrInfo *TII) const { + bool SkipOpers, bool SkipDebugLoc, + const TargetInstrInfo *TII) const { // We can be a bit tidier if we know the MachineFunction. 
const MachineFunction *MF = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -1987,6 +2066,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } if (isIndirectDebugValue()) OS << " indirect"; + } else if (SkipDebugLoc) { + return; } else if (debugLoc && MF) { if (!HaveSemi) OS << ";"; @@ -2263,3 +2344,26 @@ MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB, BB.insert(I, MI); return MachineInstrBuilder(MF, MI); } + +MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + const MachineInstr &Orig, + int FrameIndex) { + const MDNode *Var = Orig.getDebugVariable(); + const auto *Expr = cast_or_null<DIExpression>(Orig.getDebugExpression()); + bool IsIndirect = Orig.isIndirectDebugValue(); + uint64_t Offset = IsIndirect ? Orig.getOperand(1).getImm() : 0; + DebugLoc DL = Orig.getDebugLoc(); + assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + // If the DBG_VALUE already was a memory location, add an extra + // DW_OP_deref. Otherwise just turning this from a register into a + // memory/indirect location is sufficient. + if (IsIndirect) + Expr = DIExpression::prepend(Expr, DIExpression::WithDeref); + return BuildMI(BB, I, DL, Orig.getDesc()) + .addFrameIndex(FrameIndex) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); +} diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp index b3d18435985e..95c62d820b0e 100644 --- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp @@ -38,7 +38,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "machine-licm" +#define DEBUG_TYPE "machinelicm" static cl::opt<bool> AvoidSpeculation("avoid-speculation", @@ -237,13 +237,13 @@ namespace { char MachineLICM::ID = 0; char &llvm::MachineLICMID = MachineLICM::ID; -INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) /// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { @@ -330,7 +330,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { /// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { // If we lost memory operands, conservatively assume that the instruction - // writes to all slots. + // writes to all slots. if (MI->memoperands_empty()) return true; for (const MachineMemOperand *MemOp : MI->memoperands()) { @@ -708,7 +708,7 @@ void MachineLICM::SinkIntoLoop() { for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin(); I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. - // As such, it must not have side-effects, e.g. such as a call has. + // As such, it must not have side-effects, e.g. such as a call has. 
if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) Candidates.push_back(&*I); } @@ -837,9 +837,9 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, /// constant pool. static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); - + // If we lost memory operands, conservatively assume that the instruction - // reads from everything.. + // reads from everything.. if (MI.memoperands_empty()) return true; @@ -1337,7 +1337,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Since we are moving the instruction out of its basic block, we do not - // retain its debug location. Doing so would degrade the debugging + // retain its debug location. Doing so would degrade the debugging // experience and adversely affect the accuracy of profiling information. MI->setDebugLoc(DebugLoc()); diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp index fdeaf7b71161..a9aa1d954e70 100644 --- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() { return nullptr; } +DebugLoc MachineLoop::getStartLoc() const { + // Try the pre-header first. + if (MachineBasicBlock *PHeadMBB = getLoopPreheader()) + if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock()) + if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc()) + return DL; + + // If we have no pre-header or there are no instructions with debug + // info in it, try the header. + if (MachineBasicBlock *HeadMBB = getHeader()) + if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock()) + return HeadBB->getTerminator()->getDebugLoc(); + + return DebugLoc(); +} + MachineBasicBlock * MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader) const { diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp index 6618857477ed..6cf751d34e26 100644 --- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -32,8 +32,8 @@ using namespace llvm; using namespace llvm::dwarf; // Handle the Pass registration stuff necessary to use DataLayout's. -INITIALIZE_TM_PASS(MachineModuleInfo, "machinemoduleinfo", - "Machine Module Information", false, false) +INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", + "Machine Module Information", false, false) char MachineModuleInfo::ID = 0; // Out of line virtual method. @@ -306,6 +306,10 @@ public: MMI.deleteMachineFunctionFor(F); return true; } + + StringRef getPassName() const override { + return "Free MachineFunction"; + } }; char FreeMachineFunction::ID; } // end anonymous namespace diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp index 22d519e5d88f..4c81fd91cb82 100644 --- a/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp +++ b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp @@ -23,6 +23,7 @@ using namespace llvm; // Out of line virtual method. 
void MachineModuleInfoMachO::anchor() {} void MachineModuleInfoELF::anchor() {} +void MachineModuleInfoWasm::anchor() {} static int SortSymbolPair(const void *LHS, const void *RHS) { typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy; diff --git a/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp new file mode 100644 index 000000000000..6b6b5f2814a9 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -0,0 +1,100 @@ +///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===// +/// +/// The LLVM Compiler Infrastructure +/// +/// This file is distributed under the University of Illinois Open Source +/// License. See LICENSE.TXT for details. +/// +///===---------------------------------------------------------------------===// +/// \file +/// Optimization diagnostic interfaces for machine passes. It's packaged as an +/// analysis pass so that by using this service passes become dependent on MBFI +/// as well. MBFI is used to compute the "hotness" of the diagnostic message. +/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +DiagnosticInfoMIROptimization::MachineArgument::MachineArgument( + StringRef MKey, const MachineInstr &MI) + : Argument() { + Key = MKey; + + raw_string_ostream OS(Val); + MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true); +} + +Optional<uint64_t> +MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) { + if (!MBFI) + return None; + + return MBFI->getBlockProfileCount(&MBB); +} + +void MachineOptimizationRemarkEmitter::computeHotness( + DiagnosticInfoMIROptimization &Remark) { + const MachineBasicBlock *MBB = Remark.getBlock(); + if (MBB) + Remark.setHotness(computeHotness(*MBB)); +} + +void MachineOptimizationRemarkEmitter::emit( + DiagnosticInfoOptimizationBase &OptDiagCommon) { + auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon); + computeHotness(OptDiag); + + LLVMContext &Ctx = MF.getFunction()->getContext(); + yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); + if (Out) { + auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); + *Out << P; + } + // FIXME: now that IsVerbose is part of DI, filtering for this will be moved + // from here to clang. 
+  if (!OptDiag.isVerbose() || shouldEmitVerbose())
+    Ctx.diagnose(OptDiag);
+}
+
+MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass()
+    : MachineFunctionPass(ID) {
+  initializeMachineOptimizationRemarkEmitterPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
+    MachineFunction &MF) {
+  MachineBlockFrequencyInfo *MBFI;
+
+  if (MF.getFunction()->getContext().getDiagnosticHotnessRequested())
+    MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+  else
+    MBFI = nullptr;
+
+  ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+  return false;
+}
+
+void MachineOptimizationRemarkEmitterPass::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char MachineOptimizationRemarkEmitterPass::ID = 0;
+static const char ore_name[] = "Machine Optimization Remark Emitter";
+#define ORE_NAME "machine-opt-remark-emitter"
+
+INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+                      false, true)
+INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+                    false, true)
diff --git a/contrib/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
new file mode 100644
index 000000000000..9ea3c00a2fc4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -0,0 +1,1251 @@
+//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replaces repeated sequences of instructions with function calls.
+///
+/// This works by placing every instruction from every basic block in a
+/// suffix tree, and repeatedly querying that tree for repeated sequences of
+/// instructions. If a sequence of instructions appears often, then it ought
+/// to be beneficial to pull it out into a function.
+///
+/// This was originally presented at the 2016 LLVM Developers' Meeting in the
+/// talk "Reducing Code Size Using Outlining". For a high-level overview of
+/// how this pass works, the talk is available on YouTube at
+///
+/// https://www.youtube.com/watch?v=yorld-WSOeU
+///
+/// The slides for the talk are available at
+///
+/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf
+///
+/// The talk provides an overview of how the outliner finds candidates and
+/// ultimately outlines them. It describes how the main data structure for this
+/// pass, the suffix tree, is queried and purged for candidates. It also gives
+/// a simplified suffix tree construction algorithm for suffix trees based off
+/// of the algorithm actually used here, Ukkonen's algorithm.
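+///
+/// For example, if the same five-instruction sequence appears in eight
+/// places, outlining trades the eight inline copies for one outlined
+/// function plus eight calls to it.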
+/// +/// For the original RFC for this pass, please see +/// +/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html +/// +/// For more information on the suffix tree data structure, please see +/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf +/// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <functional> +#include <map> +#include <sstream> +#include <tuple> +#include <vector> + +#define DEBUG_TYPE "machine-outliner" + +using namespace llvm; + +STATISTIC(NumOutlined, "Number of candidates outlined"); +STATISTIC(FunctionsCreated, "Number of functions created"); + +namespace { + +/// \brief An individual sequence of instructions to be replaced with a call to +/// an outlined function. +struct Candidate { + + /// Set to false if the candidate overlapped with another candidate. + bool InCandidateList = true; + + /// The start index of this \p Candidate. + size_t StartIdx; + + /// The number of instructions in this \p Candidate. + size_t Len; + + /// The index of this \p Candidate's \p OutlinedFunction in the list of + /// \p OutlinedFunctions. + size_t FunctionIdx; + + /// \brief The number of instructions that would be saved by outlining every + /// candidate of this type. + /// + /// This is a fixed value which is not updated during the candidate pruning + /// process. It is only used for deciding which candidate to keep if two + /// candidates overlap. The true benefit is stored in the OutlinedFunction + /// for some given candidate. + unsigned Benefit = 0; + + Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx) + : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {} + + Candidate() {} + + /// \brief Used to ensure that \p Candidates are outlined in an order that + /// preserves the start and end indices of other \p Candidates. + bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; } +}; + +/// \brief The information necessary to create an outlined function for some +/// class of candidate. +struct OutlinedFunction { + + /// The actual outlined function created. + /// This is initialized after we go through and create the actual function. + MachineFunction *MF = nullptr; + + /// A number assigned to this function which appears at the end of its name. + size_t Name; + + /// The number of candidates for this OutlinedFunction. + size_t OccurrenceCount = 0; + + /// \brief The sequence of integers corresponding to the instructions in this + /// function. + std::vector<unsigned> Sequence; + + /// The number of instructions this function would save. + unsigned Benefit = 0; + + /// \brief Set to true if candidates for this outlined function should be + /// replaced with tail calls to this OutlinedFunction. 
+  bool IsTailCall = false;
+
+  OutlinedFunction(size_t Name, size_t OccurrenceCount,
+                   const std::vector<unsigned> &Sequence,
+                   unsigned Benefit, bool IsTailCall)
+      : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence),
+        Benefit(Benefit), IsTailCall(IsTailCall)
+      {}
+};
+
+/// Represents an undefined index in the suffix tree.
+const size_t EmptyIdx = -1;
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being an exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie, is somewhat different: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+
+  /// The children of this node.
+  ///
+  /// A child existing on an unsigned integer implies that from the mapping
+  /// represented by the current node, there is a way to reach another
+  /// mapping by tacking that character on the end of the current string.
+  DenseMap<unsigned, SuffixTreeNode *> Children;
+
+  /// A flag set to false if the node has been pruned from the tree.
+  bool IsInTree = true;
+
+  /// The start index of this node's substring in the main string.
+  size_t StartIdx = EmptyIdx;
+
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  size_t *EndIdx = nullptr;
+
+  /// For leaves, the start index of the suffix represented by this node.
+  ///
+  /// For all other nodes, this is ignored.
+  size_t SuffixIdx = EmptyIdx;
+
+  /// \brief For internal nodes, a pointer to the internal node representing
+  /// the same sequence with the first character chopped off.
+  ///
+  /// This has two major purposes in the suffix tree. The first is as a
+  /// shortcut in Ukkonen's construction algorithm. One of the things that
+  /// Ukkonen's algorithm does to achieve linear-time construction is
+  /// keep track of which node the next insert should be at. This makes each
+  /// insert O(1), and there are a total of O(N) inserts. The suffix link
+  /// helps with inserting children of internal nodes.
+  ///
+  /// Say we add a child to an internal node with associated mapping S. The
+  /// next insertion must be at the node representing S minus its first
+  /// character.
+  /// This is given by the way that we iteratively build the tree in Ukkonen's
+  /// algorithm. The main idea is to look at the suffixes of each prefix in the
+  /// string, starting with the longest suffix of the prefix, and ending with
+  /// the shortest. Therefore, if we keep pointers between such nodes, we can
+  /// move to the next insertion point in O(1) time. If we don't, then we'd
+  /// have to query from the root, which takes O(N) time. This would make the
+  /// construction algorithm O(N^2) rather than O(N).
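+  ///
+  /// For example, the suffix link of the internal node for the sequence
+  /// [2 3 4] points to the internal node for [3 4].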
+ /// + /// The suffix link is also used during the tree pruning process to let us + /// quickly throw out a bunch of potential overlaps. Say we have a sequence + /// S we want to outline. Then each of its suffixes contribute to at least + /// one overlapping case. Therefore, we can follow the suffix links + /// starting at the node associated with S to the root and "delete" those + /// nodes, save for the root. For each candidate, this removes + /// O(|candidate|) overlaps from the search space. We don't actually + /// completely invalidate these nodes though; doing that is far too + /// aggressive. Consider the following pathological string: + /// + /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 + /// + /// If we, for the sake of example, outlined 1 2 3, then we would throw + /// out all instances of 2 3. This isn't desirable. To get around this, + /// when we visit a link node, we decrement its occurrence count by the + /// number of sequences we outlined in the current step. In the pathological + /// example, the 2 3 node would have an occurrence count of 8, while the + /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node + /// would survive to the next round allowing us to outline the extra + /// instances of 2 3. + SuffixTreeNode *Link = nullptr; + + /// The parent of this node. Every node except for the root has a parent. + SuffixTreeNode *Parent = nullptr; + + /// The number of times this node's string appears in the tree. + /// + /// This is equal to the number of leaf children of the string. It represents + /// the number of suffixes that the node's string is a prefix of. + size_t OccurrenceCount = 0; + + /// The length of the string formed by concatenating the edge labels from the + /// root to this node. + size_t ConcatLen = 0; + + /// Returns true if this node is a leaf. + bool isLeaf() const { return SuffixIdx != EmptyIdx; } + + /// Returns true if this node is the root of its owning \p SuffixTree. + bool isRoot() const { return StartIdx == EmptyIdx; } + + /// Return the number of elements in the substring associated with this node. + size_t size() const { + + // Is it the root? If so, it's the empty string so return 0. + if (isRoot()) + return 0; + + assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); + + // Size = the number of elements in the string. + // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. + return *EndIdx - StartIdx + 1; + } + + SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link, + SuffixTreeNode *Parent) + : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {} + + SuffixTreeNode() {} +}; + +/// A data structure for fast substring queries. +/// +/// Suffix trees represent the suffixes of their input strings in their leaves. +/// A suffix tree is a type of compressed trie structure where each node +/// represents an entire substring rather than a single character. Each leaf +/// of the tree is a suffix. +/// +/// A suffix tree can be seen as a type of state machine where each state is a +/// substring of the full string. The tree is structured so that, for a string +/// of length N, there are exactly N leaves in the tree. This structure allows +/// us to quickly find repeated substrings of the input string. +/// +/// In this implementation, a "string" is a vector of unsigned integers. +/// These integers may result from hashing some data type. A suffix tree can +/// contain 1 or many strings, which can then be queried as one large string. 
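+///
+/// For example, the string [1 2 3] has exactly the three suffixes [1 2 3],
+/// [2 3], and [3], so its suffix tree has exactly three leaves.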
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen, "On-line construction of suffix trees". The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+private:
+  /// Each element is an integer representing an instruction in the module.
+  ArrayRef<unsigned> Str;
+
+  /// Maintains each node in the tree.
+  SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+  /// The root of the suffix tree.
+  ///
+  /// The root represents the empty string. It is maintained by the
+  /// \p NodeAllocator like every other node in the tree.
+  SuffixTreeNode *Root = nullptr;
+
+  /// Stores each leaf node in the tree.
+  ///
+  /// This is used for finding outlining candidates.
+  std::vector<SuffixTreeNode *> LeafVector;
+
+  /// Maintains the end indices of the internal nodes in the tree.
+  ///
+  /// Each internal node is guaranteed to never have its end index change
+  /// during the construction algorithm; however, leaves must be updated at
+  /// every step. Therefore, we need to store leaf end indices by reference
+  /// to avoid updating O(N) leaves at every step of construction. Thus,
+  /// every internal node must be allocated its own end index.
+  BumpPtrAllocator InternalEndIdxAllocator;
+
+  /// The end index of each leaf in the tree.
+  size_t LeafEndIdx = -1;
+
+  /// \brief Helper struct which keeps track of the next insertion point in
+  /// Ukkonen's algorithm.
+  struct ActiveState {
+    /// The next node to insert at.
+    SuffixTreeNode *Node;
+
+    /// The index of the first character in the substring currently being added.
+    size_t Idx = EmptyIdx;
+
+    /// The length of the substring we have to add at the current step.
+    size_t Len = 0;
+  };
+
+  /// \brief The point the next insertion will take place at in the
+  /// construction algorithm.
+  ActiveState Active;
+
+  /// Allocate a leaf node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated leaf node.
+  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx,
+                             unsigned Edge) {
+
+    assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+    SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+                                                                      &LeafEndIdx,
+                                                                      nullptr,
+                                                                      &Parent);
+    Parent.Children[Edge] = N;
+
+    return N;
+  }
+
+  /// Allocate an internal node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node. Only null when allocating the root.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param EndIdx The end index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated internal node.
+  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx,
+                                     size_t EndIdx, unsigned Edge) {
+
+    assert(StartIdx <= EndIdx && "String can't start after it ends!");
+    assert(!(!Parent && StartIdx != EmptyIdx) &&
+           "Non-root internal nodes must have parents!");
+
+    size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx);
+    SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+                                                                      E,
+                                                                      Root,
+                                                                      Parent);
+    if (Parent)
+      Parent->Children[Edge] = N;
+
+    return N;
+  }
+
+  /// \brief Set the suffix indices of the leaves to the start indices of their
+  /// respective suffixes. Also stores each leaf in \p LeafVector at its
+  /// respective suffix index.
+  ///
+  /// \param[in] CurrNode The node currently being visited.
+  /// \param CurrIdx The current index of the string being visited.
+  void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) {
+
+    bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot();
+
+    // Store the length of the concatenation of all strings from the root to
+    // this node.
+    if (!CurrNode.isRoot()) {
+      if (CurrNode.ConcatLen == 0)
+        CurrNode.ConcatLen = CurrNode.size();
+
+      if (CurrNode.Parent)
+        CurrNode.ConcatLen += CurrNode.Parent->ConcatLen;
+    }
+
+    // Traverse the tree depth-first.
+    for (auto &ChildPair : CurrNode.Children) {
+      assert(ChildPair.second && "Node had a null child!");
+      setSuffixIndices(*ChildPair.second,
+                       CurrIdx + ChildPair.second->size());
+    }
+
+    // Is this node a leaf?
+    if (IsLeaf) {
+      // If yes, give it a suffix index and bump its parent's occurrence count.
+      CurrNode.SuffixIdx = Str.size() - CurrIdx;
+      assert(CurrNode.Parent && "CurrNode had no parent!");
+      CurrNode.Parent->OccurrenceCount++;
+
+      // Store the leaf in the leaf vector for pruning later.
+      LeafVector[CurrNode.SuffixIdx] = &CurrNode;
+    }
+  }
+
+  /// \brief Construct the suffix tree for the prefix of the input ending at
+  /// \p EndIdx.
+  ///
+  /// Used to construct the full suffix tree iteratively. At the end of each
+  /// step, the constructed suffix tree is either a valid suffix tree, or a
+  /// suffix tree with implicit suffixes. At the end of the final step, the
+  /// suffix tree is a valid tree.
+  ///
+  /// \param EndIdx The end index of the current prefix in the main string.
+  /// \param SuffixesToAdd The number of suffixes that must be added
+  /// to complete the suffix tree at the current phase.
+  ///
+  /// \returns The number of suffixes that have not been added at the end of
+  /// this step.
+  unsigned extend(size_t EndIdx, size_t SuffixesToAdd) {
+    SuffixTreeNode *NeedsLink = nullptr;
+
+    while (SuffixesToAdd > 0) {
+
+      // Are we waiting to add anything other than just the last character?
+      if (Active.Len == 0) {
+        // If not, then say the active index is the end index.
+        Active.Idx = EndIdx;
+      }
+
+      assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+      // The first character in the current substring we're looking at.
+      unsigned FirstChar = Str[Active.Idx];
+
+      // Have we inserted anything starting with FirstChar at the current node?
+      if (Active.Node->Children.count(FirstChar) == 0) {
+        // If not, then we can just insert a leaf and move to the next step.
+        insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+        // The active node is an internal node, and we visited it, so it must
+        // need a link if it doesn't have one.
+        if (NeedsLink) {
+          NeedsLink->Link = Active.Node;
+          NeedsLink = nullptr;
+        }
+      } else {
+        // There's a match with FirstChar, so look for the point in the tree to
+        // insert a new node.
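+        // Three cases follow: walk down into the child if our active length
+        // covers its whole edge, extend the current edge by one character if
+        // the next character matches, or split the edge and insert a new
+        // internal node.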
+        SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+        size_t SubstringLen = NextNode->size();
+
+        // Is the current suffix we're trying to insert longer than the size of
+        // the child we want to move to?
+        if (Active.Len >= SubstringLen) {
+          // If yes, then consume the characters we've seen and move to the next
+          // node.
+          Active.Idx += SubstringLen;
+          Active.Len -= SubstringLen;
+          Active.Node = NextNode;
+          continue;
+        }
+
+        // Otherwise, the suffix we're trying to insert must be contained in the
+        // next node we want to move to.
+        unsigned LastChar = Str[EndIdx];
+
+        // Is the string we're trying to insert a substring of the next node?
+        if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+          // If yes, then we're done for this step. Remember our insertion point
+          // and move to the next end index. At this point, we have an implicit
+          // suffix tree.
+          if (NeedsLink && !Active.Node->isRoot()) {
+            NeedsLink->Link = Active.Node;
+            NeedsLink = nullptr;
+          }
+
+          Active.Len++;
+          break;
+        }
+
+        // The string we're trying to insert isn't a substring of the next node,
+        // but matches up to a point. Split the node.
+        //
+        // For example, say we ended our search at a node n and we're trying to
+        // insert ABD. Then we'll create a new node s for AB, reduce n to just
+        // representing C, and insert a new leaf node l to represent D. This
+        // allows us to ensure that if n was a leaf, it remains a leaf.
+        //
+        //   | ABC  ---split--->  | AB
+        //   n                    s
+        //                     C / \ D
+        //                      n   l
+
+        // The node s from the diagram
+        SuffixTreeNode *SplitNode =
+            insertInternalNode(Active.Node,
+                               NextNode->StartIdx,
+                               NextNode->StartIdx + Active.Len - 1,
+                               FirstChar);
+
+        // Insert the new node representing the new substring into the tree as
+        // a child of the split node. This is the node l from the diagram.
+        insertLeaf(*SplitNode, EndIdx, LastChar);
+
+        // Make the old node a child of the split node and update its start
+        // index. This is the node n from the diagram.
+        NextNode->StartIdx += Active.Len;
+        NextNode->Parent = SplitNode;
+        SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+        // SplitNode is an internal node, update the suffix link.
+        if (NeedsLink)
+          NeedsLink->Link = SplitNode;
+
+        NeedsLink = SplitNode;
+      }
+
+      // We've added something new to the tree, so there's one less suffix to
+      // add.
+      SuffixesToAdd--;
+
+      if (Active.Node->isRoot()) {
+        if (Active.Len > 0) {
+          Active.Len--;
+          Active.Idx = EndIdx - SuffixesToAdd + 1;
+        }
+      } else {
+        // Start the next phase at the next smallest suffix.
+        Active.Node = Active.Node->Link;
+      }
+    }
+
+    return SuffixesToAdd;
+  }
+
+public:
+
+  /// Find all repeated substrings that satisfy \p BenefitFn.
+  ///
+  /// If a substring appears at least twice, then it must be represented by
+  /// an internal node which appears in at least two suffixes. Each suffix is
+  /// represented by a leaf node. To do this, we visit each internal node in
+  /// the tree, using the leaf children of each internal node. If an internal
+  /// node represents a beneficial substring, then we use each of its leaf
+  /// children to find the locations of its substring.
+  ///
+  /// \param[out] CandidateList Filled with candidates representing each
+  /// beneficial substring.
+  /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions, one
+  /// for each type of candidate.
+  /// \param BenefitFn The function to satisfy.
+  ///
+  /// \returns The length of the longest candidate found.
+  size_t findCandidates(std::vector<Candidate> &CandidateList,
+                        std::vector<OutlinedFunction> &FunctionList,
+                        const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)>
+                            &BenefitFn) {
+
+    CandidateList.clear();
+    FunctionList.clear();
+    size_t FnIdx = 0;
+    size_t MaxLen = 0;
+
+    for (SuffixTreeNode* Leaf : LeafVector) {
+      assert(Leaf && "Leaves in LeafVector cannot be null!");
+      if (!Leaf->IsInTree)
+        continue;
+
+      assert(Leaf->Parent && "All leaves must have parents!");
+      SuffixTreeNode &Parent = *(Leaf->Parent);
+
+      // If it doesn't appear enough, or we already outlined from it, skip it.
+      if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
+        continue;
+
+      size_t StringLen = Leaf->ConcatLen - Leaf->size();
+
+      // How many instructions would outlining this string save?
+      unsigned Benefit = BenefitFn(Parent,
+                                   StringLen, Str[Leaf->SuffixIdx + StringLen - 1]);
+
+      // If it's not beneficial, skip it.
+      if (Benefit < 1)
+        continue;
+
+      if (StringLen > MaxLen)
+        MaxLen = StringLen;
+
+      unsigned OccurrenceCount = 0;
+      for (auto &ChildPair : Parent.Children) {
+        SuffixTreeNode *M = ChildPair.second;
+
+        // Is it a leaf? If so, we have an occurrence of this candidate.
+        if (M && M->IsInTree && M->isLeaf()) {
+          OccurrenceCount++;
+          CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx);
+          CandidateList.back().Benefit = Benefit;
+          M->IsInTree = false;
+        }
+      }
+
+      // Save the function for the new candidate sequence.
+      std::vector<unsigned> CandidateSequence;
+      for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+        CandidateSequence.push_back(Str[i]);
+
+      FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence,
+                                Benefit, false);
+
+      // Move to the next function.
+      FnIdx++;
+      Parent.IsInTree = false;
+    }
+
+    return MaxLen;
+  }
+
+  /// Construct a suffix tree from a sequence of unsigned integers.
+  ///
+  /// \param Str The string to construct the suffix tree for.
+  SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+    Root->IsInTree = true;
+    Active.Node = Root;
+    LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+    // Keep track of the number of suffixes we have to add of the current
+    // prefix.
+    size_t SuffixesToAdd = 0;
+    Active.Node = Root;
+
+    // Construct the suffix tree iteratively on each prefix of the string.
+    // PfxEndIdx is the end index of the current prefix.
+    // End is one past the last element in the string.
+    for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+      SuffixesToAdd++;
+      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+    }
+
+    // Set the suffix indices of each leaf.
+    assert(Root && "Root node can't be nullptr!");
+    setSuffixIndices(*Root, 0);
+  }
+};
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+  /// \brief The next available integer to assign to a \p MachineInstr that
+  /// cannot be outlined.
+  ///
+  /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+  unsigned IllegalInstrNumber = -3;
+
+  /// \brief The next available integer to assign to a \p MachineInstr that can
+  /// be outlined.
+  unsigned LegalInstrNumber = 0;
+
+  /// Correspondence from \p MachineInstrs to unsigned integers.
+  DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+      InstructionIntegerMap;
+
+  /// Correspondence from unsigned integers to \p MachineInstrs.
+  /// Inverse of \p InstructionIntegerMap.
+  DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+
+  /// The vector of unsigned integers that the module is mapped to.
+  std::vector<unsigned> UnsignedVec;
+
+  /// \brief Stores the location of the instruction associated with the integer
+  /// at index i in \p UnsignedVec for each index i.
+  std::vector<MachineBasicBlock::iterator> InstrList;
+
+  /// \brief Maps \p *It to a legal integer.
+  ///
+  /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
+  /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+  ///
+  /// \returns The integer that \p *It was mapped to.
+  unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+
+    // Get the integer for this instruction or give it the current
+    // LegalInstrNumber.
+    InstrList.push_back(It);
+    MachineInstr &MI = *It;
+    bool WasInserted;
+    DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+        ResultIt;
+    std::tie(ResultIt, WasInserted) =
+        InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+    unsigned MINumber = ResultIt->second;
+
+    // There was an insertion.
+    if (WasInserted) {
+      LegalInstrNumber++;
+      IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
+    }
+
+    UnsignedVec.push_back(MINumber);
+
+    // Make sure we don't overflow or use any integers reserved by the DenseMap.
+    if (LegalInstrNumber >= IllegalInstrNumber)
+      report_fatal_error("Instruction mapping overflow!");
+
+    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey()
+           && "Tried to assign DenseMap tombstone or empty key to instruction.");
+    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey()
+           && "Tried to assign DenseMap tombstone or empty key to instruction.");
+
+    return MINumber;
+  }
+
+  /// Maps \p *It to an illegal integer.
+  ///
+  /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+  ///
+  /// \returns The integer that \p *It was mapped to.
+  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+    unsigned MINumber = IllegalInstrNumber;
+
+    InstrList.push_back(It);
+    UnsignedVec.push_back(IllegalInstrNumber);
+    IllegalInstrNumber--;
+
+    assert(LegalInstrNumber < IllegalInstrNumber &&
+           "Instruction mapping overflow!");
+
+    assert(IllegalInstrNumber !=
+           DenseMapInfo<unsigned>::getEmptyKey() &&
+           "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+    assert(IllegalInstrNumber !=
+           DenseMapInfo<unsigned>::getTombstoneKey() &&
+           "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+    return MINumber;
+  }
+
+  /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+  /// and appends it to \p UnsignedVec and \p InstrList.
+  ///
+  /// Two instructions are assigned the same integer if they are identical.
+  /// If an instruction is deemed unsafe to outline, then it will be assigned a
+  /// unique integer. The resulting mapping is placed into a suffix tree and
+  /// queried for candidates.
+  ///
+  /// \param MBB The \p MachineBasicBlock to be translated into integers.
+  /// \param TRI \p TargetRegisterInfo for the module.
+  /// \param TII \p TargetInstrInfo for the module.
+  void convertToUnsignedVec(MachineBasicBlock &MBB,
+                            const TargetRegisterInfo &TRI,
+                            const TargetInstrInfo &TII) {
+    for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
+         It++) {
+
+      // Keep track of where this instruction is in the module.
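+      // As an illustration, a block of [add, add, call, add] where the adds
+      // are identical and the call is unsafe to outline might map to
+      // [0, 0, 4294967293, 0]: repeated legal instructions share an integer,
+      // while each illegal instruction consumes a fresh integer counting down
+      // from -3 that can never repeat.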
+      switch (TII.getOutliningType(*It)) {
+      case TargetInstrInfo::MachineOutlinerInstrType::Illegal:
+        mapToIllegalUnsigned(It);
+        break;
+
+      case TargetInstrInfo::MachineOutlinerInstrType::Legal:
+        mapToLegalUnsigned(It);
+        break;
+
+      case TargetInstrInfo::MachineOutlinerInstrType::Invisible:
+        break;
+      }
+    }
+
+    // After we're done with every insertion, uniquely terminate this part of
+    // the "string". This makes sure we won't match across basic block or
+    // function boundaries since the "end" is encoded uniquely and thus appears
+    // in no repeated substring.
+    InstrList.push_back(MBB.end());
+    UnsignedVec.push_back(IllegalInstrNumber);
+    IllegalInstrNumber--;
+  }
+
+  InstructionMapper() {
+    // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+    // changed.
+    assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+           "DenseMapInfo<unsigned>'s empty key isn't -1!");
+    assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+           "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+  }
+};
+
+/// \brief An interprocedural pass which finds repeated sequences of
+/// instructions and replaces them with calls to functions.
+///
+/// Each instruction is mapped to an unsigned integer and placed in a string.
+/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree
+/// is then repeatedly queried for repeated sequences of instructions. Each
+/// non-overlapping repeated sequence is then placed in its own
+/// \p MachineFunction and each instance is then replaced with a call to that
+/// function.
+struct MachineOutliner : public ModulePass {
+
+  static char ID;
+
+  StringRef getPassName() const override { return "Machine Outliner"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfo>();
+    AU.addPreserved<MachineModuleInfo>();
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  MachineOutliner() : ModulePass(ID) {
+    initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// \brief Replace the sequences of instructions represented by the
+  /// \p Candidates in \p CandidateList with calls to \p MachineFunctions
+  /// described in \p FunctionList.
+  ///
+  /// \param M The module we are outlining from.
+  /// \param CandidateList A list of candidates to be outlined.
+  /// \param FunctionList A list of functions to be inserted into the module.
+  /// \param Mapper Contains the instruction mappings for the module.
+  bool outline(Module &M, const ArrayRef<Candidate> &CandidateList,
+               std::vector<OutlinedFunction> &FunctionList,
+               InstructionMapper &Mapper);
+
+  /// Creates a function for \p OF and inserts it into the module.
+  MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+                                          InstructionMapper &Mapper);
+
+  /// Find potential outlining candidates and store them in \p CandidateList.
+  ///
+  /// For each type of potential candidate, also build an \p OutlinedFunction
+  /// struct containing the information to build the function for that
+  /// candidate.
+  ///
+  /// \param[out] CandidateList Filled with outlining candidates for the module.
+  /// \param[out] FunctionList Filled with functions corresponding to each type
+  /// of \p Candidate.
+  /// \param ST The suffix tree for the module.
+  /// \param TII TargetInstrInfo for the module.
+  ///
+  /// \returns The length of the longest candidate found. 0 if there are none.
+  unsigned buildCandidateList(std::vector<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              SuffixTree &ST,
+                              InstructionMapper &Mapper,
+                              const TargetInstrInfo &TII);
+
+  /// \brief Remove any overlapping candidates that weren't handled by the
+  /// suffix tree's pruning method.
+  ///
+  /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
+  /// If a short candidate is chosen for outlining, and then a longer candidate
+  /// which has that short candidate as a suffix is chosen, the tree's pruning
+  /// method will not find it. Thus, we need to prune before outlining as well.
+  ///
+  /// \param[in,out] CandidateList A list of outlining candidates.
+  /// \param[in,out] FunctionList A list of functions to be outlined.
+  /// \param MaxCandidateLen The length of the longest candidate.
+  /// \param TII TargetInstrInfo for the module.
+  void pruneOverlaps(std::vector<Candidate> &CandidateList,
+                     std::vector<OutlinedFunction> &FunctionList,
+                     unsigned MaxCandidateLen,
+                     const TargetInstrInfo &TII);
+
+  /// Construct a suffix tree on the instructions in \p M and outline repeated
+  /// strings from that tree.
+  bool runOnModule(Module &M) override;
+};
+
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0;
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); }
+}
+
+INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE,
+                "Machine Function Outliner", false, false)
+
+void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    unsigned MaxCandidateLen,
+                                    const TargetInstrInfo &TII) {
+  // TODO: Experiment with interval trees or other interval-checking structures
+  // to lower the time complexity of this function.
+  // TODO: Can we do better than the simple greedy choice?
+  // Check for overlaps in the range.
+  // This is O(MaxCandidateLen * CandidateList.size()).
+  for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
+       It++) {
+    Candidate &C1 = *It;
+    OutlinedFunction &F1 = FunctionList[C1.FunctionIdx];
+
+    // If we removed this candidate, skip it.
+    if (!C1.InCandidateList)
+      continue;
+
+    // Is it still worth it to outline C1?
+    if (F1.Benefit < 1 || F1.OccurrenceCount < 2) {
+      assert(F1.OccurrenceCount > 0 &&
+             "Can't remove OutlinedFunction with no occurrences!");
+      F1.OccurrenceCount--;
+      C1.InCandidateList = false;
+      continue;
+    }
+
+    // The minimum start index of any candidate that could overlap with this
+    // one.
+    unsigned FarthestPossibleIdx = 0;
+
+    // Either the index is 0, or it's at most MaxCandidateLen indices away.
+    if (C1.StartIdx > MaxCandidateLen)
+      FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen;
+
+    // Compare against the candidates in the list that start no earlier than
+    // FarthestPossibleIdx. There are at most MaxCandidateLen of these.
+    for (auto Sit = It + 1; Sit != Et; Sit++) {
+      Candidate &C2 = *Sit;
+      OutlinedFunction &F2 = FunctionList[C2.FunctionIdx];
+
+      // Is this candidate too far away to overlap?
+      if (C2.StartIdx < FarthestPossibleIdx)
+        break;
+
+      // Did we already remove this candidate in a previous step?
+      if (!C2.InCandidateList)
+        continue;
+
+      // Is the function beneficial to outline?
+      if (F2.OccurrenceCount < 2 || F2.Benefit < 1) {
+        // If not, remove this candidate and move to the next one.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        C2.InCandidateList = false;
+        continue;
+      }
+
+      size_t C2End = C2.StartIdx + C2.Len - 1;
+
+      // Do C1 and C2 overlap?
+      //
+      // Not overlapping:
+      // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
+      //
+      // We sorted our candidate list so C2Start <= C1Start. We know that
+      // C2End > C2Start since each candidate has length >= 2. Therefore, all
+      // we have to check is whether C2End < C1Start: if so, the candidates
+      // don't overlap.
+      if (C2End < C1.StartIdx)
+        continue;
+
+      // C1 and C2 overlap.
+      // We need to choose the better of the two.
+      //
+      // Approximate this by picking the one which would have saved us the
+      // most instructions before any pruning.
+      if (C1.Benefit >= C2.Benefit) {
+
+        // C1 is better, so remove C2 and update C2's OutlinedFunction to
+        // reflect the removal.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+                                             F2.OccurrenceCount,
+                                             F2.IsTailCall
+                                             );
+
+        C2.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C2. \n";
+          dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+          dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+        );
+
+      } else {
+        // C2 is better, so remove C1 and update C1's OutlinedFunction to
+        // reflect the removal.
+        assert(F1.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F1.OccurrenceCount--;
+        F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+                                             F1.OccurrenceCount,
+                                             F1.IsTailCall
+                                             );
+        C1.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C1. \n";
+          dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+          dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+        );
+
+        // C1 is out, so we don't have to compare it against anyone else.
+        break;
+      }
+    }
+  }
+}
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    SuffixTree &ST,
+                                    InstructionMapper &Mapper,
+                                    const TargetInstrInfo &TII) {
+
+  std::vector<unsigned> CandidateSequence; // Current outlining candidate.
+  size_t MaxCandidateLen = 0; // Length of the longest candidate.
+
+  // Function for maximizing query in the suffix tree.
+  // This allows us to define more fine-grained types of things to outline in
+  // the target without putting target-specific info in the suffix tree.
+  auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+                                   size_t StringLen, unsigned EndVal) {
+
+    // The root represents the empty string.
+    if (Curr.isRoot())
+      return 0u;
+
+    // Is this long enough to outline?
+    // TODO: Let the target decide how "long" a string is in terms of the sizes
+    // of the instructions in the string. For example, if a call instruction
+    // is smaller than a one instruction string, we should outline that string.
+    if (StringLen < 2)
+      return 0u;
+
+    size_t Occurrences = Curr.OccurrenceCount;
+
+    // Anything we want to outline has to appear at least twice.
+    if (Occurrences < 2)
+      return 0u;
+
+    // Check if the last instruction in the sequence is a return.
+    MachineInstr *LastInstr =
+        Mapper.IntegerInstructionMap[EndVal];
+    assert(LastInstr && "Last instruction in sequence was unmapped!");
+
+    // The only way a terminator could be mapped as legal is if it was safe to
+    // tail call.
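+    // Hence a sequence ending in such a terminator can be outlined as a tail
+    // call instead of being rejected outright.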
+    bool IsTailCall = LastInstr->isTerminator();
+    return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall);
+  };
+
+  MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn);
+
+  for (auto &OF : FunctionList)
+    OF.IsTailCall =
+        Mapper.IntegerInstructionMap[OF.Sequence.back()]->isTerminator();
+
+  // Sort the candidates in descending order. This will simplify the outlining
+  // process when we have to remove the candidates from the mapping by
+  // allowing us to cut them out without keeping track of an offset.
+  std::stable_sort(CandidateList.begin(), CandidateList.end());
+
+  return MaxCandidateLen;
+}
+
+MachineFunction *
+MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+                                        InstructionMapper &Mapper) {
+
+  // Create the function name. This should be unique. For now, just number
+  // the outlined functions in the order they were created.
+  std::ostringstream NameStream;
+  NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name;
+
+  // Create the function using an IR-level function.
+  LLVMContext &C = M.getContext();
+  Function *F = dyn_cast<Function>(
+      M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C)));
+  assert(F && "Function was null!");
+
+  // NOTE: If this is linkonceodr, then we can take advantage of linker deduping
+  // which gives us better results when we outline from linkonceodr functions.
+  F->setLinkage(GlobalValue::PrivateLinkage);
+  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRBuilder<> Builder(EntryBB);
+  Builder.CreateRetVoid();
+
+  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+  MachineFunction &MF = MMI.getMachineFunction(*F);
+  MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  // Insert the new function into the module.
+  MF.insert(MF.begin(), &MBB);
+
+  TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall);
+
+  // Copy over the instructions for the function using the integer mappings in
+  // its sequence.
+  for (unsigned Str : OF.Sequence) {
+    MachineInstr *NewMI =
+        MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+    NewMI->dropMemRefs();
+
+    // Don't keep debug information for outlined instructions.
+    // FIXME: This means outlined functions are currently undebuggable.
+    NewMI->setDebugLoc(DebugLoc());
+    MBB.insert(MBB.end(), NewMI);
+  }
+
+  TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall);
+
+  return &MF;
+}
+
+bool MachineOutliner::outline(Module &M,
+                              const ArrayRef<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              InstructionMapper &Mapper) {
+
+  bool OutlinedSomething = false;
+
+  // Replace the candidates with calls to their respective outlined functions.
+  for (const Candidate &C : CandidateList) {
+
+    // Was the candidate removed during pruneOverlaps?
+    if (!C.InCandidateList)
+      continue;
+
+    // If not, then look at its OutlinedFunction.
+    OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+
+    // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+    if (OF.OccurrenceCount < 2 || OF.Benefit < 1)
+      continue;
+
+    // If not, then outline it.
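+    // Outlining happens in place: find the candidate's boundaries in the
+    // instruction list, create the outlined function if it doesn't exist yet,
+    // insert a call to it, and erase the original range of instructions.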
+ assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent(); + MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx]; + unsigned EndIdx = C.StartIdx + C.Len - 1; + + assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!"); + MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; + assert(EndIt != MBB->end() && "EndIt out of bounds!"); + + EndIt++; // Erase needs one past the end index. + + // Does this candidate have a function yet? + if (!OF.MF) { + OF.MF = createOutlinedFunction(M, OF, Mapper); + FunctionsCreated++; + } + + MachineFunction *MF = OF.MF; + const TargetSubtargetInfo &STI = MF->getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + // Insert a call to the new function and erase the old sequence. + TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall); + StartIt = Mapper.InstrList[C.StartIdx]; + MBB->erase(StartIt, EndIt); + + OutlinedSomething = true; + + // Statistics. + NumOutlined++; + } + + DEBUG ( + dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n"; + ); + + return OutlinedSomething; +} + +bool MachineOutliner::runOnModule(Module &M) { + + // Is there anything in the module at all? + if (M.empty()) + return false; + + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>(); + const TargetSubtargetInfo &STI = MMI.getMachineFunction(*M.begin()) + .getSubtarget(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + + InstructionMapper Mapper; + + // Build instruction mappings for each function in the module. + for (Function &F : M) { + MachineFunction &MF = MMI.getMachineFunction(F); + + // Is the function empty? Safe to outline from? + if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF)) + continue; + + // If it is, look at each MachineBasicBlock in the function. + for (MachineBasicBlock &MBB : MF) { + + // Is there anything in MBB? + if (MBB.empty()) + continue; + + // If yes, map it. + Mapper.convertToUnsignedVec(MBB, *TRI, *TII); + } + } + + // Construct a suffix tree, use it to find candidates, and then outline them. + SuffixTree ST(Mapper.UnsignedVec); + std::vector<Candidate> CandidateList; + std::vector<OutlinedFunction> FunctionList; + + // Find all of the outlining candidates. + unsigned MaxCandidateLen = + buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII); + + // Remove candidates that overlap with other candidates. + pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII); + + // Outline each of the candidates and return true if something was outlined. + return outline(M, CandidateList, FunctionList, Mapper); +} diff --git a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp index 43a18099d39a..8f5ac8b3fc45 100644 --- a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -552,7 +552,9 @@ public: os << "\n"; } - void dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; /// This class repesents the scheduled code. The main data structure is a @@ -593,7 +595,7 @@ private: /// Virtual register information. 
MachineRegisterInfo &MRI; - DFAPacketizer *Resources; + std::unique_ptr<DFAPacketizer> Resources; public: SMSchedule(MachineFunction *mf) @@ -604,13 +606,6 @@ public: InitiationInterval = 0; } - ~SMSchedule() { - ScheduledInstrs.clear(); - InstrToCycle.clear(); - RegToStageDiff.clear(); - delete Resources; - } - void reset() { ScheduledInstrs.clear(); InstrToCycle.clear(); @@ -720,13 +715,13 @@ char MachinePipeliner::ID = 0; int MachinePipeliner::NumTries = 0; #endif char &llvm::MachinePipelinerID = MachinePipeliner::ID; -INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) /// The "main" function for implementing Swing Modulo Scheduling. @@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { return false; if (mf.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + AttributeList::FunctionIndex, Attribute::OptimizeForSize) && !EnableSWPOptSize.getPosition()) return false; @@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) if (Phi.getOperand(i + 1).getMBB() != Loop) InitVal = Phi.getOperand(i).getReg(); - else if (Phi.getOperand(i + 1).getMBB() == Loop) + else LoopVal = Phi.getOperand(i).getReg(); assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure."); @@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis( MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap, InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum, bool IsLast) { - // Compute the stage number for the inital value of the Phi, which + // Compute the stage number for the initial value of the Phi, which // comes from the prolog. The prolog to use depends on to which kernel/ // epilog that we're adding the Phi. unsigned PrologStage = 0; @@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep, // increment value to determine if the accesses may be loop carried. if (OffsetS >= OffsetD) return OffsetS + AccessSizeS > DeltaS; - else if (OffsetS < OffsetD) + else return OffsetD + AccessSizeD > DeltaD; return true; @@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Utility function used for debugging to print the schedule. 
-void SMSchedule::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); } +#endif diff --git a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp index fc32183c7f63..71ad4e6aa7f5 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -1,10 +1,9 @@ - #include "llvm/CodeGen/MachineRegionInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" -#define DEBUG_TYPE "region" +#define DEBUG_TYPE "machine-region-info" using namespace llvm; @@ -86,6 +85,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { auto DF = &getAnalysis<MachineDominanceFrontier>(); RI.recalculate(F, DT, PDT, DF); + + DEBUG(RI.dump()); + return false; } @@ -103,9 +105,10 @@ void MachineRegionInfoPass::verifyAnalysis() const { void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequiredTransitive<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<DominanceFrontierWrapperPass>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + MachineFunctionPass::getAnalysisUsage(AU); } void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const { @@ -119,14 +122,15 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; +char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; -INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) -INITIALIZE_PASS_END(MachineRegionInfoPass, "regions", - "Detect single entry single exit regions", true, true) +INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, + "Detect single entry single exit regions", true, true) // Create methods available outside of this file, to use them // "include/llvm/LinkAllPasses.h". 
Otherwise the pass would be deleted by diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 242cb0b80953..128910f8eb2a 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -1,4 +1,4 @@ -//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===// +//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,13 +11,27 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/Support/raw_os_ostream.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> using namespace llvm; @@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden, void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF) - : MF(MF), TheDelegate(nullptr), - TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && - EnableSubRegLiveness) { + : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() && + EnableSubRegLiveness), + IsUpdatedCSRsInitialized(false) { unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); @@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { return TRC.getLaneMask(); } -#ifndef NDEBUG -void MachineRegisterInfo::dumpUses(unsigned Reg) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const { for (MachineInstr &I : use_instructions(Reg)) I.dump(); } @@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const { } return false; } + +void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) { + + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + assert(Reg && (Reg < TRI->getNumRegs()) && + "Trying to disable an invalid register"); + + if (!IsUpdatedCSRsInitialized) { + const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); + for (const MCPhysReg *I = CSR; *I; ++I) + UpdatedCSRs.push_back(*I); + + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + + IsUpdatedCSRsInitialized = true; + } + + // Remove the register (and its aliases from the list). 
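+  // std::remove shifts the surviving entries forward and returns the new
+  // logical end; erase then trims the leftover tail (the erase-remove idiom).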
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI), + UpdatedCSRs.end()); +} + +const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const { + if (IsUpdatedCSRsInitialized) + return UpdatedCSRs.data(); + + return getTargetRegisterInfo()->getCalleeSavedRegs(MF); +} + +void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { + if (IsUpdatedCSRsInitialized) + UpdatedCSRs.clear(); + + for (MCPhysReg Reg : CSRs) + UpdatedCSRs.push_back(Reg); + + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); + IsUpdatedCSRsInitialized = true; +} diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index e06bc517fa91..edc3783afa2f 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -12,30 +12,67 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/PriorityQueue.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <string> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace llvm { + cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, cl::desc("Force top-down list scheduling")); cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, @@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, cl::opt<bool> DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); 
-} + +} // end namespace llvm #ifndef NDEBUG static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, @@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, cl::desc("Enable memop clustering."), cl::init(true)); -// Experimental heuristics -static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); @@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8; // Pin the vtables to this file. void MachineSchedStrategy::anchor() {} + void ScheduleDAGMutation::anchor() {} //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// -MachineSchedContext::MachineSchedContext(): - MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) { +MachineSchedContext::MachineSchedContext() { RegClassInfo = new RegisterClassInfo(); } @@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() { } namespace { + /// Base class for a machine scheduler class that can run at any point. class MachineSchedulerBase : public MachineSchedContext, public MachineFunctionPass { @@ -149,18 +184,20 @@ public: protected: ScheduleDAGInstrs *createPostMachineScheduler(); }; -} // namespace + +} // end anonymous namespace char MachineScheduler::ID = 0; char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() @@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { /// MachineSchedOpt allows command line selection of the scheduler. static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, - RegisterPassParser<MachineSchedRegistry> > + RegisterPassParser<MachineSchedRegistry>> MachineSchedOpt("misched", cl::init(&useDefaultMachineSched), cl::Hidden, cl::desc("Machine instruction scheduler to use")); @@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // instruction stream until we find the nearest boundary. unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; - for (;I != MBB->begin(); --I) { + for (; I != MBB->begin(); --I) { MachineInstr &MI = *std::prev(I); if (isSchedBoundary(&MI, &*MBB, MF, TII)) break; @@ -495,7 +532,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // thumb2 size reduction is currently an exception, so the PostMIScheduler // needs to do this. 
if (FixKillFlags) - Scheduler.fixupKills(&*MBB); + Scheduler.fixupKills(*MBB); } Scheduler.finalizeSchedule(); } @@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { // unimplemented } -LLVM_DUMP_METHOD -void ReadyQueue::dump() { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ReadyQueue::dump() { dbgs() << "Queue " << Name << ": "; for (unsigned i = 0, e = Queue.size(); i < e; ++i) dbgs() << Queue[i]->NodeNum << " "; dbgs() << "\n"; } +#endif //===----------------------------------------------------------------------===// // ScheduleDAGMI - Basic machine instruction scheduling. This is @@ -519,8 +557,7 @@ void ReadyQueue::dump() { // ===----------------------------------------------------------------------===/ // Provide a vtable anchor. -ScheduleDAGMI::~ScheduleDAGMI() { -} +ScheduleDAGMI::~ScheduleDAGMI() = default; bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU); @@ -825,7 +862,7 @@ void ScheduleDAGMI::placeDebugValues() { RegionBegin = FirstDbgValue; } - for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator + for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI); MachineInstr *DbgValue = P.first; @@ -841,7 +878,7 @@ void ScheduleDAGMI::placeDebugValues() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void ScheduleDAGMI::dumpSchedule() const { +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) { if (SUnit *SU = getSUnit(&(*MI))) SU->dump(this); @@ -1012,7 +1049,7 @@ updateScheduledPressure(const SUnit *SU, ++CritIdx; if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc() - && NewMaxPressure[ID] <= INT16_MAX) + && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max()) RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]); } unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID); @@ -1136,6 +1173,12 @@ void ScheduleDAGMILive::schedule() { dbgs() << " Pressure Diff : "; getPressureDiff(&SU).dump(*TRI); } + dbgs() << " Single Issue : "; + if (SchedModel.mustBeginGroup(SU.getInstr()) && + SchedModel.mustEndGroup(SU.getInstr())) + dbgs() << "true;"; + else + dbgs() << "false;"; dbgs() << '\n'; } if (ExitSU.getInstr() != nullptr) @@ -1396,6 +1439,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create cluster edges between neighboring /// loads or between neighboring stores. class BaseMemOpClusterMutation : public ScheduleDAGMutation { @@ -1403,6 +1447,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation { SUnit *SU; unsigned BaseReg; int64_t Offset; + MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) : SU(su), BaseReg(reg), Offset(ofs) {} @@ -1439,25 +1484,26 @@ public: LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri) : BaseMemOpClusterMutation(tii, tri, true) {} }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? 
make_unique<LoadClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI) : nullptr; } std::unique_ptr<ScheduleDAGMutation> createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI) + return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI) : nullptr; } -} // namespace llvm +} // end namespace llvm void BaseMemOpClusterMutation::clusterNeighboringMemOps( ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { @@ -1543,80 +1589,11 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { } //===----------------------------------------------------------------------===// -// MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - -namespace { -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. -class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo &TII; -public: - MacroFusion(const TargetInstrInfo &TII) - : TII(TII) {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; -} // anonymous - -namespace llvm { - -std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(const TargetInstrInfo *TII) { - return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr; -} - -} // namespace llvm - -/// \brief Callback from DAG postProcessing to create cluster edges to encourage -/// fused operations. -void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!TII.shouldScheduleAdjacent(Pred, *Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - } - for (SDep &SuccDep : SU.Succs) { - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - } - - DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); - break; - } -} - -//===----------------------------------------------------------------------===// // CopyConstrain - DAG post-processing to encourage copy elimination. //===----------------------------------------------------------------------===// namespace { + /// \brief Post-process the DAG to create weak edges from all uses of a copy to /// the one use that defines the copy's source vreg, most likely an induction /// variable increment. 
@@ -1626,6 +1603,7 @@ class CopyConstrain : public ScheduleDAGMutation { // RegionEndIdx is the slot index of the last non-debug instruction in the // scheduling region. So we may have RegionBeginIdx == RegionEndIdx. SlotIndex RegionEndIdx; + public: CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} @@ -1634,17 +1612,18 @@ public: protected: void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); }; -} // anonymous + +} // end anonymous namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createCopyConstrainDAGMutation(const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) { - return make_unique<CopyConstrain>(TII, TRI); + const TargetRegisterInfo *TRI) { + return llvm::make_unique<CopyConstrain>(TII, TRI); } -} // namespace llvm +} // end namespace llvm /// constrainLocalCopy handles two possibilities: /// 1) Local src: @@ -1836,7 +1815,7 @@ void SchedBoundary::reset() { CheckPending = false; CurrCycle = 0; CurrMOps = 0; - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); ExpectedLatency = 0; DependentLatency = 0; RetiredMOps = 0; @@ -1937,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) { && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { return true; } + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; } + + if (CurrMOps > 0 && + ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { + DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " + << (isTop()? "begin" : "end") << " group\n"); + return true; + } + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { const MCSchedClassDesc *SC = DAG->getSchedClass(SU); for (TargetSchedModel::ProcResIter @@ -2039,7 +2028,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { /// Move the boundary of scheduled code by one cycle. void SchedBoundary::bumpCycle(unsigned NextCycle) { if (SchedModel->getMicroOpBufferSize() == 0) { - assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized"); + assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && + "MinReadyCycle uninitialized"); if (MinReadyCycle > NextCycle) NextCycle = MinReadyCycle; } @@ -2237,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) { // one cycle. Since we commonly reach the max MOps here, opportunistically // bump the cycle to avoid uselessly checking everything in the readyQ. CurrMOps += IncMOps; + + // Bump the cycle count for issue group constraints. + // This must be done after NextCycle has been adjust for all other stalls. + // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set + // currCycle to X. + if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { + DEBUG(dbgs() << " Bump cycle to " + << (isTop() ? "end" : "begin") << " group\n"); + bumpCycle(++NextCycle); + } + while (CurrMOps >= SchedModel->getIssueWidth()) { DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " << CurrCycle << '\n'); @@ -2250,7 +2252,7 @@ void SchedBoundary::bumpNode(SUnit *SU) { void SchedBoundary::releasePending() { // If the available queue is empty, it is safe to reset MinReadyCycle. 
if (Available.empty()) - MinReadyCycle = UINT_MAX; + MinReadyCycle = std::numeric_limits<unsigned>::max(); // Check to see if any of the pending instructions are ready to issue. If // so, add them to the available queue. @@ -2323,10 +2325,10 @@ SUnit *SchedBoundary::pickOnlyChoice() { return nullptr; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // This is useful information to dump after bumpNode. // Note that the Queue contents are more useful before pickNodeFromQueue. -void SchedBoundary::dumpScheduledState() { +LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() { unsigned ResFactor; unsigned ResCount; if (ZoneCritResIdx) { @@ -2666,11 +2668,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } void GenericScheduler::dumpPolicy() { + // Cannot completely remove virtual function even in release mode. +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dbgs() << "GenericScheduler RegionPolicy: " << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure << " OnlyTopDown=" << RegionPolicy.OnlyTopDown << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp << "\n"; +#endif } /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic @@ -2724,7 +2729,7 @@ void GenericScheduler::registerRoots() { errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; } - if (EnableCyclicPath) { + if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) { Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); checkAcyclicLatency(); } @@ -3106,7 +3111,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { } void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { - MachineBasicBlock::iterator InsertPos = SU->getInstr(); if (!isTop) ++InsertPos; @@ -3154,7 +3158,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// Create the standard converging machine scheduler. This will be used as the /// default scheduler if the target does not set a default. ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)); + ScheduleDAGMILive *DAG = + new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)); // Register DAG post-processors. // // FIXME: extend the mutation API to allow earlier mutations to instantiate @@ -3195,7 +3200,6 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { } } - void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); @@ -3229,6 +3233,12 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) return; + // Keep clustered nodes together. + if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), + Cand.SU == DAG->getNextClusterSucc(), + TryCand, Cand, Cluster)) + return; + // Avoid critical resource consumption and balance the schedule. 
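// Aside: the cluster tie-breaker added to PostGenericScheduler::tryCandidate
// above slots into an ordered chain of heuristics. Simplified sketch of the
// tryGreater mechanism it relies on (SketchCand stands in for SchedCandidate;
// smaller reason values are stronger):
struct SketchCand { int Reason; };
static bool tryGreaterSketch(int TryVal, int CandVal, SketchCand &TryCand,
                             SketchCand &Cand, int Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason; // The new candidate wins on this criterion.
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;  // The incumbent survives; record why.
    return true;
  }
  return false;              // Tie: fall through to weaker heuristics.
}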
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) @@ -3302,7 +3312,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), + return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C), /*RemoveKillFlags=*/true); } @@ -3311,14 +3321,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { //===----------------------------------------------------------------------===// namespace { + /// \brief Order nodes by the ILP metric. struct ILPOrder { - const SchedDFSResult *DFSResult; - const BitVector *ScheduledTrees; + const SchedDFSResult *DFSResult = nullptr; + const BitVector *ScheduledTrees = nullptr; bool MaximizeILP; - ILPOrder(bool MaxILP) - : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {} + ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {} /// \brief Apply a less-than relation on node priority. /// @@ -3347,12 +3357,13 @@ struct ILPOrder { /// \brief Schedule based on the ILP metric. class ILPScheduler : public MachineSchedStrategy { - ScheduleDAGMILive *DAG; + ScheduleDAGMILive *DAG = nullptr; ILPOrder Cmp; std::vector<SUnit*> ReadyQ; + public: - ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {} + ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {} void initialize(ScheduleDAGMI *dag) override { assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); @@ -3405,14 +3416,16 @@ public: std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true)); } static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false)); + return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false)); } + static MachineSchedRegistry ILPMaxRegistry( "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); static MachineSchedRegistry ILPMinRegistry( @@ -3424,6 +3437,7 @@ static MachineSchedRegistry ILPMinRegistry( #ifndef NDEBUG namespace { + /// Apply a less-than relation on the node order, which corresponds to the /// instruction order prior to scheduling. IsReverse implements greater-than. template<bool IsReverse> @@ -3444,11 +3458,12 @@ class InstructionShuffler : public MachineSchedStrategy { // Using a less-than relation (SUnitOrder<false>) for the TopQ priority // gives nodes with a higher number higher priority causing the latest // instructions to be scheduled first. - PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>> TopQ; // When scheduling bottom-up, use greater-than as the queue priority. 
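// Aside: ILPScheduler above keeps ReadyQ as a plain vector managed with the
// <algorithm> heap primitives instead of std::priority_queue, so it can
// re-heapify in place when ILP priorities change. Self-contained shape of
// that discipline (SketchNode is illustrative):
#include <algorithm>
#include <vector>
struct SketchNode { unsigned Id; unsigned ILP; };
struct ILPLessSketch {
  bool operator()(const SketchNode &A, const SketchNode &B) const {
    return A.ILP < B.ILP; // max-heap on the ILP metric
  }
};
static SketchNode popBestNode(std::vector<SketchNode> &ReadyQ) {
  std::pop_heap(ReadyQ.begin(), ReadyQ.end(), ILPLessSketch());
  SketchNode Best = ReadyQ.back();
  ReadyQ.pop_back();
  return Best;
}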
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>> BottomQ; + public: InstructionShuffler(bool alternate, bool topdown) : IsAlternating(alternate), IsTopDown(topdown) {} @@ -3492,15 +3507,18 @@ public: BottomQ.push(SU); } }; -} // namespace + +} // end anonymous namespace static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { bool Alternate = !ForceTopDown && !ForceBottomUp; bool TopDown = !ForceBottomUp; assert((TopDown || !ForceTopDown) && "-misched-topdown incompatible with -misched-bottomup"); - return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown)); + return new ScheduleDAGMILive( + C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown)); } + static MachineSchedRegistry ShufflerRegistry( "shuffle", "Shuffle machine instructions alternating directions", createInstructionShuffler); @@ -3518,8 +3536,7 @@ template<> struct GraphTraits< template<> struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const ScheduleDAG *G) { return G->MF.getName(); @@ -3576,7 +3593,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { return Str; } }; -} // namespace llvm + +} // end namespace llvm #endif // NDEBUG /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp index 5f87b68123f1..7c34e71a0cce 100644 --- a/contrib/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp @@ -173,14 +173,14 @@ namespace { char MachineSinking::ID = 0; char &llvm::MachineSinkingID = MachineSinking::ID; -INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB) { diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp index ef7e525e8165..01391a1a0e50 100644 --- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -1,4 +1,4 @@ -//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===// +//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,21 +7,35 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include 
"llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <tuple> +#include <utility> using namespace llvm; @@ -30,16 +44,14 @@ using namespace llvm; char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() - : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr), - MRI(nullptr), Loops(nullptr) { +MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); } @@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const { return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds); } - //===----------------------------------------------------------------------===// // Ensemble utility functions //===----------------------------------------------------------------------===// @@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct) } // Virtual destructor serves as an anchor. -MachineTraceMetrics::Ensemble::~Ensemble() {} +MachineTraceMetrics::Ensemble::~Ensemble() = default; const MachineLoop* MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { @@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) { // MinInstrCountEnsemble - Pick the trace that executes the least number of // instructions. namespace { + class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble { const char *getName() const override { return "MinInstr"; } const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override; @@ -306,7 +318,8 @@ public: MinInstrCountEnsemble(MachineTraceMetrics *mtm) : MachineTraceMetrics::Ensemble(mtm) {} }; -} + +} // end anonymous namespace // Select the preferred predecessor for MBB. const MachineBasicBlock* @@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const { // revisit blocks. 
namespace { + struct LoopBounds { MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks; SmallPtrSet<const MachineBasicBlock*, 8> Visited; const MachineLoopInfo *Loops; - bool Downward; + bool Downward = false; + LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks, - const MachineLoopInfo *loops) - : Blocks(blocks), Loops(loops), Downward(false) {} + const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {} }; -} + +} // end anonymous namespace // Specialize po_iterator_storage in order to prune the post-order traversal so // it is limited to the current loop and doesn't traverse the loop back edges. namespace llvm { + template<> class po_iterator_storage<LoopBounds, true> { LoopBounds &LB; + public: po_iterator_storage(LoopBounds &lb) : LB(lb) {} + void finishPostorder(const MachineBasicBlock*) {} bool insertEdge(Optional<const MachineBasicBlock *> From, @@ -452,7 +470,8 @@ public: return LB.Visited.insert(To).second; } }; -} + +} // end namespace llvm /// Compute the trace through MBB. void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { @@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const { // A data dependency is represented as a defining MI and operand numbers on the // defining and using MI. namespace { + struct DataDep { const MachineInstr *DefMI; unsigned DefOp; @@ -622,7 +642,8 @@ struct DataDep { assert((++DefI).atEnd() && "Register has multiple defs"); } }; -} + +} // end anonymous namespace // Get the input data dependencies that must be ready before UseMI can issue. // Return true if UseMI has any physreg operands. @@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI, // direction instructions are scanned, it could be the operand that defined the // regunit, or the highest operand to read the regunit. namespace { + struct LiveRegUnit { unsigned RegUnit; - unsigned Cycle; - const MachineInstr *MI; - unsigned Op; + unsigned Cycle = 0; + const MachineInstr *MI = nullptr; + unsigned Op = 0; unsigned getSparseSetIndex() const { return RegUnit; } - LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {} + LiveRegUnit(unsigned RU) : RegUnit(RU) {} }; -} + +} // end anonymous namespace // Identify physreg dependencies for UseMI, and update the live regunit // tracking set when scanning instructions downwards. @@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, return Height; } - typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap; // Push the height of DefMI upwards if required to match UseMI. 
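// Aside: the po_iterator_storage specialization in the hunk above is what
// lets the trace computation prune loop back edges. Roughly how the ensemble
// code drives it (a sketch; the real loops live in Ensemble::computeTrace
// and carry extra invalidation bookkeeping):
//
//   #include "llvm/ADT/PostOrderIterator.h"
//
//   LoopBounds Bounds(BlockInfo, Loops); // types from this hunk
//   Bounds.Downward = false;             // search upward for the trace head
//   for (const MachineBasicBlock *Pred : inverse_post_order_ext(MBB, Bounds))
//     /* pickTracePred(Pred) */;
//   Bounds.Downward = true;              // then downward for the trace tail
//   Bounds.Visited.clear();
//   for (const MachineBasicBlock *Succ : post_order_ext(MBB, Bounds))
//     /* pickTraceSucc(Succ) */;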
diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp index a98139f9e5af..265f93c363ca 100644 --- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp @@ -87,7 +87,6 @@ namespace { RegSet regsLive; RegVector regsDefined, regsDead, regsKilled; RegMaskVector regMasks; - RegSet regsLiveInButUnused; SlotIndex lastIndex; @@ -188,8 +187,9 @@ namespace { return Reg < regsReserved.size() && regsReserved.test(Reg); } - bool isAllocatable(unsigned Reg) { - return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg); + bool isAllocatable(unsigned Reg) const { + return Reg < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) && + !regsReserved.test(Reg); } // Analysis information if available @@ -260,8 +260,8 @@ namespace { static char ID; // Pass ID, replacement for typeid const std::string Banner; - MachineVerifierPass(const std::string &banner = nullptr) - : MachineFunctionPass(ID), Banner(banner) { + MachineVerifierPass(std::string banner = std::string()) + : MachineFunctionPass(ID), Banner(std::move(banner)) { initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); } @@ -418,7 +418,6 @@ unsigned MachineVerifier::verify(MachineFunction &MF) { regsDead.clear(); regsKilled.clear(); regMasks.clear(); - regsLiveInButUnused.clear(); MBBInfoMap.clear(); return foundErrors; @@ -526,9 +525,11 @@ void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { void MachineVerifier::visitMachineFunctionBefore() { lastIndex = SlotIndex(); - regsReserved = MRI->getReservedRegs(); + regsReserved = MRI->reservedRegsFrozen() ? MRI->getReservedRegs() + : TRI->getReservedRegs(*MF); - markReachable(&MF->front()); + if (!MF->empty()) + markReachable(&MF->front()); // Build a set of the basic blocks in the function. FunctionBlocks.clear(); @@ -548,7 +549,8 @@ void MachineVerifier::visitMachineFunctionBefore() { // Check that the register use lists are sane. MRI->verifyUseLists(); - verifyStackFrame(); + if (!MF->empty()) + verifyStackFrame(); } // Does iterator point to a and b as the first two elements? @@ -572,7 +574,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && MBB->getIterator() != MBB->getParent()->begin()) { - report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); + report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); } } } @@ -752,11 +754,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.insert(*SubRegs); } } - regsLiveInButUnused = regsLive; const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); - for (int I = PR.find_first(); I>0; I = PR.find_next(I)) { + for (unsigned I : PR.set_bits()) { for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) regsLive.insert(*SubRegs); @@ -908,6 +909,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } } + // Generic loads and stores must have a single MachineMemOperand + // describing that access. 
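// Context for the check that follows: in GlobalISel, a generic G_LOAD/G_STORE
// carries its size, alignment and ordering only in the attached
// MachineMemOperand, so the verifier rejects generic memory instructions that
// do not have exactly one MMO. A hypothetical builder-side sketch of
// producing a well-formed G_LOAD (4-byte aligned load; names illustrative):
//
//   MachineMemOperand *MMO = MF.getMachineMemOperand(
//       MachinePointerInfo(), MachineMemOperand::MOLoad,
//       /*Size=*/4, /*Alignment=*/4);
//   BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::G_LOAD), DstReg)
//       .addReg(AddrReg)
//       .addMemOperand(MMO);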
+ if ((MI->getOpcode() == TargetOpcode::G_LOAD || + MI->getOpcode() == TargetOpcode::G_STORE) && + !MI->hasOneMemOperand()) + report("Generic instruction accessing memory must have one mem operand", + MI); + StringRef ErrorInfo; if (!TII->verifyInstruction(*MI, ErrorInfo)) report(ErrorInfo.data(), MI); @@ -1256,8 +1265,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Both use and def operands can read a register. if (MO->readsReg()) { - regsLiveInButUnused.erase(Reg); - if (MO->isKill()) addRegWithSubRegs(regsKilled, Reg); @@ -2020,6 +2027,8 @@ namespace { void MachineVerifier::verifyStackFrame() { unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u) + return; SmallVector<StackStateOfBB, 8> SPState; SPState.resize(MF->getNumBlockIDs()); @@ -2047,23 +2056,14 @@ void MachineVerifier::verifyStackFrame() { // Update stack state by checking contents of MBB. for (const auto &I : *MBB) { if (I.getOpcode() == FrameSetupOpcode) { - // The first operand of a FrameOpcode should be i32. - int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - if (BBState.ExitIsSetup) report("FrameSetup is after another FrameSetup", &I); - BBState.ExitValue -= Size; + BBState.ExitValue -= TII->getFrameTotalSize(I); BBState.ExitIsSetup = true; } if (I.getOpcode() == FrameDestroyOpcode) { - // The first operand of a FrameOpcode should be i32. - int Size = I.getOperand(0).getImm(); - assert(Size >= 0 && - "Value should be non-negative in FrameSetup and FrameDestroy.\n"); - + int Size = TII->getFrameTotalSize(I); if (!BBState.ExitIsSetup) report("FrameDestroy is not after a FrameSetup", &I); int AbsSPAdj = BBState.ExitValue < 0 ? 
-BBState.ExitValue : diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp index 2a8531f337a0..76ad668104b4 100644 --- a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp +++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp @@ -23,7 +23,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "phi-opt" +#define DEBUG_TYPE "opt-phis" STATISTIC(NumPHICycles, "Number of PHI cycles replaced"); STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); @@ -59,7 +59,7 @@ namespace { char OptimizePHIs::ID = 0; char &llvm::OptimizePHIsID = OptimizePHIs::ID; -INITIALIZE_PASS(OptimizePHIs, "opt-phis", +INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE, "Optimize machine instruction PHIs", false, false) bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp index c67a25b888bf..9c898fa40d7e 100644 --- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp +++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,7 @@ #include <algorithm> using namespace llvm; -#define DEBUG_TYPE "phielim" +#define DEBUG_TYPE "phi-node-elimination" static cl::opt<bool> DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false), @@ -112,11 +112,11 @@ STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; char& llvm::PHIEliminationID = PHIElimination::ID; -INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_END(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp index ad9166f1ed23..00e72971a01e 100644 --- a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp +++ b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp @@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) { .addImm(FirstActualI->getOpcode()); for (auto &MO : FirstActualI->operands()) - MIB.addOperand(MO); + MIB.add(MO); FirstActualI->eraseFromParent(); MF.ensureAlignment(4); diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp index 6081916a6a82..f2249f9e37e0 100644 --- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -200,7 +200,7 @@ namespace { char &llvm::PostRASchedulerID = PostRAScheduler::ID; -INITIALIZE_PASS(PostRAScheduler, "post-RA-sched", +INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE, "Post RA top-down list latency scheduler", false, false) SchedulePostRATDList::SchedulePostRATDList( @@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dumpSchedule - dump the scheduled Sequence. 
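// The hunk below applies the dump-method convention used throughout this
// patch: dump functions are compiled only for assert or LLVM_ENABLE_DUMP
// builds and tagged LLVM_DUMP_METHOD (noinline plus "used"), so they remain
// callable from a debugger despite having no in-tree callers. Skeleton:
//
//   #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
//   LLVM_DUMP_METHOD void SomeScheduler::dump() const { /* print state */ }
//   #endif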
-void SchedulePostRATDList::dumpSchedule() const { +LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const { for (unsigned i = 0, e = Sequence.size(); i != e; i++) { if (SUnit *SU = Sequence[i]) SU->dump(this); @@ -367,7 +367,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(&MBB); + Scheduler.fixupKills(MBB); } return true; diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index d27ea2f51867..0118580a626a 100644 --- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -20,7 +20,7 @@ using namespace llvm; -#define DEBUG_TYPE "processimplicitdefs" +#define DEBUG_TYPE "processimpdefs" namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def @@ -51,9 +51,7 @@ public: char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; -INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", - "Process Implicit Definitions", false, false) -INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", +INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE, "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 5fca7fa5536b..a9813e534c5f 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "pei" +#define DEBUG_TYPE "prologepilog" typedef SmallVector<MachineBasicBlock *, 4> MBBVector; static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, @@ -60,19 +60,8 @@ namespace { class PEI : public MachineFunctionPass { public: static char ID; - explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) { + PEI() : MachineFunctionPass(ID) { initializePEIPass(*PassRegistry::getPassRegistry()); - - if (TM && (!TM->usesPhysRegsForPEI())) { - SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, - unsigned &, unsigned &, const MBBVector &, - const MBBVector &) {}; - ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {}; - } else { - SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; - ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs; - UsesCalleeSaves = true; - } } void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -140,18 +129,17 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1), cl::desc("Warn for stack size bigger than the given" " number")); -INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", - false, false) +INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false, + false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_TM_PASS_END(PEI, "prologepilog", - "Prologue/Epilogue Insertion & Frame Finalization", - false, false) +INITIALIZE_PASS_END(PEI, DEBUG_TYPE, + "Prologue/Epilogue Insertion & Frame Finalization", false, + false) -MachineFunctionPass * -llvm::createPrologEpilogInserterPass(const TargetMachine *TM) { - return new PEI(TM); +MachineFunctionPass *llvm::createPrologEpilogInserterPass() { + return new PEI(); } STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); @@ 
-174,6 +162,20 @@ typedef SmallSetVector<int, 8> StackObjSet; /// frame indexes with appropriate references. /// bool PEI::runOnMachineFunction(MachineFunction &Fn) { + if (!SpillCalleeSavedRegisters) { + const TargetMachine &TM = Fn.getTarget(); + if (!TM.usesPhysRegsForPEI()) { + SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, + unsigned &, unsigned &, const MBBVector &, + const MBBVector &) {}; + ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {}; + } else { + SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; + ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs; + UsesCalleeSaves = true; + } + } + const Function* F = Fn.getFunction(); const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); @@ -265,11 +267,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) { std::vector<MachineBasicBlock::iterator> FrameSDOps; for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" - " instructions should have a single immediate argument!"); - unsigned Size = I->getOperand(0).getImm(); + if (TII.isFrameInstr(*I)) { + unsigned Size = TII.getFrameSize(*I); if (Size > MaxCallFrameSize) MaxCallFrameSize = Size; AdjustsStack = true; FrameSDOps.push_back(I); @@ -280,6 +279,9 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) { AdjustsStack = true; } + assert(!MFI.isMaxCallFrameSizeComputed() || + (MFI.getMaxCallFrameSize() == MaxCallFrameSize && + MFI.adjustsStack() == AdjustsStack)); MFI.setAdjustsStack(AdjustsStack); MFI.setMaxCallFrameSize(MaxCallFrameSize); @@ -336,7 +338,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, return; const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); + const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs(); std::vector<CalleeSavedInfo> CSI; for (unsigned i = 0; CSRegs[i]; ++i) { @@ -376,22 +378,22 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, FixedSlot->Reg != Reg) ++FixedSlot; + unsigned Size = RegInfo->getSpillSize(*RC); if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. - unsigned Align = RC->getAlignment(); + unsigned Align = RegInfo->getSpillAlignment(*RC); unsigned StackAlign = TFI->getStackAlignment(); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. Align = std::min(Align, StackAlign); - FrameIdx = MFI.CreateStackObject(RC->getSize(), Align, true); + FrameIdx = MFI.CreateStackObject(Size, Align, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. 
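// Aside on the PrologEpilogInserter hunks above: since the pass is now
// constructed without a TargetMachine, the spill/scavenge callbacks can no
// longer be chosen in the constructor and are bound lazily on the first
// runOnMachineFunction. The pattern, reduced to a sketch (names are
// illustrative, not the in-tree members):
#include <functional>
struct LazyPEISketch {
  std::function<void()> SpillCSRs; // empty until the first run
  void run(bool UsesPhysRegsForPEI) {
    if (!SpillCSRs) { // bind once, now that the target is known
      if (UsesPhysRegsForPEI)
        SpillCSRs = [] { /* real callee-saved spilling */ };
      else
        SpillCSRs = [] { /* no-op: virtual-register targets skip it */ };
    }
    SpillCSRs();
  }
};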
- FrameIdx = - MFI.CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset); } CS.setFrameIdx(FrameIdx); @@ -448,12 +450,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { for (MachineBasicBlock *MBB : Visited) { MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. - if (!MBB->isLiveIn(Reg)) + if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } } @@ -764,6 +767,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { } else if (MaxCSFrameIndex >= MinCSFrameIndex) { // Be careful about underflow in comparisons agains MinCSFrameIndex. for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) { + if (MFI.isDeadObjectIndex(i)) + continue; + unsigned Align = MFI.getObjectAlignment(i); // Adjust to alignment boundary Offset = alignTo(Offset, Align, Skew); @@ -1049,8 +1055,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); - unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); - unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); if (RS && FrameIndexEliminationScavenging) RS->enterBasicBlock(*BB); @@ -1059,11 +1063,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { - if (I->getOpcode() == FrameSetupOpcode || - I->getOpcode() == FrameDestroyOpcode) { - InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + if (TII.isFrameInstr(*I)) { + InsideCallSequence = TII.isFrameSetup(*I); SPAdj += TII.getSPAdjust(*I); - I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); continue; } @@ -1237,4 +1239,6 @@ doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) { ++I; } } + + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); } diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp index 804a4c3dad66..b29e62bf1aa3 100644 --- a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp +++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp @@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {} PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { - O << PSVNames[Kind]; + if (Kind < TargetCustom) + O << PSVNames[Kind]; + else + O << "TargetCustom" << Kind; } bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp index a558e371ad4c..a87fed3a687e 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -176,8 +176,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg, for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); Q.collectInterferingVRegs(); - if (Q.seenUnspillableVReg()) - return false; for (unsigned i = Q.interferingVRegs().size(); i; --i) { LiveInterval *Intf = Q.interferingVRegs()[i - 1]; if (!Intf->isSpillable() 
|| Intf->weight > VirtReg.weight) diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp index fd759bc372b2..c606b7b83310 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp @@ -212,8 +212,9 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { return SS; // Already has space allocated? // Allocate a new stack object for this spill location... - int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(Size, Align); // Assign the slot. StackSlotForVirtReg[VirtReg] = FrameIdx; @@ -304,19 +305,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveDbgValueMap[LRI->VirtReg]; for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) { MachineInstr *DBG = LRIDbgValues[li]; - const MDNode *Var = DBG->getDebugVariable(); - const MDNode *Expr = DBG->getDebugExpression(); - bool IsIndirect = DBG->isIndirectDebugValue(); - uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0; - DebugLoc DL = DBG->getDebugLoc(); - assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) && - "Expected inlined-at fields to agree"); - MachineInstr *NewDV = - BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(FI) - .addImm(Offset) - .addMetadata(Var) - .addMetadata(Expr); + MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI); assert(NewDV->getParent() == MBB && "dangling parent pointer"); (void)NewDV; DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV); diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp index c47cfb1b986f..47d726f6da7a 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -29,8 +29,10 @@ #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" @@ -125,6 +127,7 @@ class RAGreedy : public MachineFunctionPass, MachineBlockFrequencyInfo *MBFI; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; + MachineOptimizationRemarkEmitter *ORE; EdgeBundles *Bundles; SpillPlacement *SpillPlacer; LiveDebugVariables *DebugVars; @@ -282,8 +285,7 @@ class RAGreedy : public MachineFunctionPass, // Set B[i] = C for every live bundle where B[i] was NoCand. unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) { unsigned Count = 0; - for (int i = LiveBundles.find_first(); i >= 0; - i = LiveBundles.find_next(i)) + for (unsigned i : LiveBundles.set_bits()) if (B[i] == NoCand) { B[i] = C; Count++; @@ -419,6 +421,20 @@ private: void collectHintInfo(unsigned, HintsInfo &); bool isUnusedCalleeSavedReg(unsigned PhysReg) const; + + /// Compute and report the number of spills and reloads for a loop. + void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads, + unsigned &FoldedReloads, unsigned &Spills, + unsigned &FoldedSpills); + + /// Report the number of spills and reloads for each loop. 
+ void reportNumberOfSplillsReloads() {
+ for (MachineLoop *L : *Loops) {
+ unsigned Reloads, FoldedReloads, Spills, FoldedSpills;
+ reportNumberOfSplillsReloads(L, Reloads, FoldedReloads, Spills,
+ FoldedSpills);
+ }
+ }
};
} // end anonymous namespace

@@ -439,6 +455,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_END(RAGreedy, "greedy",
"Greedy Register Allocator", false, false)

@@ -490,6 +507,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveRegMatrix>();
AU.addRequired<EdgeBundles>();
AU.addRequired<SpillPlacement>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

@@ -679,7 +697,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
MCRegUnitIterator Units(PhysReg, TRI);
for (; Units.isValid(); ++Units) {
// Instantiate a "subquery", not to be confused with the Queries array.
- LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+ LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);
if (subQ.checkInterference())
break;
}
@@ -830,7 +848,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
SmallVector<LiveInterval*, 8> Intfs;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
- assert(Q.seenAllInterferences() && "Didn't check all interfererences.");
+ // We usually have the interfering VRegs cached so collectInterferingVRegs()
+ // should be fast; we may need to recalculate when different physregs
+ // overlap the same register unit and different SubRanges were queried
+ // against it.
+ Q.collectInterferingVRegs();
ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
Intfs.append(IVR.begin(), IVR.end());
}
@@ -1139,9 +1161,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
}

DEBUG({
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
- dbgs() << " EB#" << i;
+ for (int i : Cand.LiveBundles.set_bits())
+ dbgs() << " EB#" << i;
dbgs() << ".\n";
});
return true;
@@ -1459,8 +1480,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
DEBUG({
dbgs() << ", total = ";
MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
+ for (int i : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << i;
dbgs() << ".\n";
});
@@ -2611,6 +2631,69 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
return 0;
}

+void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
+ unsigned &FoldedReloads,
+ unsigned &Spills,
+ unsigned &FoldedSpills) {
+ Reloads = 0;
+ FoldedReloads = 0;
+ Spills = 0;
+ FoldedSpills = 0;
+
+ // Sum up the spills and reloads in subloops.
+ for (MachineLoop *SubLoop : *L) { + unsigned SubReloads; + unsigned SubFoldedReloads; + unsigned SubSpills; + unsigned SubFoldedSpills; + + reportNumberOfSplillsReloads(SubLoop, SubReloads, SubFoldedReloads, + SubSpills, SubFoldedSpills); + Reloads += SubReloads; + FoldedReloads += SubFoldedReloads; + Spills += SubSpills; + FoldedSpills += SubFoldedSpills; + } + + const MachineFrameInfo &MFI = MF->getFrameInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + int FI; + + for (MachineBasicBlock *MBB : L->getBlocks()) + // Handle blocks that were not included in subloops. + if (Loops->getLoopFor(MBB) == L) + for (MachineInstr &MI : *MBB) { + const MachineMemOperand *MMO; + + if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) + ++Reloads; + else if (TII->hasLoadFromStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedReloads; + else if (TII->isStoreToStackSlot(MI, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++Spills; + else if (TII->hasStoreToStackSlot(MI, MMO, FI) && + MFI.isSpillSlotObjectIndex(FI)) + ++FoldedSpills; + } + + if (Reloads || FoldedReloads || Spills || FoldedSpills) { + using namespace ore; + MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload", + L->getStartLoc(), L->getHeader()); + if (Spills) + R << NV("NumSpills", Spills) << " spills "; + if (FoldedSpills) + R << NV("NumFoldedSpills", FoldedSpills) << " folded spills "; + if (Reloads) + R << NV("NumReloads", Reloads) << " reloads "; + if (FoldedReloads) + R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads "; + ORE->emit(R << "generated in loop"); + } +} + bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n" << "********** Function: " << mf.getName() << '\n'); @@ -2633,6 +2716,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Indexes = &getAnalysis<SlotIndexes>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); DomTree = &getAnalysis<MachineDominatorTree>(); + ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM)); Loops = &getAnalysis<MachineLoopInfo>(); Bundles = &getAnalysis<EdgeBundles>(); @@ -2658,6 +2742,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { allocatePhysRegs(); tryHintsRecoloring(); postOptimization(); + reportNumberOfSplillsReloads(); releaseMemory(); return true; diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp index 101b30bf3b65..3b5964eef55e 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -1,4 +1,4 @@ -//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===// +//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===// // // The LLVM Compiler Infrastructure // @@ -29,34 +29,61 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegAllocPBQP.h" #include "RegisterCoalescer.h" #include "Spiller.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include 
"llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PBQP/Graph.h" +#include "llvm/CodeGen/PBQP/Solution.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" +#include "llvm/CodeGen/RegAllocPBQP.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstddef> #include <limits> +#include <map> #include <memory> #include <queue> #include <set> #include <sstream> +#include <string> +#include <system_error> +#include <tuple> #include <vector> +#include <utility> using namespace llvm; @@ -86,7 +113,6 @@ namespace { /// Programming problems. class RegAllocPBQP : public MachineFunctionPass { public: - static char ID; /// Construct a PBQP register allocator. @@ -113,7 +139,6 @@ public: } private: - typedef std::map<const LiveInterval*, unsigned> LI2NodeMap; typedef std::vector<const LiveInterval*> Node2LIMap; typedef std::vector<unsigned> AllowedSet; @@ -187,7 +212,6 @@ public: /// @brief Add interference edges between overlapping vregs. class Interference : public PBQPRAConstraint { private: - typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr; typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey; typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache; @@ -276,7 +300,6 @@ private: } public: - void apply(PBQPRAGraph &G) override { // The following is loosely based on the linear scan algorithm introduced in // "Linear Scan Register Allocation" by Poletto and Sarkar. This version @@ -363,7 +386,6 @@ public: } private: - // Create an Interference edge and add it to the graph, unless it is // a null matrix, meaning the nodes' allowed registers do not have any // interference. This case occurs frequently between integer and floating @@ -372,7 +394,6 @@ private: bool createInterferenceEdge(PBQPRAGraph &G, PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId, IMatrixCache &C) { - const TargetRegisterInfo &TRI = *G.getMetadata().MF.getSubtarget().getRegisterInfo(); const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs(); @@ -409,7 +430,6 @@ private: } }; - class Coalescing : public PBQPRAConstraint { public: void apply(PBQPRAGraph &G) override { @@ -421,7 +441,6 @@ public: // gives the Ok. for (const auto &MBB : MF) { for (const auto &MI : MBB) { - // Skip not-coalescable or already coalesced copies. if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg()) continue; @@ -479,7 +498,6 @@ public: } private: - void addVirtRegCoalesce( PBQPRAGraph::RawMatrix &CostMat, const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1, @@ -496,14 +514,15 @@ private: } } } - }; -} // End anonymous namespace. +} // end anonymous namespace // Out-of-line destructor/anchor for PBQPRAConstraint. 
-PBQPRAConstraint::~PBQPRAConstraint() {}
+PBQPRAConstraint::~PBQPRAConstraint() = default;
+
void PBQPRAConstraint::anchor() {}
+
void PBQPRAConstraintList::anchor() {}

void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
@@ -554,7 +573,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
const MachineFunction &MF) {
- const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; CSR[i] != 0; ++i)
if (TRI.regsOverlap(reg, CSR[i]))
return true;
@@ -777,7 +796,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {

// If there are non-empty intervals allocate them using pbqp.
if (!VRegsToAlloc.empty()) {
-
const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
llvm::make_unique<PBQPRAConstraintList>();
@@ -840,7 +858,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,
});
}

-void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
for (auto NId : nodeIds()) {
const Vector &Costs = getNodeCosts(NId);
assert(Costs.getLength() != 0 && "Empty vector in graph.");
@@ -861,7 +880,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
}
}

-LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const {
+ dump(dbgs());
+}
+#endif

void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
OS << "graph {\n";
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
index ece44c28e9ed..855aa37ff3c3 100644
--- a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {

DEBUG(dbgs() << "Clobbered Registers: ");

- for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
- if (MRI->isPhysRegModified(PReg, true))
- RegMask[PReg / 32] &= ~(1u << PReg % 32);
+ const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
+ auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
+ RegMask[Reg / 32] &= ~(1u << Reg % 32);
+ };
+ // Scan all the physical registers. When a register is defined in the
+ // current function, set it and all the aliasing registers as defined in
+ // the regmask.
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ // If a register is in the UsedPhysRegsMask set then mark it as defined.
+ // All its aliases will also be in the set, so we can skip setting
+ // as defined all the aliases here.
+ if (UsedPhysRegsMask.test(PReg)) {
+ SetRegAsDefined(PReg);
+ continue;
+ }
+ // If a register is defined by an instruction mark it as defined together
+ // with all its aliases.
+ if (!MRI->def_empty(PReg)) { + for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI) + SetRegAsDefined(*AI); + } + } if (!TargetFrameLowering::isSafeForNoCSROpt(F)) { const uint32_t *CallPreservedMask = diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp index 178fa18ac5a6..82a3bd9a0bd1 100644 --- a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -1,4 +1,4 @@ -//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===// +//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===// // // The LLVM Compiler Infrastructure // @@ -14,12 +14,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; @@ -29,8 +39,7 @@ static cl::opt<unsigned> StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"), cl::desc("Limit all regclasses to N registers")); -RegisterClassInfo::RegisterClassInfo() - : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {} +RegisterClassInfo::RegisterClassInfo() = default; void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { bool Update = false; @@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { // Does this MF have different CSRs? assert(TRI && "no register info set"); - const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); - if (Update || CSR != CalleeSaved) { - // Build a CSRNum map. Every CSR alias gets an entry pointing to the last + + // Get the callee saved registers. + const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs(); + if (Update || CSR != CalleeSavedRegs) { + // Build a CSRAlias map. Every CSR alias saves the last // overlapping CSR. - CSRNum.clear(); - CSRNum.resize(TRI->getNumRegs(), 0); - for (unsigned N = 0; unsigned Reg = CSR[N]; ++N) - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ... + CalleeSavedAliases.resize(TRI->getNumRegs(), 0); + for (const MCPhysReg *I = CSR; *I; ++I) + for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) + CalleeSavedAliases[*AI] = *I; + Update = true; } - CalleeSaved = CSR; + CalleeSavedRegs = CSR; // Different reserved registers? const BitVector &RR = MF->getRegInfo().getReservedRegs(); @@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CSRNum[PhysReg]) + if (CalleeSavedAliases[PhysReg]) // PhysReg aliases a CSR, save it for later. 
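// One detail worth spelling out about the RegUsageInfoCollector.cpp hunk
// above: in a register mask a *set* bit means "preserved", so marking a
// register as defined means *clearing* its bit. Hypothetical helpers making
// the layout explicit (32 registers per mask word, as in the lambda above):
#include <cstdint>
static void setRegClobberedSketch(uint32_t *RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32)); // cleared bit = clobbered
}
static bool isRegPreservedSketch(const uint32_t *RegMask, unsigned Reg) {
  return (RegMask[Reg / 32] >> (Reg % 32)) & 1u; // set bit = preserved
}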
CSRAlias.push_back(PhysReg); else { @@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { } } RCI.NumRegs = N + CSRAlias.size(); - assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); + assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); // CSR aliases go after the volatile registers, preserve the target's order. for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { @@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { const TargetRegisterClass *RC = nullptr; unsigned NumRCUnits = 0; - for (TargetRegisterInfo::regclass_iterator - RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) { - const int *PSetID = TRI->getRegClassPressureSets(*RI); + for (const TargetRegisterClass *C : TRI->regclasses()) { + const int *PSetID = TRI->getRegClassPressureSets(C); for (; *PSetID != -1; ++PSetID) { if ((unsigned)*PSetID == Idx) break; @@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { // Found a register class that counts against this pressure set. // For efficiency, only compute the set order for the largest set. - unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit; + unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit; if (!RC || NUnits > NumRCUnits) { - RC = *RI; + RC = C; NumRCUnits = NUnits; } } diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp index 4bb3c229afc5..7b3a5d5c5ff7 100644 --- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -189,6 +190,9 @@ namespace { /// This returns true if an interval was modified. bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); + /// We found a copy which can be moved to its less frequent predecessor. + bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI); + /// If the source of a copy is defined by a /// trivial computation, replace the copy by rematerialize the definition. bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, @@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, VNInfo *ASubValNo = SA.getVNInfoAt(AIdx); assert(ASubValNo != nullptr); - LaneBitmask AMask = SA.LaneMask; - for (LiveInterval::SubRange &SB : IntB.subranges()) { - LaneBitmask BMask = SB.LaneMask; - LaneBitmask Common = BMask & AMask; - if (Common.none()) - continue; - - DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask) - << " into " << PrintLaneMask(Common) << '\n'); - LaneBitmask BRest = BMask & ~AMask; - LiveInterval::SubRange *CommonRange; - if (BRest.any()) { - SB.LaneMask = BRest; - DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) - << '\n'); - // Duplicate SubRange for newly merged common stuff. - CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); - } else { - // We van reuse the L SubRange. 
-          SB.LaneMask = Common;
-          CommonRange = &SB;
-        }
-        LiveRange RangeCopy(SB, Allocator);
-
-        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
-        assert(BSubValNo->def == CopyIdx);
-        BSubValNo->def = ASubValNo->def;
-        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
-        AMask &= ~BMask;
-      }
-      if (AMask.any()) {
-        DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
-        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
-        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
-        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
-      }
+      IntB.refineSubRanges(Allocator, SA.LaneMask,
+          [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) {
+        VNInfo *BSubValNo = SR.empty()
+          ? SR.getNextValue(CopyIdx, Allocator)
+          : SR.getVNInfoAt(CopyIdx);
+        assert(BSubValNo != nullptr);
+        addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo);
+      });
     }
   }
 
@@ -861,6 +837,184 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
   return true;
 }
 
+/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
+/// predecessor of BB2, and if B is not redefined on the way from A = B
+/// in BB0 to B = A in BB2, B = A in BB2 is partially redundant if the
+/// execution goes through the path from BB0 to BB2. We may move B = A
+/// to the predecessor without such reversed copy.
+/// So we will transform the program from:
+///
+///   BB0:                        BB1:
+///     A = B;                      ...
+///     ...                         ...
+///        \        /                /
+///               BB2:
+///                 ...
+///                 B = A;
+///
+/// to:
+///
+///   BB0:                        BB1:
+///     A = B;                      ...
+///     ...                         B = A;
+///        \        /                /
+///               BB2:
+///                 ...
+///
+/// A special case is when BB0 and BB2 are the same BB which is the only
+/// BB in a loop:
+///   BB1:
+///        ...
+///   BB0/BB2:  ----
+///        B = A;   |
+///        ...      |
+///        A = B;   |
+///         |--------
+///         |
+/// We may hoist B = A from BB0/BB2 to BB1.
+///
+/// The major preconditions for correctness to remove such partial
+/// redundancy include:
+/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
+///    the PHI is defined by the reversed copy A = B in BB0.
+/// 2. No B is referenced from the start of BB2 to B = A.
+/// 3. No B is defined from A = B to the end of BB0.
+/// 4. BB1 has only one successor.
+///
+/// 2 and 4 implicitly ensure B is not live at the end of BB1.
+/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a
+/// colder place, which not only prevents an endless loop but also makes
+/// sure the movement of the copy is beneficial.
+bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
+                                                MachineInstr &CopyMI) {
+  assert(!CP.isPhys());
+  if (!CopyMI.isFullCopy())
+    return false;
+
+  MachineBasicBlock &MBB = *CopyMI.getParent();
+  if (MBB.isEHPad())
+    return false;
+
+  if (MBB.pred_size() != 2)
+    return false;
+
+  LiveInterval &IntA =
+      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+      LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // A is defined by PHI at the entry of MBB.
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+  VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
+  assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+  if (!AValNo->isPHIDef())
+    return false;
+
+  // No B is referenced before CopyMI in MBB.
+  if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
+    return false;
+
+  // MBB has two predecessors: one contains A = B so no copy will be inserted
+  // for it. The other one will have a copy moved from MBB.
+  bool FoundReverseCopy = false;
+  MachineBasicBlock *CopyLeftBB = nullptr;
+  for (MachineBasicBlock *Pred : MBB.predecessors()) {
+    VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
+    if (!DefMI || !DefMI->isFullCopy()) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    // Check DefMI is a reverse copy and it is in BB Pred.
+    if (DefMI->getOperand(0).getReg() != IntA.reg ||
+        DefMI->getOperand(1).getReg() != IntB.reg ||
+        DefMI->getParent() != Pred) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    // If there is any other def of B after DefMI and before the end of Pred,
+    // we need to keep the copy of B = A at the end of Pred if we remove
+    // B = A from MBB.
+    bool ValB_Changed = false;
+    for (auto VNI : IntB.valnos) {
+      if (VNI->isUnused())
+        continue;
+      if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
+        ValB_Changed = true;
+        break;
+      }
+    }
+    if (ValB_Changed) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    FoundReverseCopy = true;
+  }
+
+  // If no reverse copy is found in predecessors, nothing to do.
+  if (!FoundReverseCopy)
+    return false;
+
+  // If CopyLeftBB is nullptr, it means every predecessor of MBB contains a
+  // reverse copy, and CopyMI can be removed trivially if only IntA/IntB is
+  // updated. If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB
+  // and update IntA/IntB.
+  //
+  // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so
+  // MBB is hotter than CopyLeftBB.
+  if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
+    return false;
+
+  // Now ok to move copy.
+  if (CopyLeftBB) {
+    DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#"
+                 << CopyLeftBB->getNumber() << '\t' << CopyMI);
+
+    // Insert new copy to CopyLeftBB.
+    auto InsPos = CopyLeftBB->getFirstTerminator();
+    MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
+                                      TII->get(TargetOpcode::COPY), IntB.reg)
+                                  .addReg(IntA.reg);
+    SlotIndex NewCopyIdx =
+        LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
+    IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+    for (LiveInterval::SubRange &SR : IntB.subranges())
+      SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+  } else {
+    DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#"
+                 << MBB.getNumber() << '\t' << CopyMI);
+  }
+
+  // Remove CopyMI.
+  // Note: This is fine to remove the copy before updating the live-ranges.
+  // While updating the live-ranges, we only look at slot indices and
+  // never go back to the instruction.
+  LIS->RemoveMachineInstrFromMaps(CopyMI);
+  CopyMI.eraseFromParent();
+
+  // Update the liveness.
+  SmallVector<SlotIndex, 8> EndPoints;
+  VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead();
+  LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(),
+                  &EndPoints);
+  BValNo->markUnused();
+  // Extend IntB to the EndPoints of its original live interval.
+  LIS->extendToIndices(IntB, EndPoints);
+
+  // Now, do the same for its subranges.
+  for (LiveInterval::SubRange &SR : IntB.subranges()) {
+    EndPoints.clear();
+    VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead();
+    assert(BValNo && "All sublanes should be live");
+    LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints);
+    BValNo->markUnused();
+    LIS->extendToIndices(SR, EndPoints);
+  }
+
+  // Finally, update the live-range of IntA.
+  shrinkToUses(&IntA);
+  return true;
+}
+
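The liveness update at the end of removePartialRedundancy() follows the usual delete-a-def idiom: pruneValue() cuts the range at the vanished definition and records where liveness used to end, and extendToIndices() regrows the range from the definitions that remain (here, the copy newly created in CopyLeftBB). A condensed sketch of the pattern, with placeholder names (LR, DefIdx, and DeadVNI are not identifiers from this patch):

    // Sketch of the prune-then-extend live-range update used above.
    SmallVector<SlotIndex, 8> EndPoints;
    LIS->pruneValue(LR, DefIdx.getRegSlot(), &EndPoints); // cut at the dead def
    DeadVNI->markUnused();                                // value has no def now
    LIS->extendToIndices(LR, EndPoints); // regrow from the surviving defs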
 /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
 /// defining a subregister.
 static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
@@ -1290,7 +1444,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
 
     // If SrcReg wasn't read, it may still be the case that DstReg is live-in
     // because SrcReg is a sub-register.
-    if (DstInt && !Reads && SubIdx)
+    if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue())
       Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
 
     // Replace SrcReg with DstReg in all UseMI operands.
@@ -1486,6 +1640,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
       }
     }
 
+    // Try and see if we can partially eliminate the copy by moving it to
+    // its predecessor.
+    if (!CP.isPartial() && !CP.isPhys())
+      if (removePartialRedundancy(CP, *CopyMI))
+        return true;
+
     // Otherwise, we are unable to join the intervals.
     DEBUG(dbgs() << "\tInterference!\n");
     Again = true;  // May be possible to coalesce later.
@@ -1583,6 +1743,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
         return false;
       }
     }
+
+    // We must also check for overlaps with regmask clobbers.
+    BitVector RegMaskUsable;
+    if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) &&
+        !RegMaskUsable.test(DstReg)) {
+      DEBUG(dbgs() << "\t\tRegMask interference\n");
+      return false;
+    }
   }
 
   // Skip any value computations, we are not adding new values to the
@@ -1636,14 +1804,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
         DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
         return false;
       }
-
-      // We must also check for clobbers caused by regmasks.
-      for (const auto &MO : MI->operands()) {
-        if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
-          DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
-          return false;
-        }
-      }
     }
   }
 
@@ -2506,11 +2666,17 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
   // Look for values being erased.
   bool DidPrune = false;
   for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
-    if (Vals[i].Resolution != CR_Erase)
+    // We should trigger in all cases in which eraseInstrs() does something.
+    // To match what eraseInstrs() is doing, also handle values that are
+    // kept but whose erasable implicit def has been pruned.
+    if (Vals[i].Resolution != CR_Erase &&
+        (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef ||
+         !Vals[i].Pruned))
       continue;
 
     // Check subranges at the point where the copy will be removed.
     SlotIndex Def = LR.getValNumInfo(i)->def;
+    // Print message so mismatches with eraseInstrs() can be diagnosed.
+    DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n');
 
     for (LiveInterval::SubRange &S : LI.subranges()) {
       LiveQueryResult Q = S.Query(Def);
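Both subrange-merging sites in this file (removeCopyByCommutingDef above and mergeSubRangeInto below) previously open-coded the same lane-mask splitting; they now delegate to LiveInterval::refineSubRanges(), which splits existing subranges against the mask and invokes a callback on every subrange covering part of it. A sketch of the callback contract, inferred from the two call sites in this patch (the helper names are placeholders):

    // Sketch: SR.empty() distinguishes subranges newly created for lanes
    // that had no subrange yet from pre-existing subranges overlapping Mask.
    LI.refineSubRanges(Allocator, Mask, [&](LiveInterval::SubRange &SR) {
      if (SR.empty())
        initializeNewSubRange(SR); // e.g. SR.assign(ToMerge, Allocator)
      else
        mergeIntoExisting(SR);     // e.g. joinSubRegRanges(SR, ...)
    });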
@@ -2738,39 +2904,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
                                           LaneBitmask LaneMask,
                                           CoalescerPair &CP) {
   BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
-  for (LiveInterval::SubRange &R : LI.subranges()) {
-    LaneBitmask RMask = R.LaneMask;
-    // LaneMask of subregisters common to subrange R and ToMerge.
-    LaneBitmask Common = RMask & LaneMask;
-    // There is nothing to do without common subregs.
-    if (Common.none())
-      continue;
-
-    DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
-                 << PrintLaneMask(Common) << '\n');
-    // LaneMask of subregisters contained in the R range but not in ToMerge,
-    // they have to split into their own subrange.
-    LaneBitmask LRest = RMask & ~LaneMask;
-    LiveInterval::SubRange *CommonRange;
-    if (LRest.any()) {
-      R.LaneMask = LRest;
-      DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
-      // Duplicate SubRange for newly merged common stuff.
-      CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+  LI.refineSubRanges(Allocator, LaneMask,
+      [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) {
+    if (SR.empty()) {
+      SR.assign(ToMerge, Allocator);
     } else {
-      // Reuse the existing range.
-      R.LaneMask = Common;
-      CommonRange = &R;
+      // joinSubRegRange() destroys the merged range, so we need a copy.
+      LiveRange RangeCopy(ToMerge, Allocator);
+      joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);
     }
-    LiveRange RangeCopy(ToMerge, Allocator);
-    joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
-    LaneMask &= ~RMask;
-  }
-
-  if (LaneMask.any()) {
-    DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
-    LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
-  }
+  });
 }
 
 bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
@@ -3077,7 +3220,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
       CurrList(WorkList.begin() + PrevSize, WorkList.end());
   if (copyCoalesceWorkList(CurrList))
     WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
-                               (MachineInstr*)nullptr), WorkList.end());
+                               nullptr), WorkList.end());
 }
 
 void RegisterCoalescer::coalesceLocals() {
diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
index fc84aebb14d7..c726edc88b41 100644
--- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -12,13 +12,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
   }
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD
 void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
                               const TargetRegisterInfo *TRI) {
@@ -97,6 +122,7 @@ void RegPressureTracker::dump() const {
   P.dump(TRI);
 }
 
+LLVM_DUMP_METHOD
 void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
   const char *sep = "";
   for (const PressureChange &Change : *this) {
@@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
   }
   dbgs() << '\n';
 }
+#endif
 
 void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
                                              LaneBitmask PreviousMask,
@@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const {
           MachineBasicBlock::const_iterator());
 }
 
-
 SlotIndex RegPressureTracker::getCurrSlot() const {
   MachineBasicBlock::const_iterator IdxPos =
     skipDebugInstructionsForward(CurrPos, MBB->end());
@@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
 
 static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
                                unsigned RegUnit) {
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
    return Other.RegUnit == RegUnit;
  });
  if (I == RegUnits.end())
@@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                         RegisterMaskPair Pair) {
   unsigned RegUnit = Pair.RegUnit;
   assert(Pair.LaneMask.any());
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I == RegUnits.end()) {
@@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
 
 static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                        unsigned RegUnit) {
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I == RegUnits.end()) {
@@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                            RegisterMaskPair Pair) {
   unsigned RegUnit = Pair.RegUnit;
   assert(Pair.LaneMask.any());
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I != RegUnits.end()) {
@@ -423,6 +449,8 @@ namespace {
 ///
 /// FIXME: always ignore tied opers
 class RegisterOperandsCollector {
+  friend class llvm::RegisterOperands;
+
   RegisterOperands &RegOpers;
   const TargetRegisterInfo &TRI;
   const MachineRegisterInfo &MRI;
@@ -517,11 +545,9 @@ class RegisterOperandsCollector {
         addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll()));
     }
   }
-
-  friend class llvm::RegisterOperands;
 };
 
-} // namespace
+} // end anonymous namespace
 
 void RegisterOperands::collect(const MachineInstr &MI,
                                const TargetRegisterInfo &TRI,
@@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
   assert(Pair.LaneMask.any());
 
   unsigned RegUnit = Pair.RegUnit;
-  auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
+  auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
     return Other.RegUnit == RegUnit;
   });
   LaneBitmask PrevMask;
@@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
       if (!TrackLaneMasks) {
         addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
       } else {
-        auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
-          return Other.RegUnit == Reg;
-        });
+        auto I =
+            llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
+              return Other.RegUnit == Reg;
+            });
         bool IsRedef = I != LiveUses->end();
         if (IsRedef) {
           // ignore re-defs here...
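The explicit llvm:: qualification above pins these calls to the range-based helper from ADT/STLExtras.h rather than anything reachable through argument-dependent lookup. That helper is a thin wrapper, roughly (a sketch of its shape, not the exact declaration):

    // Rough shape of llvm::find_if (see llvm/ADT/STLExtras.h).
    template <typename R, typename UnaryPredicate>
    auto find_if(R &&Range, UnaryPredicate P) -> decltype(std::begin(Range)) {
      return std::find_if(std::begin(Range), std::end(Range), P);
    }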
@@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
     if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
       int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
-      if (CritInc > 0 && CritInc <= INT16_MAX) {
+      if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
         Delta.CriticalMax = PressureChange(PSetID);
         Delta.CriticalMax.setUnitInc(CritInc);
       }
diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
index fdf741fd58f7..0635e5c0a63c 100644
--- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -15,28 +15,32 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <string>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "reg-scavenging"
 
 void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
-  for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
-    LaneBitmask UnitMask = (*RUI).second;
-    if (UnitMask.none() || (LaneMask & UnitMask).any())
-      RegUnitsAvailable.reset((*RUI).first);
-  }
+  LiveUnits.addRegMasked(Reg, LaneMask);
 }
 
 void RegScavenger::init(MachineBasicBlock &MBB) {
@@ -44,6 +48,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
+  LiveUnits.init(*TRI);
 
   assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
          "Target changed?");
@@ -51,45 +56,28 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
   // Self-initialize.
   if (!this->MBB) {
     NumRegUnits = TRI->getNumRegUnits();
-    RegUnitsAvailable.resize(NumRegUnits);
     KillRegUnits.resize(NumRegUnits);
     DefRegUnits.resize(NumRegUnits);
     TmpRegUnits.resize(NumRegUnits);
   }
   this->MBB = &MBB;
 
-  for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
-         IE = Scavenged.end(); I != IE; ++I) {
-    I->Reg = 0;
-    I->Restore = nullptr;
+  for (ScavengedInfo &SI : Scavenged) {
+    SI.Reg = 0;
+    SI.Restore = nullptr;
   }
 
-  // All register units start out unused.
-  RegUnitsAvailable.set();
-
-  // Pristine CSRs are not available.
-  BitVector PR = MF.getFrameInfo().getPristineRegs(MF);
-  for (int I = PR.find_first(); I>0; I = PR.find_next(I))
-    setRegUsed(I);
-
   Tracking = false;
 }
 
-void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) {
-  for (const auto &LI : MBB.liveins())
-    setRegUsed(LI.PhysReg, LI.LaneMask);
-}
-
 void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
   init(MBB);
-  setLiveInsUsed(MBB);
+  LiveUnits.addLiveIns(MBB);
 }
 
 void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {
   init(MBB);
-  // Merge live-ins of successors to get live-outs.
-  for (const MachineBasicBlock *Succ : MBB.successors())
-    setLiveInsUsed(*Succ);
+  LiveUnits.addLiveOuts(MBB);
 
   // Move internal iterator at the last instruction of the block.
   if (MBB.begin() != MBB.end()) {
@@ -263,36 +251,7 @@ void RegScavenger::backward() {
   assert(Tracking && "Must be tracking to determine kills and defs");
 
   const MachineInstr &MI = *MBBI;
-  // Defined or clobbered registers are available now.
-  for (const MachineOperand &MO : MI.operands()) {
-    if (MO.isRegMask()) {
-      for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd;
-           ++RU) {
-        for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
-          if (MO.clobbersPhysReg(*RURI)) {
-            RegUnitsAvailable.set(RU);
-            break;
-          }
-        }
-      }
-    } else if (MO.isReg() && MO.isDef()) {
-      unsigned Reg = MO.getReg();
-      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
-          isReserved(Reg))
-        continue;
-      addRegUnits(RegUnitsAvailable, Reg);
-    }
-  }
-  // Mark read registers as unavailable.
-  for (const MachineOperand &MO : MI.uses()) {
-    if (MO.isReg() && MO.readsReg()) {
-      unsigned Reg = MO.getReg();
-      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
-          isReserved(Reg))
-        continue;
-      removeRegUnits(RegUnitsAvailable, Reg);
-    }
-  }
+  LiveUnits.stepBackward(MI);
 
   if (MBBI == MBB->begin()) {
     MBBI = MachineBasicBlock::iterator(nullptr);
@@ -302,12 +261,9 @@ void RegScavenger::backward() {
 }
 
 bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
-  if (includeReserved && isReserved(Reg))
-    return true;
-  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
-    if (!RegUnitsAvailable.test(*RUI))
-      return true;
-  return false;
+  if (isReserved(Reg))
    return includeReserved;
+  return !LiveUnits.available(Reg);
 }
 
 unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
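The scavenger's hand-rolled regunit bookkeeping is replaced by a LiveUnits member that tracks liveness at register-unit granularity; pristine-CSR handling disappears with it because live-unit tracking starts from actual block boundaries. The backward-tracking usage pattern, as a small sketch (SomeReg is a placeholder, MBB and TRI assumed in scope):

    // Sketch: scanning a block bottom-up with LiveRegUnits, as backward() does.
    LiveRegUnits Units(*TRI);
    Units.addLiveOuts(MBB);                 // start from the block's live-outs
    for (MachineInstr &MI : llvm::reverse(MBB)) {
      Units.stepBackward(MI);               // defs die, uses become live
      if (Units.available(SomeReg))
        ; // no alias of SomeReg is live here; it could be scavenged
    }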
@@ -438,10 +394,10 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
 
   // Find an available scavenging slot with size and alignment matching
   // the requirements of the class RC.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  unsigned NeedSize = RC->getSize();
-  unsigned NeedAlign = RC->getAlignment();
+  unsigned NeedSize = TRI->getSpillSize(*RC);
+  unsigned NeedAlign = TRI->getSpillAlignment(*RC);
 
-  unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+  unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
   int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
   for (unsigned I = 0; I < Scavenged.size(); ++I) {
     if (Scavenged[I].Reg != 0)
diff --git a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
index 2f7ee8bf414c..cc32e43968bb 100644
--- a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -112,11 +112,11 @@ char RenameIndependentSubregs::ID;
 
 char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID;
 
-INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs",
+INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE,
                       "Rename Independent Subregisters", false, false)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs",
+INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE,
                     "Rename Independent Subregisters", false, false)
 
 bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
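Passing DEBUG_TYPE instead of repeating a string literal keeps the registered pass name and the -debug-only filter in sync by construction; this file's DEBUG_TYPE presumably expands to the same "rename-independent-subregs" string the macros used before. The pattern in miniature (the pass name here is hypothetical):

    #define DEBUG_TYPE "my-pass"
    // One string now serves both -debug-only=my-pass and the pass-registry
    // key; renaming the pass cannot leave one of the two stale.
    INITIALIZE_PASS(MyPass, DEBUG_TYPE, "My demonstration pass", false, false)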
diff --git a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
index 451964199ba5..3e259927ac5c 100644
--- a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -30,17 +30,23 @@ namespace {
     /// Tells whether or not this pass should emit a fallback
     /// diagnostic when it resets a function.
     bool EmitFallbackDiag;
+    /// Whether we should abort immediately instead of resetting the function.
+    bool AbortOnFailedISel;
 
   public:
     static char ID; // Pass identification, replacement for typeid
-    ResetMachineFunction(bool EmitFallbackDiag = false)
-        : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {}
+    ResetMachineFunction(bool EmitFallbackDiag = false,
+                         bool AbortOnFailedISel = false)
+        : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag),
+          AbortOnFailedISel(AbortOnFailedISel) {}
 
     StringRef getPassName() const override { return "ResetMachineFunction"; }
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       if (MF.getProperties().hasProperty(
               MachineFunctionProperties::Property::FailedISel)) {
+        if (AbortOnFailedISel)
+          report_fatal_error("Instruction selection failed");
         DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n');
         ++NumFunctionsReset;
         MF.reset();
@@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,
                 "reset machine function if ISel failed", false, false)
 
 MachineFunctionPass *
-llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) {
-  return new ResetMachineFunction(EmitFallbackDiag);
+llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false,
+                                     bool AbortOnFailedISel = false) {
+  return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel);
 }
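AbortOnFailedISel turns the ISel fallback path into a hard error: instead of wiping the half-selected function so that SelectionDAG can retry, compilation stops at the first FailedISel function. A hypothetical pipeline setup exercising the new parameter (the pass-manager variable is illustrative):

    // Sketch: fail fast instead of silently falling back to SelectionDAG.
    PM.add(createResetMachineFunctionPass(/*EmitFallbackDiag=*/true,
                                          /*AbortOnFailedISel=*/true));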
diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp
index 2b82df293c14..8584a9b7c897 100644
--- a/contrib/llvm/lib/CodeGen/SafeStack.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp
@@ -19,10 +19,12 @@
 #include "SafeStackLayout.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
@@ -50,7 +52,7 @@
 using namespace llvm;
 using namespace llvm::safestack;
 
-#define DEBUG_TYPE "safestack"
+#define DEBUG_TYPE "safe-stack"
 
 namespace llvm {
 
@@ -92,11 +94,11 @@ public:
 /// determined statically), and the unsafe stack, which contains all
 /// local variables that are accessed in ways that we can't prove to
 /// be safe.
-class SafeStack : public FunctionPass {
-  const TargetMachine *TM;
-  const TargetLoweringBase *TL;
-  const DataLayout *DL;
-  ScalarEvolution *SE;
+class SafeStack {
+  Function &F;
+  const TargetLoweringBase &TL;
+  const DataLayout &DL;
+  ScalarEvolution &SE;
 
   Type *StackPtrTy;
   Type *IntPtrTy;
@@ -171,33 +173,21 @@ class SafeStack : public FunctionPass {
                    uint64_t AllocaSize);
 
 public:
-  static char ID; // Pass identification, replacement for typeid.
-  SafeStack(const TargetMachine *TM)
-      : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
-    initializeSafeStackPass(*PassRegistry::getPassRegistry());
-  }
-  SafeStack() : SafeStack(nullptr) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<ScalarEvolutionWrapperPass>();
-  }
-
-  bool doInitialization(Module &M) override {
-    DL = &M.getDataLayout();
-
-    StackPtrTy = Type::getInt8PtrTy(M.getContext());
-    IntPtrTy = DL->getIntPtrType(M.getContext());
-    Int32Ty = Type::getInt32Ty(M.getContext());
-    Int8Ty = Type::getInt8Ty(M.getContext());
-
-    return false;
-  }
-
-  bool runOnFunction(Function &F) override;
-}; // class SafeStack
+  SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+            ScalarEvolution &SE)
+      : F(F), TL(TL), DL(DL), SE(SE),
+        StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+        IntPtrTy(DL.getIntPtrType(F.getContext())),
+        Int32Ty(Type::getInt32Ty(F.getContext())),
+        Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+  // Run the transformation on the associated function.
+  // Returns whether the function was changed.
+  bool run();
+};
 
 uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
-  uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+  uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
   if (AI->isArrayAllocation()) {
     auto C = dyn_cast<ConstantInt>(AI->getArraySize());
     if (!C)
@@ -209,11 +199,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
 
 bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
                              const Value *AllocaPtr, uint64_t AllocaSize) {
-  AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
-  const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+  AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+  const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
 
-  uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
-  ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+  uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+  ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
   ConstantRange SizeRange =
       ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
   ConstantRange AccessRange = AccessStartRange.add(SizeRange);
@@ -226,8 +216,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
                << *AllocaPtr << "\n"
                << "            Access " << *Addr << "\n"
                << "            SCEV " << *Expr
-               << " U: " << SE->getUnsignedRange(Expr)
-               << ", S: " << SE->getSignedRange(Expr) << "\n"
+               << " U: " << SE.getUnsignedRange(Expr)
+               << ", S: " << SE.getSignedRange(Expr) << "\n"
                << "            Range " << AccessRange << "\n"
                << "            AllocaRange " << AllocaRange << "\n"
                << "            " << (Safe ? "safe" : "unsafe") << "\n");
@@ -266,7 +256,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
 
     switch (I->getOpcode()) {
     case Instruction::Load: {
-      if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+      if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
                         AllocaSize))
         return false;
       break;
@@ -282,7 +272,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
         return false;
       }
 
-      if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+      if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
                         AllocaPtr, AllocaSize))
         return false;
       break;
@@ -343,7 +333,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
 }
 
 Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
-  Value *StackGuardVar = TL->getIRStackGuard(IRB);
+  Value *StackGuardVar = TL.getIRStackGuard(IRB);
   if (!StackGuardVar)
     StackGuardVar =
         F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
@@ -390,7 +380,7 @@ void SafeStack::findInsts(Function &F,
     if (!Arg.hasByValAttr())
       continue;
     uint64_t Size =
-        DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+        DL.getTypeStoreSize(Arg.getType()->getPointerElementType());
     if (IsSafeStackAlloca(&Arg, Size))
       continue;
 
@@ -451,7 +441,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
   IRBuilder<> IRBFail(CheckTerm);
   // FIXME: respect -fsanitize-trap / -ftrap-function here?
   Constant *StackChkFail = F.getParent()->getOrInsertFunction(
-      "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+      "__stack_chk_fail", IRB.getVoidTy());
   IRBFail.CreateCall(StackChkFail, {});
 }
 
@@ -476,19 +466,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
   if (StackGuardSlot) {
     Type *Ty = StackGuardSlot->getAllocatedType();
     unsigned Align =
-        std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+        std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
     SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
                   Align, SSC.getFullLiveRange());
   }
 
   for (Argument *Arg : ByValArguments) {
     Type *Ty = Arg->getType()->getPointerElementType();
-    uint64_t Size = DL->getTypeStoreSize(Ty);
+    uint64_t Size = DL.getTypeStoreSize(Ty);
     if (Size == 0)
       Size = 1; // Don't create zero-sized stack objects.
 
     // Ensure the object is properly aligned.
-    unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+    unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
                               Arg->getParamAlignment());
     SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
   }
@@ -501,7 +491,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
 
     // Ensure the object is properly aligned.
     unsigned Align =
-        std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+        std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
 
     SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
   }
@@ -539,7 +529,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
     unsigned Offset = SSL.getObjectOffset(Arg);
     Type *Ty = Arg->getType()->getPointerElementType();
 
-    uint64_t Size = DL->getTypeStoreSize(Ty);
+    uint64_t Size = DL.getTypeStoreSize(Ty);
     if (Size == 0)
       Size = 1; // Don't create zero-sized stack objects.
 
@@ -550,7 +540,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
 
     // Replace alloc with the new location.
     replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
-                      /*Deref=*/true, -Offset);
+                      /*Deref=*/false, -Offset);
     Arg->replaceAllUsesWith(NewArg);
     IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
     IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
@@ -565,7 +555,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
     if (Size == 0)
       Size = 1; // Don't create zero-sized stack objects.
 
-    replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -Offset);
+    replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/false, -Offset);
     replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
 
     // Replace uses of the alloca with the new location.
@@ -630,7 +620,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
       ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
 
     Type *Ty = AI->getAllocatedType();
-    uint64_t TySize = DL->getTypeAllocSize(Ty);
+    uint64_t TySize = DL.getTypeAllocSize(Ty);
     Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
 
     Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
@@ -638,7 +628,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
 
     // Align the SP value to satisfy the AllocaInst, type and stack alignments.
     unsigned Align = std::max(
-        std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+        std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
         (unsigned)StackAlignment);
 
     assert(isPowerOf2_32(Align));
@@ -655,7 +645,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
     if (AI->hasName() && isa<Instruction>(NewAI))
       NewAI->takeName(AI);
 
-    replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true);
+    replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/false);
     AI->replaceAllUsesWith(NewAI);
     AI->eraseFromParent();
   }
@@ -685,25 +675,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
   }
 }
 
-bool SafeStack::runOnFunction(Function &F) {
-  DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
-  if (!F.hasFnAttribute(Attribute::SafeStack)) {
-    DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
-                    " for this function\n");
-    return false;
-  }
-
-  if (F.isDeclaration()) {
-    DEBUG(dbgs() << "[SafeStack]     function definition"
-                    " is not available\n");
-    return false;
-  }
-
-  if (!TM)
-    report_fatal_error("Target machine is required");
-  TL = TM->getSubtargetImpl(F)->getTargetLowering();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+  assert(F.hasFnAttribute(Attribute::SafeStack) &&
+         "Can't run SafeStack on a function without the attribute");
+  assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
 
   ++NumFunctions;
 
@@ -736,7 +711,7 @@ bool SafeStack::run() {
     ++NumUnsafeStackRestorePointsFunctions;
 
   IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
-  UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+  UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
 
   // Load the current stack pointer (we'll also use it as a base pointer).
   // FIXME: use a dedicated register for it ?
@@ -788,14 +763,67 @@ bool SafeStack::run() {
   return true;
 }
 
+class SafeStackLegacyPass : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  SafeStackLegacyPass() : FunctionPass(ID), TM(nullptr) {
+    initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+    if (!F.hasFnAttribute(Attribute::SafeStack)) {
+      DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
+                      " for this function\n");
+      return false;
+    }
+
+    if (F.isDeclaration()) {
+      DEBUG(dbgs() << "[SafeStack]     function definition"
+                      " is not available\n");
+      return false;
+    }
+
+    TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+    auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+    if (!TL)
+      report_fatal_error("TargetLowering instance is required");
+
+    auto *DL = &F.getParent()->getDataLayout();
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+    // Compute DT and LI only for functions that have the attribute.
+    // This is only useful because the legacy pass manager doesn't let us
+    // compute analyses lazily.
+    // In the backend pipeline, nothing preserves DT before SafeStack, so we
+    // would otherwise always compute it wastefully, even if there is no
+    // function with the safestack attribute.
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+
+    ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+    return SafeStack(F, *TL, *DL, SE).run();
+  }
+};
+
 } // anonymous namespace
 
-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
-                         "Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
-                       "Safe Stack instrumentation pass", false, false)
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE,
+                      "Safe Stack instrumentation pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE,
                    "Safe Stack instrumentation pass", false, false)
 
-FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
-  return new SafeStack(TM);
-}
+FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); }
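With the FunctionPass split out, the transformation itself no longer depends on the legacy pass manager; any driver that can produce the required analyses can run it. A minimal sketch mirroring the legacy wrapper above (F, TL, TLI, and ACT are assumed to have been obtained already):

    // Sketch: driving the freestanding SafeStack implementation directly.
    DominatorTree DT(F);                     // computed only when needed
    LoopInfo LI(DT);
    ScalarEvolution SE(F, TLI, ACT, DT, LI); // local SCEV, not a pass result
    bool Changed =
        SafeStack(F, *TL, F.getParent()->getDataLayout(), SE).run();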
diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
index 7fbeaddb38e8..21f2fa497233 100644
--- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
@@ -20,9 +20,10 @@ using namespace llvm::safestack;
 
 #define DEBUG_TYPE "safestackcoloring"
 
+// Disabled by default due to PR32143.
 static cl::opt<bool> ClColoring("safe-stack-coloring",
                                 cl::desc("enable safe stack coloring"),
-                                cl::Hidden, cl::init(true));
+                                cl::Hidden, cl::init(false));
 
 const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
   const auto IT = AllocaNumbering.find(AI);
@@ -236,6 +237,7 @@ void StackColoring::calculateLiveIntervals() {
   }
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
   dbgs() << "Allocas:\n";
   for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
@@ -262,6 +264,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
     dbgs() << "  " << AllocaNo << ": " << Range << "\n";
   }
 }
+#endif
 
 void StackColoring::run() {
   DEBUG(dumpAllocas());
diff --git a/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..07b43a82ca99
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,656 @@
+//===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ---===//
+//===- intrinsics ---------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one-by-one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+  const TargetTransformInfo *TTI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+    initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "Scalarize Masked Memory Intrinsics";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+
+INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE,
+                "Scalarize unsupported masked memory intrinsics", false, false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+  return new ScalarizeMaskedMemIntrin();
+}
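Nothing in this boilerplate rewrites IR yet: the expansion helpers that follow fire only when the target's TargetTransformInfo reports the masked operation as unsupported. The gate, condensed from optimizeCallInst() near the end of the file ("CI" is the call being inspected):

    // Sketch: only illegal masked operations are scalarized; legal ones are
    // left for the backend to select directly.
    if (auto *II = dyn_cast<IntrinsicInst>(CI))
      if (II->getIntrinsicID() == Intrinsic::masked_load &&
          !TTI->isLegalMaskedLoad(CI->getType()))
        scalarizeMaskedLoad(CI); // expand into a branch-per-element chain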
+// Translate a masked load intrinsic like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+//                                <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+// cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+// else2:                                          ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  Type *EltTy = CI->getType()->getVectorElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+      VResult =
+          Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                         <16 x i1> %mask)
+// to a chain of basic blocks that store one element at a time if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//  %4 = extractelement <16 x i32> %val, i32 0
+//  %5 = getelementptr i32* %1, i32 0
+//  store i32 %4, i32* %5
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.store
+//  %6 = extractelement <16 x i1> %mask, i32 1
+//  %7 = icmp eq i1 %6, true
+//  br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1:                                      ; preds = %else
+//  %8 = extractelement <16 x i32> %val, i32 1
+//  %9 = getelementptr i32* %1, i32 1
+//  store i32 %8, i32* %9
+//  br label %else2
+//  . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock =
+        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
+//                                         <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+  Value *Ptrs = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+  unsigned VectorWidth = VecType->getNumElements();
+
+  // Shortcut if the mask is a vector of constants.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      LoadInst *Load =
+          Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+      VResult = Builder.CreateInsertElement(
+          VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
+    //  %ToLoad1 = icmp eq i1 %Mask1, true
+    //  br i1 %ToLoad1, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1),
+                                    "ToLoad" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    LoadInst *Load =
+        Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+                                          "Res" + Twine(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
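All four expansions share one control-flow skeleton: per element, split the current block, test the extracted mask bit, and branch around the scalar memory operation. Condensed to its core (a sketch; "cond.op" stands in for the cond.load/cond.store names used above, and Cmp is the mask-bit comparison built just before):

    // Per-element block surgery common to load/store/gather/scatter.
    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.op");
    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
    Instruction *OldBr = IfBlock->getTerminator();
    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); // now conditional
    OldBr->eraseFromParent(); // drop the unconditional branch from the split
    IfBlock = NewIfBlock;     // the next iteration builds in the "else" block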
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
+//                                  <16 x i1> %Mask)
+// to a chain of basic blocks that store one element at a time if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptrs = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  assert(isa<VectorType>(Src->getType()) &&
+         "Unexpected data type in masked scatter intrinsic");
+  assert(isa<VectorType>(Ptrs->getType()) &&
+         isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+         "Vector of pointers is expected in masked scatter intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+  // Shortcut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + bool EverMadeChange = false; + + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool MadeChange = true; + while (MadeChange) { + MadeChange = false; + for (Function::iterator I = F.begin(); I != F.end();) { + BasicBlock *BB = &*I++; + bool ModifiedDTOnIteration = false; + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + if (ModifiedDTOnIteration) + break; + } + + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { + bool MadeChange = false; + + BasicBlock::iterator CurInstIterator = BB.begin(); + while (CurInstIterator != BB.end()) { + if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++)) + MadeChange |= optimizeCallInst(CI, ModifiedDT); + if (ModifiedDT) + return true; + } + + return MadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, + bool &ModifiedDT) { + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + if (II) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType())) { + scalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { + scalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } + case 
Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + scalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + scalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } + } + } + + return false; +} diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp index 427d95268c74..dc72ac073258 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -1,4 +1,4 @@ -//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===// +//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,22 +7,32 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAG class, which is a base class used by -// scheduling implementation classes. +/// \file Implements the ScheduleDAG class, which is a base class used by +/// scheduling implementation classes. // //===----------------------------------------------------------------------===// +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "pre-RA-sched" @@ -33,58 +43,52 @@ static cl::opt<bool> StressSchedOpt( cl::desc("Stress test instruction scheduling")); #endif -void SchedulingPriorityQueue::anchor() { } +void SchedulingPriorityQueue::anchor() {} ScheduleDAG::ScheduleDAG(MachineFunction &mf) : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()), TRI(mf.getSubtarget().getRegisterInfo()), MF(mf), - MRI(mf.getRegInfo()), EntrySU(), ExitSU() { + MRI(mf.getRegInfo()) { #ifndef NDEBUG StressSched = StressSchedOpt; #endif } -ScheduleDAG::~ScheduleDAG() {} +ScheduleDAG::~ScheduleDAG() = default; -/// Clear the DAG state (e.g. between scheduling regions). void ScheduleDAG::clearDAG() { SUnits.clear(); EntrySU = SUnit(); ExitSU = SUnit(); } -/// getInstrDesc helper to handle SDNodes. const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { if (!Node || !Node->isMachineOpcode()) return nullptr; return &TII->get(Node->getMachineOpcode()); } -/// addPred - This adds the specified edge as a pred of the current node if -/// not already. It also adds the current node as a successor of the -/// specified node. bool SUnit::addPred(const SDep &D, bool Required) { // If this node already has this dependence, don't add a redundant one. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { + for (SDep &PredDep : Preds) { // Zero-latency weak edges may be added purely for heuristic ordering. Don't // add them if another kind of edge already exists. 
- if (!Required && I->getSUnit() == D.getSUnit()) + if (!Required && PredDep.getSUnit() == D.getSUnit()) return false; - if (I->overlaps(D)) { - // Extend the latency if needed. Equivalent to removePred(I) + addPred(D). - if (I->getLatency() < D.getLatency()) { - SUnit *PredSU = I->getSUnit(); + if (PredDep.overlaps(D)) { + // Extend the latency if needed. Equivalent to + // removePred(PredDep) + addPred(D). + if (PredDep.getLatency() < D.getLatency()) { + SUnit *PredSU = PredDep.getSUnit(); // Find the corresponding successor in N. - SDep ForwardD = *I; + SDep ForwardD = PredDep; ForwardD.setSUnit(this); - for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(), - EE = PredSU->Succs.end(); II != EE; ++II) { - if (*II == ForwardD) { - II->setLatency(D.getLatency()); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(D.getLatency()); break; } } - I->setLatency(D.getLatency()); + PredDep.setLatency(D.getLatency()); } return false; } @@ -95,8 +99,10 @@ bool SUnit::addPred(const SDep &D, bool Required) { SUnit *N = D.getSUnit(); // Update the bookkeeping. if (D.getKind() == SDep::Data) { - assert(NumPreds < UINT_MAX && "NumPreds will overflow!"); - assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!"); + assert(NumPreds < std::numeric_limits<unsigned>::max() && + "NumPreds will overflow!"); + assert(N->NumSuccs < std::numeric_limits<unsigned>::max() && + "NumSuccs will overflow!"); ++NumPreds; ++N->NumSuccs; } @@ -105,7 +111,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++WeakPredsLeft; } else { - assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!"); + assert(NumPredsLeft < std::numeric_limits<unsigned>::max() && + "NumPredsLeft will overflow!"); ++NumPredsLeft; } } @@ -114,7 +121,8 @@ bool SUnit::addPred(const SDep &D, bool Required) { ++N->WeakSuccsLeft; } else { - assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!"); + assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() && + "NumSuccsLeft will overflow!"); ++N->NumSuccsLeft; } } @@ -127,51 +135,46 @@ bool SUnit::addPred(const SDep &D, bool Required) { return true; } -/// removePred - This removes the specified edge as a pred of the current -/// node if it exists. It also removes the current node as a successor of -/// the specified node. void SUnit::removePred(const SDep &D) { // Find the matching predecessor. - for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) - if (*I == D) { - // Find the corresponding successor in N. - SDep P = D; - P.setSUnit(this); - SUnit *N = D.getSUnit(); - SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P); - assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); - N->Succs.erase(Succ); - Preds.erase(I); - // Update the bookkeeping. - if (P.getKind() == SDep::Data) { - assert(NumPreds > 0 && "NumPreds will underflow!"); - assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); - --NumPreds; - --N->NumSuccs; - } - if (!N->isScheduled) { - if (D.isWeak()) - --WeakPredsLeft; - else { - assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); - --NumPredsLeft; - } - } - if (!isScheduled) { - if (D.isWeak()) - --N->WeakSuccsLeft; - else { - assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); - --N->NumSuccsLeft; - } - } - if (P.getLatency() != 0) { - this->setDepthDirty(); - N->setHeightDirty(); - } - return; + SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D); + if (I == Preds.end()) + return; + // Find the corresponding successor in N. 
+ SDep P = D; + P.setSUnit(this); + SUnit *N = D.getSUnit(); + SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P); + assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); + N->Succs.erase(Succ); + Preds.erase(I); + // Update the bookkeeping. + if (P.getKind() == SDep::Data) { + assert(NumPreds > 0 && "NumPreds will underflow!"); + assert(N->NumSuccs > 0 && "NumSuccs will underflow!"); + --NumPreds; + --N->NumSuccs; + } + if (!N->isScheduled) { + if (D.isWeak()) + --WeakPredsLeft; + else { + assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); + --NumPredsLeft; } + } + if (!isScheduled) { + if (D.isWeak()) + --N->WeakSuccsLeft; + else { + assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); + --N->NumSuccsLeft; + } + } + if (P.getLatency() != 0) { + this->setDepthDirty(); + N->setHeightDirty(); + } } void SUnit::setDepthDirty() { @@ -181,9 +184,8 @@ void SUnit::setDepthDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isDepthCurrent = false; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), - E = SU->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (SDep &SuccDep : SU->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isDepthCurrent) WorkList.push_back(SuccSU); } @@ -197,18 +199,14 @@ void SUnit::setHeightDirty() { do { SUnit *SU = WorkList.pop_back_val(); SU->isHeightCurrent = false; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), - E = SU->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (SDep &PredDep : SU->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isHeightCurrent) WorkList.push_back(PredSU); } } while (!WorkList.empty()); } -/// setDepthToAtLeast - Update this node's successors to reflect the -/// fact that this node's depth just increased. -/// void SUnit::setDepthToAtLeast(unsigned NewDepth) { if (NewDepth <= getDepth()) return; @@ -217,9 +215,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) { isDepthCurrent = true; } -/// setHeightToAtLeast - Update this node's predecessors to reflect the -/// fact that this node's height just increased. -/// void SUnit::setHeightToAtLeast(unsigned NewHeight) { if (NewHeight <= getHeight()) return; @@ -228,8 +223,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) { isHeightCurrent = true; } -/// ComputeDepth - Calculate the maximal path from the node to the exit. -/// +/// Calculates the maximal path from the node to the exit. void SUnit::ComputeDepth() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -238,12 +232,11 @@ void SUnit::ComputeDepth() { bool Done = true; unsigned MaxPredDepth = 0; - for (SUnit::const_pred_iterator I = Cur->Preds.begin(), - E = Cur->Preds.end(); I != E; ++I) { - SUnit *PredSU = I->getSUnit(); + for (const SDep &PredDep : Cur->Preds) { + SUnit *PredSU = PredDep.getSUnit(); if (PredSU->isDepthCurrent) MaxPredDepth = std::max(MaxPredDepth, - PredSU->Depth + I->getLatency()); + PredSU->Depth + PredDep.getLatency()); else { Done = false; WorkList.push_back(PredSU); @@ -261,8 +254,7 @@ void SUnit::ComputeDepth() { } while (!WorkList.empty()); } -/// ComputeHeight - Calculate the maximal path from the node to the entry. -/// +/// Calculates the maximal path from the node to the entry. 
void SUnit::ComputeHeight() { SmallVector<SUnit*, 8> WorkList; WorkList.push_back(this); @@ -271,12 +263,11 @@ void SUnit::ComputeHeight() { bool Done = true; unsigned MaxSuccHeight = 0; - for (SUnit::const_succ_iterator I = Cur->Succs.begin(), - E = Cur->Succs.end(); I != E; ++I) { - SUnit *SuccSU = I->getSUnit(); + for (const SDep &SuccDep : Cur->Succs) { + SUnit *SuccSU = SuccDep.getSUnit(); if (SuccSU->isHeightCurrent) MaxSuccHeight = std::max(MaxSuccHeight, - SuccSU->Height + I->getLatency()); + SuccSU->Height + SuccDep.getLatency()); else { Done = false; WorkList.push_back(SuccSU); @@ -310,6 +301,7 @@ void SUnit::biasCriticalPath() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { if (this == &DAG->ExitSU) OS << "ExitSU"; @@ -319,15 +311,13 @@ void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { OS << "SU(" << NodeNum << ")"; } -/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or -/// a group of nodes flagged together. -void SUnit::dump(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const { print(dbgs(), G); dbgs() << ": "; G->dumpNode(this); } -void SUnit::dumpAll(const ScheduleDAG *G) const { +LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const { dump(G); dbgs() << " # preds left : " << NumPredsLeft << "\n"; @@ -343,41 +333,39 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { if (Preds.size() != 0) { dbgs() << " Predecessors:\n"; - for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end(); - I != E; ++I) { + for (const SDep &SuccDep : Preds) { dbgs() << " "; - switch (I->getKind()) { + switch (SuccDep.getKind()) { case SDep::Data: dbgs() << "data "; break; case SDep::Anti: dbgs() << "anti "; break; case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) + SuccDep.getSUnit()->print(dbgs(), G); + if (SuccDep.isArtificial()) dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); + dbgs() << ": Latency=" << SuccDep.getLatency(); + if (SuccDep.isAssignedRegDep()) + dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI); dbgs() << "\n"; } } if (Succs.size() != 0) { dbgs() << " Successors:\n"; - for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end(); - I != E; ++I) { + for (const SDep &SuccDep : Succs) { dbgs() << " "; - switch (I->getKind()) { + switch (SuccDep.getKind()) { case SDep::Data: dbgs() << "data "; break; case SDep::Anti: dbgs() << "anti "; break; case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - I->getSUnit()->print(dbgs(), G); - if (I->isArtificial()) + SuccDep.getSUnit()->print(dbgs(), G); + if (SuccDep.isArtificial()) dbgs() << " *"; - dbgs() << ": Latency=" << I->getLatency(); - if (I->isAssignedRegDep()) - dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI); + dbgs() << ": Latency=" << SuccDep.getLatency(); + if (SuccDep.isAssignedRegDep()) + dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI); dbgs() << "\n"; } } @@ -385,47 +373,44 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { #endif #ifndef NDEBUG -/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that -/// their state is consistent. Return the number of scheduled nodes. 
-/// unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { bool AnyNotSched = false; unsigned DeadNodes = 0; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - if (!SUnits[i].isScheduled) { - if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) { + for (const SUnit &SUnit : SUnits) { + if (!SUnit.isScheduled) { + if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) { ++DeadNodes; continue; } if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has not been scheduled!\n"; AnyNotSched = true; } - if (SUnits[i].isScheduled && - (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) > - unsigned(INT_MAX)) { + if (SUnit.isScheduled && + (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) > + unsigned(std::numeric_limits<int>::max())) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has an unexpected " << (isBottomUp ? "Height" : "Depth") << " value!\n"; AnyNotSched = true; } if (isBottomUp) { - if (SUnits[i].NumSuccsLeft != 0) { + if (SUnit.NumSuccsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has successors left!\n"; AnyNotSched = true; } } else { - if (SUnits[i].NumPredsLeft != 0) { + if (SUnit.NumPredsLeft != 0) { if (!AnyNotSched) dbgs() << "*** Scheduling failed! ***\n"; - SUnits[i].dump(this); + SUnit.dump(this); dbgs() << "has predecessors left!\n"; AnyNotSched = true; } @@ -436,36 +421,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { } #endif -/// InitDAGTopologicalSorting - create the initial topological -/// ordering from the DAG to be scheduled. -/// -/// The idea of the algorithm is taken from -/// "Online algorithms for managing the topological order of -/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in -/// "Maintaining a topological order under edge insertions". -/// -/// Short description of the algorithm: -/// -/// Topological ordering, ord, of a DAG maps each node to a topological -/// index so that for all edges X->Y it is the case that ord(X) < ord(Y). -/// -/// This means that if there is a path from the node X to the node Z, -/// then ord(X) < ord(Z). -/// -/// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would -/// create a cycle. -/// -/// The algorithm first computes a topological ordering for the DAG by -/// initializing the Index2Node and Node2Index arrays and then tries to keep -/// the ordering up-to-date after edge insertions by reordering the DAG. -/// -/// On insertion of the edge X->Y, the algorithm first marks by calling DFS -/// the nodes reachable from Y, and then shifts them using Shift to lie -/// immediately after X in Index2Node. void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { + // The idea of the algorithm is taken from + // "Online algorithms for managing the topological order of + // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly + // This is the MNR algorithm, which was first introduced by + // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in + // "Maintaining a topological order under edge insertions". 
+ // + // Short description of the algorithm: + // + // Topological ordering, ord, of a DAG maps each node to a topological + // index so that for all edges X->Y it is the case that ord(X) < ord(Y). + // + // This means that if there is a path from the node X to the node Z, + // then ord(X) < ord(Z). + // + // This property can be used to check for reachability of nodes: + // if Z is reachable from X, then an insertion of the edge Z->X would + // create a cycle. + // + // The algorithm first computes a topological ordering for the DAG by + // initializing the Index2Node and Node2Index arrays and then tries to keep + // the ordering up-to-date after edge insertions by reordering the DAG. + // + // On insertion of the edge X->Y, the algorithm first marks by calling DFS + // the nodes reachable from Y, and then shifts them using Shift to lie + // immediately after X in Index2Node. unsigned DAGSize = SUnits.size(); std::vector<SUnit*> WorkList; WorkList.reserve(DAGSize); @@ -476,18 +458,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { // Initialize the data structures. if (ExitSU) WorkList.push_back(ExitSU); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); + for (SUnit &SU : SUnits) { + int NodeNum = SU.NodeNum; + unsigned Degree = SU.Succs.size(); // Temporarily use the Node2Index array as scratch space for degree counts. Node2Index[NodeNum] = Degree; // Is it a node without dependencies? if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); + assert(SU.Succs.empty() && "SUnit should have no successors"); // Collect leaf nodes. - WorkList.push_back(SU); + WorkList.push_back(&SU); } } @@ -497,9 +478,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { WorkList.pop_back(); if (SU->NodeNum < DAGSize) Allocate(SU->NodeNum, --Id); - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit *SU = I->getSUnit(); + for (const SDep &PredDep : SU->Preds) { + SUnit *SU = PredDep.getSUnit(); if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum]) // If all dependencies of the node are processed already, // then the node can be computed now. @@ -511,19 +491,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { #ifndef NDEBUG // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && + for (SUnit &SU : SUnits) { + for (const SDep &PD : SU.Preds) { + assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] && "Wrong topological sorting"); } } #endif } -/// AddPred - Updates the topological ordering to accommodate an edge -/// to be added from SUnit X to SUnit Y. void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { int UpperBound, LowerBound; LowerBound = Node2Index[Y->NodeNum]; @@ -540,16 +516,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) { } } -/// RemovePred - Updates the topological ordering to accommodate an -/// an edge to be removed from the specified node N from the predecessors -/// of the current node M. void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { // InitDAGTopologicalSorting(); } -/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark -/// all nodes affected by the edge insertion. 
These nodes will later get new -/// topological indexes by means of the Shift method. void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool &HasLoop) { std::vector<const SUnit*> WorkList; @@ -560,8 +530,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, SU = WorkList.back(); WorkList.pop_back(); Visited.set(SU->NodeNum); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - unsigned s = SU->Succs[I].getSUnit()->NodeNum; + for (const SDep &SuccDep + : make_range(SU->Succs.rbegin(), SU->Succs.rend())) { + unsigned s = SuccDep.getSUnit()->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). if (s >= Node2Index.size()) continue; @@ -571,14 +542,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, } // Visit successors if not already and in affected region. if (!Visited.test(s) && Node2Index[s] < UpperBound) { - WorkList.push_back(SU->Succs[I].getSUnit()); + WorkList.push_back(SuccDep.getSUnit()); } } } while (!WorkList.empty()); } -/// Shift - Renumber the nodes so that the topological ordering is -/// preserved. +std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, + const SUnit &TargetSU, + bool &Success) { + std::vector<const SUnit*> WorkList; + int LowerBound = Node2Index[StartSU.NodeNum]; + int UpperBound = Node2Index[TargetSU.NodeNum]; + bool Found = false; + BitVector VisitedBack; + std::vector<int> Nodes; + + if (LowerBound > UpperBound) { + Success = false; + return Nodes; + } + + WorkList.reserve(SUnits.size()); + Visited.reset(); + + // Starting from StartSU, visit all successors up + // to UpperBound. + WorkList.push_back(&StartSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Succs.size()-1; I >= 0; --I) { + const SUnit *Succ = SU->Succs[I].getSUnit(); + unsigned s = Succ->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). + if (Succ->isBoundaryNode()) + continue; + if (Node2Index[s] == UpperBound) { + Found = true; + continue; + } + // Visit successors if not already and in affected region. + if (!Visited.test(s) && Node2Index[s] < UpperBound) { + Visited.set(s); + WorkList.push_back(Succ); + } + } + } while (!WorkList.empty()); + + if (!Found) { + Success = false; + return Nodes; + } + + WorkList.clear(); + VisitedBack.resize(SUnits.size()); + Found = false; + + // Starting from TargetSU, visit all predecessors up + // to LowerBound. SUs that are visited by the two + // passes are added to Nodes. + WorkList.push_back(&TargetSU); + do { + const SUnit *SU = WorkList.back(); + WorkList.pop_back(); + for (int I = SU->Preds.size()-1; I >= 0; --I) { + const SUnit *Pred = SU->Preds[I].getSUnit(); + unsigned s = Pred->NodeNum; + // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). 
+ if (Pred->isBoundaryNode()) + continue; + if (Node2Index[s] == LowerBound) { + Found = true; + continue; + } + if (!VisitedBack.test(s) && Visited.test(s)) { + VisitedBack.set(s); + WorkList.push_back(Pred); + Nodes.push_back(s); + } + } + } while (!WorkList.empty()); + + assert(Found && "Error in SUnit Graph!"); + Success = true; + return Nodes; +} + void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, int UpperBound) { std::vector<int> L; @@ -598,28 +648,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, } } - for (unsigned j = 0; j < L.size(); ++j) { - Allocate(L[j], i - shift); + for (unsigned LI : L) { + Allocate(LI, i - shift); i = i + 1; } } - -/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will -/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU). bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) { // Is SU reachable from TargetSU via successor edges? if (IsReachable(SU, TargetSU)) return true; - for (SUnit::pred_iterator - I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I) - if (I->isAssignedRegDep() && - IsReachable(SU, I->getSUnit())) + for (const SDep &PredDep : TargetSU->Preds) + if (PredDep.isAssignedRegDep() && + IsReachable(SU, PredDep.getSUnit())) return true; return false; } -/// IsReachable - Checks if SU is reachable from TargetSU. bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, const SUnit *TargetSU) { // If insertion of the edge SU->TargetSU would create a cycle @@ -637,7 +682,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, return HasLoop; } -/// Allocate - assign the topological index to the node n. void ScheduleDAGTopologicalSort::Allocate(int n, int index) { Node2Index[n] = index; Index2Node[index] = n; @@ -647,4 +691,4 @@ ScheduleDAGTopologicalSort:: ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu) : SUnits(sunits), ExitSU(exitsu) {} -ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} +ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default; diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 611c5a71bd5a..8035ea80364b 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This implements the ScheduleDAGInstrs class, which implements re-scheduling -// of MachineInstrs. +/// \file This implements the ScheduleDAGInstrs class, which implements +/// re-scheduling of MachineInstrs. // //===----------------------------------------------------------------------===// @@ -101,8 +101,8 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, SchedModel.init(ST.getSchedModel(), &ST, TII); } -/// getUnderlyingObjectFromInt - This is the function that does the work of -/// looking through basic ptrtoint+arithmetic+inttoptr sequences. +/// This is the function that does the work of looking through basic +/// ptrtoint+arithmetic+inttoptr sequences. static const Value *getUnderlyingObjectFromInt(const Value *V) { do { if (const Operator *U = dyn_cast<Operator>(V)) { @@ -129,8 +129,8 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) { } while (1); } -/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects -/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences. 
+/// This is a wrapper around GetUnderlyingObjects and adds support for basic +/// ptrtoint+arithmetic+inttoptr sequences. static void getUnderlyingObjects(const Value *V, SmallVectorImpl<Value *> &Objects, const DataLayout &DL) { @@ -158,9 +158,8 @@ static void getUnderlyingObjects(const Value *V, } while (!Working.empty()); } -/// getUnderlyingObjectsForInstr - If this machine instr has memory reference -/// information and it can be tracked to a normal reference to a known -/// object, return the Value for that object. +/// If this machine instr has memory reference information and it can be tracked +/// to a normal reference to a known object, return the Value for that object. static void getUnderlyingObjectsForInstr(const MachineInstr *MI, const MachineFrameInfo &MFI, UnderlyingObjectsVector &Objects, @@ -216,10 +215,6 @@ void ScheduleDAGInstrs::finishBlock() { BB = nullptr; } -/// Initialize the DAG and common scheduler state for the current scheduling -/// region. This does not actually create the DAG, only clears it. The -/// scheduling driver may call BuildSchedGraph multiple times per scheduling -/// region. void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, MachineBasicBlock::iterator begin, MachineBasicBlock::iterator end, @@ -230,20 +225,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, NumRegionInstrs = regioninstrs; } -/// Close the current scheduling region. Don't clear any state in case the -/// driver wants to refer to the previous scheduling region. void ScheduleDAGInstrs::exitRegion() { // Nothing to do. } -/// addSchedBarrierDeps - Add dependencies from instructions in the current -/// list of instructions being scheduled to scheduling barrier by adding -/// the exit SU to the register defs and use list. This is because we want to -/// make sure instructions which define registers that are either used by -/// the terminator or are live-out are properly scheduled. This is -/// especially important when the definition latency of the return value(s) -/// are too high to be hidden by the branch or when the liveout registers -/// used by instructions in the fallthrough block. void ScheduleDAGInstrs::addSchedBarrierDeps() { MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr; ExitSU.setInstr(ExitMI); @@ -271,7 +256,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { } } -/// MO is an operand of SU's instruction that defines a physical register. Add +/// MO is an operand of SU's instruction that defines a physical register. Adds /// data dependencies from SU to any uses of the physical register. void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx); @@ -313,9 +298,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { } } -/// addPhysRegDeps - Add register dependencies (data, anti, and output) from -/// this SUnit to following instructions in the same scheduling region that -/// depend the physical register referenced at OperIdx. +/// \brief Adds register dependencies (data, anti, and output) from this SUnit +/// to following instructions in the same scheduling region that depend the +/// physical register referenced at OperIdx. 
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); MachineOperand &MO = MI->getOperand(OperIdx); @@ -406,9 +391,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const return TRI->getSubRegIndexLaneMask(SubReg); } -/// addVRegDefDeps - Add register output and data dependencies from this SUnit -/// to instructions that occur later in the same scheduling region if they read -/// from or write to the virtual register defined at OperIdx. +/// Adds register output and data dependencies from this SUnit to instructions +/// that occur later in the same scheduling region if they read from or write to +/// the virtual register defined at OperIdx. /// /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. @@ -515,10 +500,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } -/// addVRegUseDeps - Add a register data dependency if the instruction that -/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a -/// register antidependency from this SUnit to instructions that occur later in -/// the same scheduling region if they write the virtual register. +/// \brief Adds a register data dependency if the instruction that defines the +/// virtual register used at OperIdx is mapped to an SUnit. Add a register +/// antidependency from this SUnit to instructions that occur later in the same +/// scheduling region if they write the virtual register. /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { @@ -545,87 +530,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { } } -/// Return true if MI is an instruction we are unable to reason about +/// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { return MI->isCall() || MI->hasUnmodeledSideEffects() || (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA)); } -/// This returns true if the two MIs need a chain edge between them. -/// This is called on normal stores and loads. -static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, - const DataLayout &DL, MachineInstr *MIa, - MachineInstr *MIb) { - const MachineFunction *MF = MIa->getParent()->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - - assert ((MIa->mayStore() || MIb->mayStore()) && - "Dependency checked between two loads"); - - // Let the target decide if memory accesses cannot possibly overlap. - if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA)) - return false; - - // To this point analysis is generic. From here on we do need AA. - if (!AA) - return true; - - // FIXME: Need to handle multiple memory operands to support all targets. - if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) - return true; - - MachineMemOperand *MMOa = *MIa->memoperands_begin(); - MachineMemOperand *MMOb = *MIb->memoperands_begin(); - - if (!MMOa->getValue() || !MMOb->getValue()) - return true; - - // The following interface to AA is fashioned after DAGCombiner::isAlias - // and operates with MachineMemOperand offset with some important - // assumptions: - // - LLVM fundamentally assumes flat address spaces. 
- // - MachineOperand offset can *only* result from legalization and - // cannot affect queries other than the trivial case of overlap - // checking. - // - These offsets never wrap and never step outside - // of allocated objects. - // - There should never be any negative offsets here. - // - // FIXME: Modify API to hide this math from "user" - // FIXME: Even before we go to AA we can reason locally about some - // memory objects. It can save compile time, and possibly catch some - // corner cases not currently covered. - - assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); - assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); - - int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); - int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; - int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset; - - AliasResult AAResult = - AA->alias(MemoryLocation(MMOa->getValue(), Overlapa, - UseTBAA ? MMOa->getAAInfo() : AAMDNodes()), - MemoryLocation(MMOb->getValue(), Overlapb, - UseTBAA ? MMOb->getAAInfo() : AAMDNodes())); - - return (AAResult != NoAlias); -} - -/// Check whether two objects need a chain edge and add it if needed. void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb, unsigned Latency) { - if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(), - SUb->getInstr())) { + if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) { SDep Dep(SUa, SDep::MayAliasMem); Dep.setLatency(Latency); SUb->addPred(Dep); } } -/// Create an SUnit for each real instruction, numbered in top-down topological -/// order. The instruction order A < B, implies that no edge exists from B to A. +/// \brief Creates an SUnit for each real instruction, numbered in top-down +/// topological order. The instruction order A < B, implies that no edge exists +/// from B to A. /// /// Map each real instruction to its SUnit. /// @@ -682,14 +605,13 @@ void ScheduleDAGInstrs::initSUnits() { } class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> { - /// Current total number of SUs in map. unsigned NumNodes; /// 1 for loads, 0 for stores. (see comment in SUList) unsigned TrueMemOrderLatency; -public: +public: Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {} /// To keep NumNodes up to date, insert() is used instead of @@ -697,8 +619,8 @@ public: ValueType &operator[](const SUList &Key) { llvm_unreachable("Don't use. Use insert() instead."); }; - /// Add SU to the SUList of V. If Map grows huge, reduce its size - /// by calling reduce(). + /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling + /// reduce(). void inline insert(SUnit *SU, ValueType V) { MapVector::operator[](V).push_back(SU); NumNodes++; @@ -723,7 +645,7 @@ public: unsigned inline size() const { return NumNodes; } - /// Count the number of SUs in this map after a reduction. + /// Counts the number of SUs in this map after a reduction. void reComputeSize(void) { NumNodes = 0; for (auto &I : *this) @@ -797,9 +719,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) { map.reComputeSize(); } -/// If RegPressure is non-null, compute register pressure as a side effect. The -/// DAG builder is an efficient place to do it because it already visits -/// operands. 
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, PressureDiffs *PDiffs, @@ -1088,10 +1007,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() { } } -/// Reduce maps in FIFO order, by N SUs. This is better than turning -/// every Nth memory SU into BarrierChain in buildSchedGraph(), since -/// it avoids unnecessary edges between seen SUs above the new -/// BarrierChain, and those below it. void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, Value2SUsMap &loads, unsigned N) { DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; @@ -1142,183 +1057,77 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, loads.dump()); } -/// \brief Initialize register live-range state for updating kills. -void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { - // Start with no live registers. - LiveRegs.reset(); - - // Examine the live-in regs of all successors. - for (const MachineBasicBlock *Succ : BB->successors()) { - for (const auto &LI : Succ->liveins()) { - // Repeat, for reg and all subregs. - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); - } - } -} - -/// \brief If we change a kill flag on the bundle instruction implicit register -/// operands, then we also need to propagate that to any instructions inside -/// the bundle which had the same kill state. -static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, - bool NewKillState, - const TargetRegisterInfo *TRI) { - if (MI->getOpcode() != TargetOpcode::BUNDLE) - return; - - // Walk backwards from the last instruction in the bundle to the first. - // Once we set a kill flag on an instruction, we bail out, as otherwise we - // might set it on too many operands. We will clear as many flags as we - // can though. - MachineBasicBlock::instr_iterator Begin = MI->getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (Begin != End) { - if (NewKillState) { - if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) - return; - } else - (--End)->clearRegisterKills(Reg, TRI); - } -} - -bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) { - // Setting kill flag... - if (!MO.isKill()) { - MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); - return false; - } - - // If MO itself is live, clear the kill flag... - if (LiveRegs.test(MO.getReg())) { - MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); - return false; - } - - // If any subreg of MO is live, then create an imp-def for that - // subreg and keep MO marked as killed. - MO.setIsKill(false); - toggleBundleKillFlag(MI, MO.getReg(), false, TRI); - bool AllDead = true; - const unsigned SuperReg = MO.getReg(); - MachineInstrBuilder MIB(MF, MI); - for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - MIB.addReg(*SubRegs, RegState::ImplicitDefine); - AllDead = false; - } - } +static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, + MachineInstr &MI, bool addToLiveRegs) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; - if(AllDead) { - MO.setIsKill(true); - toggleBundleKillFlag(MI, MO.getReg(), true, TRI); + // Things that are available after the instruction are killed by it. 
+ bool IsKill = LiveRegs.available(MRI, Reg); + MO.setIsKill(IsKill); + if (IsKill && addToLiveRegs) + LiveRegs.addReg(Reg); } - return false; } -// FIXME: Reuse the LivePhysRegs utility for this. -void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n'); +void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { + DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n'); - LiveRegs.resize(TRI->getNumRegs()); - BitVector killedRegs(TRI->getNumRegs()); - - startBlockForKills(MBB); + LiveRegs.init(*TRI); + LiveRegs.addLiveOuts(MBB); // Examine block from end to start... - unsigned Count = MBB->size(); - for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin(); - I != E; --Count) { - MachineInstr &MI = *--I; + for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { if (MI.isDebugValue()) continue; // Update liveness. Registers that are defed but not used in this // instruction are now dead. Mark register and all subregs as they // are completely defined. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isRegMask()) - LiveRegs.clearBitsNotInMask(MO.getRegMask()); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0) continue; - if (!MO.isDef()) continue; - // Ignore two-addr defs. - if (MI.isRegTiedToUseOperand(i)) continue; - - // Repeat for reg and all subregs. - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.reset(*SubRegs); - } - - // Examine all used registers and set/clear kill flag. When a - // register is used multiple times we only set the kill flag on - // the first use. Don't set kill flags on undef operands. - killedRegs.reset(); - - // toggleKillFlag can append new operands (implicit defs), so using - // a range-based loop is not safe. The new operands will be appended - // at the end of the operand list and they don't need to be visited, - // so iterating until the currently last operand is ok. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - bool kill = false; - if (!killedRegs.test(Reg)) { - kill = true; - // A register is not killed if any subregs are live... - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - kill = false; - break; - } - } - - // If subreg is not live, then register is killed if it became - // live in this instruction - if (kill) - kill = !LiveRegs.test(Reg); - } - - if (MO.isKill() != kill) { - DEBUG(dbgs() << "Fixing " << MO << " in "); - toggleKillFlag(&MI, MO); - DEBUG(MI.dump()); - DEBUG({ - if (MI.getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI.getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (++Begin != End) - DEBUG(Begin->dump()); - } - }); + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; + if (MO.isReg()) { + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + LiveRegs.removeReg(Reg); + } else if (MO.isRegMask()) { + LiveRegs.removeRegsInMask(MO); } - - killedRegs.set(Reg); } - // Mark any used register (that is not using undef) and subregs as - // now live... 
- for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); + // If there is a bundle header fix it up first. + if (!MI.isBundled()) { + toggleKills(MRI, LiveRegs, MI, true); + } else { + MachineBasicBlock::instr_iterator First = MI.getIterator(); + if (MI.isBundle()) { + toggleKills(MRI, LiveRegs, MI, false); + ++First; + } + // Some targets make the (questionable) assumtion that the instructions + // inside the bundle are ordered and consequently only the last use of + // a register inside the bundle can kill it. + MachineBasicBlock::instr_iterator I = std::next(First); + while (I->isBundledWithSucc()) + ++I; + do { + if (!I->isDebugValue()) + toggleKills(MRI, LiveRegs, *I, true); + --I; + } while(I != First); } } } void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) SU->getInstr()->dump(); #endif @@ -1347,7 +1156,7 @@ std::string ScheduleDAGInstrs::getDAGName() const { //===----------------------------------------------------------------------===// namespace llvm { -/// \brief Internal state used to compute SchedDFSResult. +/// Internal state used to compute SchedDFSResult. class SchedDFSImpl { SchedDFSResult &R; @@ -1358,8 +1167,8 @@ class SchedDFSImpl { struct RootData { unsigned NodeID; - unsigned ParentNodeID; // Parent node (member of the parent subtree). - unsigned SubInstrCount; // Instr count in this tree only, not children. + unsigned ParentNodeID; ///< Parent node (member of the parent subtree). + unsigned SubInstrCount; ///< Instr count in this tree only, not children. RootData(unsigned id): NodeID(id), ParentNodeID(SchedDFSResult::InvalidSubtreeID), @@ -1375,7 +1184,7 @@ public: RootSet.setUniverse(R.DFSNodeData.size()); } - /// Return true if this node been visited by the DFS traversal. + /// Returns true if this node been visited by the DFS traversal. /// /// During visitPostorderNode the Node's SubtreeID is assigned to the Node /// ID. Later, SubtreeID is updated but remains valid. @@ -1384,7 +1193,7 @@ public: != SchedDFSResult::InvalidSubtreeID; } - /// Initialize this node's instruction count. We don't need to flag the node + /// Initializes this node's instruction count. We don't need to flag the node /// visited until visitPostorder because the DAG cannot have cycles. void visitPreorder(const SUnit *SU) { R.DFSNodeData[SU->NodeNum].InstrCount = @@ -1433,8 +1242,8 @@ public: RootSet[SU->NodeNum] = RData; } - /// Called once for each tree edge after calling visitPostOrderNode on the - /// predecessor. Increment the parent node's instruction count and + /// \brief Called once for each tree edge after calling visitPostOrderNode on + /// the predecessor. Increment the parent node's instruction count and /// preemptively join this subtree to its parent's if it is small enough. void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) { R.DFSNodeData[Succ->NodeNum].InstrCount @@ -1442,13 +1251,13 @@ public: joinPredSubtree(PredDep, Succ); } - /// Add a connection for cross edges. + /// Adds a connection for cross edges. 
void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) { ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ)); } - /// Set each node's subtree ID to the representative ID and record connections - /// between trees. + /// Sets each node's subtree ID to the representative ID and record + /// connections between trees. void finalize() { SubtreeClasses.compress(); R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); @@ -1484,8 +1293,8 @@ public: } protected: - /// Join the predecessor subtree with the successor that is its DFS - /// parent. Apply some heuristics before joining. + /// Joins the predecessor subtree with the successor that is its DFS parent. + /// Applies some heuristics before joining. bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ, bool CheckLimit = true) { assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); @@ -1531,10 +1340,10 @@ protected: } while (FromTree != SchedDFSResult::InvalidSubtreeID); } }; -} // namespace llvm +} // end namespace llvm namespace { -/// \brief Manage the stack used by a reverse depth-first search over the DAG. +/// Manage the stack used by a reverse depth-first search over the DAG. class SchedDAGReverseDFS { std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack; public: @@ -1569,7 +1378,7 @@ static bool hasDataSucc(const SUnit *SU) { return false; } -/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first +/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first /// search from this root. void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { if (!IsBottomUp) @@ -1626,8 +1435,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) { } } -LLVM_DUMP_METHOD -void ILPValue::print(raw_ostream &OS) const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const { OS << InstrCount << " / " << Length << " = "; if (!Length) OS << "BADILP"; @@ -1635,8 +1444,7 @@ void ILPValue::print(raw_ostream &OS) const { OS << format("%g", ((double)InstrCount / Length)); } -LLVM_DUMP_METHOD -void ILPValue::dump() const { +LLVM_DUMP_METHOD void ILPValue::dump() const { dbgs() << *this << '\n'; } @@ -1648,4 +1456,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { return OS; } -} // namespace llvm +} // end namespace llvm +#endif diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 83bc1ba7beb9..b3d83d5313af 100644 --- a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -1,4 +1,4 @@ -//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===// +//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===// // // The LLVM Compiler Infrastructure // @@ -15,11 +15,13 @@ #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include <cassert> using namespace llvm; @@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer( const InstrItineraryData *II, const ScheduleDAG *SchedDAG, const char *ParentDebugType) : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II), - 
DAG(SchedDAG), IssueWidth(0), IssueCount(0) { - + DAG(SchedDAG) { // Determine the maximum depth of any itinerary. This determines the depth of // the scoreboard. We always make the scoreboard at least 1 cycle deep to // avoid dealing with the boundary condition. diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2c7bffe76503..23a302f3e561 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" @@ -53,10 +54,6 @@ STATISTIC(SlicedLoads, "Number of load sliced"); namespace { static cl::opt<bool> - CombinerAA("combiner-alias-analysis", cl::Hidden, - cl::desc("Enable DAG combiner alias-analysis heuristics")); - - static cl::opt<bool> CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); @@ -117,7 +114,7 @@ namespace { SmallPtrSet<SDNode *, 32> CombinedNodes; // AA - Used for DAG load/store alias analysis. - AliasAnalysis &AA; + AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. @@ -133,6 +130,9 @@ namespace { /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { + assert(N->getOpcode() != ISD::DELETED_NODE && + "Deleted Node added to Worklist"); + // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) @@ -177,6 +177,7 @@ namespace { void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: + unsigned MaximumLegalStoreInBits; /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. 
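
The bit-propagation query described just above is one of the interfaces this diff migrates from a pair of KnownZero/KnownOne APInts to the KnownBits bundle pulled in by the new llvm/Support/KnownBits.h include (the corresponding SimplifyDemandedBits hunk appears further down in this file's diff). The idea is compact enough to sketch standalone. What follows is a minimal illustrative sketch only, assuming a fixed 64-bit width with uint64_t standing in for APInt; KnownBitsSketch, hasConflict, and computeAnd are made-up names, not LLVM's actual API:

#include <cassert>
#include <cstdint>

// Track, per bit position, whether that bit of a value is known to be zero,
// known to be one, or unknown (neither mask bit set). The two masks must
// never overlap: a bit cannot be known-zero and known-one at the same time.
struct KnownBitsSketch {
  uint64_t Zero = 0; // bit i set => bit i of the value is known to be 0
  uint64_t One = 0;  // bit i set => bit i of the value is known to be 1

  bool hasConflict() const { return (Zero & One) != 0; }
};

// Known bits of (a & b): a result bit is known one only if it is known one
// in both inputs, and known zero if it is known zero in either input.
static KnownBitsSketch computeAnd(const KnownBitsSketch &A,
                                  const KnownBitsSketch &B) {
  KnownBitsSketch R;
  R.One = A.One & B.One;
  R.Zero = A.Zero | B.Zero;
  assert(!R.hasConflict() && "known-zero and known-one bits overlap");
  return R;
}

int main() {
  KnownBitsSketch X; // nothing known about x
  KnownBitsSketch Mask;
  Mask.One = 0xffULL;   // the constant 0xff: the low 8 bits are 1...
  Mask.Zero = ~0xffULL; // ...and the upper 56 bits are 0
  KnownBitsSketch R = computeAnd(X, Mask);
  // For x & 0xff the upper 56 bits become known zero even though x itself is
  // unknown; facts of exactly this kind are what SimplifyDemandedBits
  // propagates through the DAG.
  assert(R.Zero == ~0xffULL && R.One == 0);
  return 0;
}

Keeping the two masks in one struct lets the no-overlap invariant be asserted in one place, rather than remaining an unstated convention at every call site that previously threaded KnownZero and KnownOne as separate out-parameters.
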
@@ -232,11 +233,18 @@ namespace { SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); + SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); + SDValue visitUADDO(SDNode *N); + SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitSUBC(SDNode *N); + SDValue visitUSUBO(SDNode *N); SDValue visitADDE(SDNode *N); + SDValue visitADDCARRY(SDNode *N); + SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); SDValue visitSUBE(SDNode *N); + SDValue visitSUBCARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); @@ -259,6 +267,7 @@ namespace { SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); + SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); @@ -274,6 +283,7 @@ namespace { SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); + SDValue visitAssertZext(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); @@ -336,6 +346,7 @@ namespace { SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); + SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); @@ -344,6 +355,8 @@ namespace { bool NotExtCompare = false); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); + SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans = true); @@ -361,14 +374,14 @@ namespace { SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue Op, const SDLoc &DL); - SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags); - SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip); + SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); + SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal); + SDNodeFlags Flags, bool Reciprocal); SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal); + SDNodeFlags Flags, bool Reciprocal); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -377,6 +390,7 @@ namespace { unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); @@ -384,9 +398,10 @@ namespace { SDValue 
reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); - SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask, - SDValue VecIn1, SDValue VecIn2, - unsigned LeftIdx); + SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, + ArrayRef<int> VectorMask, SDValue VecIn1, + SDValue VecIn2, unsigned LeftIdx); + SDValue matchVSelectOpSizesWithSetCC(SDNode *N); SDValue GetDemandedBits(SDValue V, const APInt &Mask); @@ -416,15 +431,12 @@ namespace { /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } + MemOpLink(LSBaseSDNode *N, int64_t Offset) + : MemNode(N), OffsetFromBase(Offset) {} // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; - // What is the sequence number of this mem node. - // Lowest mem operand in the DAG starts at zero. - unsigned SequenceNum; }; /// This is a helper function for visitMUL to check the profitability @@ -435,12 +447,6 @@ namespace { SDValue &AddNode, SDValue &ConstNode); - /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a - /// constant build_vector of the stored constant values in Stores. - SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL, - ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, - EVT Ty) const; /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns @@ -451,34 +457,35 @@ namespace { EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, bool &NarrowLoad); + /// Helper function for MergeConsecutiveStores which merges the + /// component store chains. + SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. - /// \return number of stores that were merged into a merged store (always - /// a prefix of \p StoreNode). - bool MergeStoresOfConstantsOrVecElts( - SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, - bool IsConstantSrc, bool UseVector); + /// \return True if a merged store was created. + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, + EVT MemVT, unsigned NumStores, + bool IsConstantSrc, bool UseVector); /// This is a helper function for MergeConsecutiveStores. /// Stores that may be merged are placed in StoreNodes. - /// Loads that may alias with those stores are placed in AliasLoadNodes. - void getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); + void getStoreMergeCandidates(StoreSDNode *St, + SmallVectorImpl<MemOpLink> &StoreNodes); /// Helper function for MergeConsecutiveStores. Checks if /// Candidate stores have indirect dependency through their /// operands. \return True if safe to merge bool checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes); + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); /// Merge consecutive store operations into a wide store. 
/// This optimization uses wide integers or vectors when possible. /// \return number of stores that were merged into a merged store (the /// affected nodes are stored as a prefix in \p StoreNodes). - bool MergeConsecutiveStores(StoreSDNode *N, - SmallVectorImpl<MemOpLink> &StoreNodes); + bool MergeConsecutiveStores(StoreSDNode *N); /// \brief Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) @@ -489,10 +496,17 @@ namespace { SDValue distributeTruncateThroughAnd(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) + DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); + + MaximumLegalStoreInBits = 0; + for (MVT VT : MVT::all_valuetypes()) + if (EVT(VT).isSimple() && VT != MVT::Other && + TLI.isTypeLegal(EVT(VT)) && + VT.getSizeInBits() >= MaximumLegalStoreInBits) + MaximumLegalStoreInBits = VT.getSizeInBits(); } /// Runs the dag combiner on all nodes in the work list @@ -607,10 +621,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, switch (Op.getOpcode()) { default: return false; - case ISD::ConstantFP: - // Don't invert constant FP values after legalize. The negated constant - // isn't necessarily legal. - return LegalOperations ? 0 : 1; + case ISD::ConstantFP: { + if (!LegalOperations) + return 1; + + // Don't invert constant FP values after legalization unless the target says + // the negated constant is legal. + EVT VT = Op.getValueType(); + return TLI.isOperationLegal(ISD::ConstantFP, VT) || + TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT); + } case ISD::FADD: // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; @@ -629,7 +649,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros()) + if (!Options->NoSignedZerosFPMath && + !Op.getNode()->getFlags().hasNoSignedZeros()) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) @@ -667,7 +688,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); - const SDNodeFlags *Flags = Op.getNode()->getFlags(); + const SDNodeFlags Flags = Op.getNode()->getFlags(); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); @@ -950,8 +971,8 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); - APInt KnownZero, KnownOne; - if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + KnownBits Known; + if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO)) return false; // Revisit the node. 
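The KnownBits parameter above bundles the old KnownZero/KnownOne APInt pair into a single struct. Below is a toy uint64_t model of the same bookkeeping, showing the propagation rule for AND; ToyKnownBits and knownAnd are invented stand-ins for this note, not LLVM's KnownBits API.

#include <cassert>
#include <cstdint>

struct ToyKnownBits {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

// For AND: a result bit is known 0 if it is known 0 in either input, and
// known 1 only if it is known 1 in both.
static ToyKnownBits knownAnd(ToyKnownBits L, ToyKnownBits R) {
  return {L.Zero | R.Zero, L.One & R.One};
}

int main() {
  ToyKnownBits X{0, 0};                 // nothing known about X
  ToyKnownBits Mask{~0xFFull, 0xFFull}; // the constant 0xFF
  ToyKnownBits K = knownAnd(X, Mask);
  assert(K.Zero == ~0xFFull && K.One == 0); // high bits of (X & 0xFF) are 0
  return 0;
}
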
@@ -1079,37 +1100,36 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); - if (!NN0.getNode()) - return SDValue(); bool Replace1 = false; SDValue N1 = Op.getOperand(1); - SDValue NN1; - if (N0 == N1) - NN1 = NN0; - else { - NN1 = PromoteOperand(N1, PVT, Replace1); - if (!NN1.getNode()) - return SDValue(); - } + SDValue NN1 = PromoteOperand(N1, PVT, Replace1); + SDLoc DL(Op); - AddToWorklist(NN0.getNode()); - if (NN1.getNode()) - AddToWorklist(NN1.getNode()); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); - if (Replace0) + // New replace instances of N0 and N1 + if (Replace0 && N0 && N0.getOpcode() != ISD::DELETED_NODE && NN0 && + NN0.getOpcode() != ISD::DELETED_NODE) { + AddToWorklist(NN0.getNode()); ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); - if (Replace1) + } + + if (Replace1 && N1 && N1.getOpcode() != ISD::DELETED_NODE && NN1 && + NN1.getOpcode() != ISD::DELETED_NODE) { + AddToWorklist(NN1.getNode()); ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); + } - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, NN0, NN1)); + // Deal with Op being deleted. + if (Op && Op.getOpcode() != ISD::DELETED_NODE) + return RV; } return SDValue(); } @@ -1137,26 +1157,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); + DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + bool Replace = false; SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) - N0 = SExtPromoteOperand(Op.getOperand(0), PVT); + N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) - N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); + N0 = ZExtPromoteOperand(N0, PVT); else N0 = PromoteOperand(N0, PVT, Replace); + if (!N0.getNode()) return SDValue(); + SDLoc DL(Op); + SDValue RV = + DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); + AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); - DEBUG(dbgs() << "\nPromoting "; - Op.getNode()->dump(&DAG)); - SDLoc DL(Op); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1))); + // Deal with Op being deleted. 
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE) + return RV; } return SDValue(); } @@ -1361,8 +1387,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); - SDValue OpV = RV; - DAG.ReplaceAllUsesWith(N, &OpV); + DAG.ReplaceAllUsesWith(N, &RV); } // Push the new node and any users onto the worklist @@ -1389,9 +1414,13 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); + case ISD::UADDO: return visitUADDO(N); case ISD::SUBC: return visitSUBC(N); + case ISD::USUBO: return visitUSUBO(N); case ISD::ADDE: return visitADDE(N); + case ISD::ADDCARRY: return visitADDCARRY(N); case ISD::SUBE: return visitSUBE(N); + case ISD::SUBCARRY: return visitSUBCARRY(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); @@ -1415,6 +1444,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); + case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); @@ -1430,6 +1460,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); + case ISD::AssertZext: return visitAssertZext(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); @@ -1574,7 +1605,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } SmallVector<SDNode *, 8> TFs; // List of token factors to visit. - SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. + SmallVector<SDValue, 8> Ops; // Ops for replacing token factor. SmallPtrSet<SDNode*, 16> SeenOps; bool Changed = false; // If we should replace this token factor. @@ -1618,26 +1649,108 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { } } - SDValue Result; + // Remove Nodes that are chained to another node in the list. Do so + // by walking up chains breath-first stopping when we've seen + // another operand. In general we must climb to the EntryNode, but we can exit + // early if we find all remaining work is associated with just one operand as + // no further pruning is possible. + + // List of nodes to search through and original Ops from which they originate. + SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist; + SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op. + SmallPtrSet<SDNode *, 16> SeenChains; + bool DidPruneOps = false; + + unsigned NumLeftToConsider = 0; + for (const SDValue &Op : Ops) { + Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); + OpWorkCount.push_back(1); + } + + auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { + // If this is an Op, we can remove the op from the list. Remark any + // search associated with it as from the current OpNumber. 
+    if (SeenOps.count(Op) != 0) {
+      Changed = true;
+      DidPruneOps = true;
+      unsigned OrigOpNumber = 0;
+      while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
+        OrigOpNumber++;
+      assert((OrigOpNumber != Ops.size()) &&
+             "expected to find TokenFactor Operand");
+      // Re-mark worklist from OrigOpNumber to OpNumber
+      for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
+        if (Worklist[i].second == OrigOpNumber) {
+          Worklist[i].second = OpNumber;
+        }
+      }
+      OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
+      OpWorkCount[OrigOpNumber] = 0;
+      NumLeftToConsider--;
+    }
+    // Add if it's a new chain
+    if (SeenChains.insert(Op).second) {
+      OpWorkCount[OpNumber]++;
+      Worklist.push_back(std::make_pair(Op, OpNumber));
+    }
+  };
+
+  for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
+    // We need to consider at least 2 Ops to prune.
+    if (NumLeftToConsider <= 1)
+      break;
+    auto CurNode = Worklist[i].first;
+    auto CurOpNumber = Worklist[i].second;
+    assert((OpWorkCount[CurOpNumber] > 0) &&
+           "Node should not appear in worklist");
+    switch (CurNode->getOpcode()) {
+    case ISD::EntryToken:
+      // Hitting EntryToken is the only way for the search to terminate
+      // without hitting another operand's search. Prevent us from marking
+      // this operand considered.
+      NumLeftToConsider++;
+      break;
+    case ISD::TokenFactor:
+      for (const SDValue &Op : CurNode->op_values())
+        AddToWorklist(i, Op.getNode(), CurOpNumber);
+      break;
+    case ISD::CopyFromReg:
+    case ISD::CopyToReg:
+      AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
+      break;
+    default:
+      if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
+        AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
+      break;
+    }
+    OpWorkCount[CurOpNumber]--;
+    if (OpWorkCount[CurOpNumber] == 0)
+      NumLeftToConsider--;
+  }
 
   // If we've changed things around then replace token factor.
   if (Changed) {
+    SDValue Result;
     if (Ops.empty()) {
       // The entry token is the only possible outcome.
       Result = DAG.getEntryNode();
     } else {
-      // New and improved token factor.
-      Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+      if (DidPruneOps) {
+        SmallVector<SDValue, 8> PrunedOps;
+        for (const SDValue &Op : Ops) {
+          if (SeenChains.count(Op.getNode()) == 0)
+            PrunedOps.push_back(Op);
+        }
+        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
+      } else {
+        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+      }
     }
-
-    // Add users to worklist if AA is enabled, since it may introduce
-    // a lot of new chained token factors while removing memory deps.
-    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                    : DAG.getSubtarget().useAA();
-    return CombineTo(N, Result, UseAA /*add to worklist*/);
+    return Result;
   }
-
-  return Result;
+  return SDValue();
 }
 
 /// MERGE_VALUES can always be eliminated.
@@ -1664,6 +1777,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
   return Const != nullptr && !Const->isOpaque() ?
Const : nullptr; } +SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { + auto BinOpcode = BO->getOpcode(); + assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || + BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || + BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || + BinOpcode == ISD::UREM || BinOpcode == ISD::AND || + BinOpcode == ISD::OR || BinOpcode == ISD::XOR || + BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || + BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || + BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || + BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && + "Unexpected binary operator"); + + // Bail out if any constants are opaque because we can't constant fold those. + SDValue C1 = BO->getOperand(1); + if (!isConstantOrConstantVector(C1, true) && + !isConstantFPBuildVectorOrConstantFP(C1)) + return SDValue(); + + // Don't do this unless the old select is going away. We want to eliminate the + // binary operator, not replace a binop with a select. + // TODO: Handle ISD::SELECT_CC. + SDValue Sel = BO->getOperand(0); + if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) + return SDValue(); + + SDValue CT = Sel.getOperand(1); + if (!isConstantOrConstantVector(CT, true) && + !isConstantFPBuildVectorOrConstantFP(CT)) + return SDValue(); + + SDValue CF = Sel.getOperand(2); + if (!isConstantOrConstantVector(CF, true) && + !isConstantFPBuildVectorOrConstantFP(CF)) + return SDValue(); + + // We have a select-of-constants followed by a binary operator with a + // constant. Eliminate the binop by pulling the constant math into the select. + // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1 + EVT VT = Sel.getValueType(); + SDLoc DL(Sel); + SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1); + assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) || + isConstantFPBuildVectorOrConstantFP(NewCT)) && + "Failed to constant fold a binop with constant operands"); + + SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1); + assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) || + isConstantFPBuildVectorOrConstantFP(NewCF)) && + "Failed to constant fold a binop with constant operands"); + + return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); +} + SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1702,16 +1869,36 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (isNullConstant(N1)) return N0; - // fold ((c1-A)+c2) -> (c1+c2)-A if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { - if (N0.getOpcode() == ISD::SUB) - if (isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), - N0.getOperand(1)); + // fold ((c1-A)+c2) -> (c1+c2)-A + if (N0.getOpcode() == ISD::SUB && + isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { + // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic. + return DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), + N0.getOperand(1)); + } + + // add (sext i1 X), 1 -> zext (not i1 X) + // We don't transform this pattern: + // add (zext i1 X), -1 -> sext (not i1 X) + // because most (?) targets generate better code for the zext form. 
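A two-value sanity check of the identity just stated, before its implementation continues below: modeling i1 as bool, sext as 0/-1, and zext as 0/1 on the host.

#include <cassert>
#include <cstdint>

int main() {
  for (bool X : {false, true}) {
    int32_t SExt = X ? -1 : 0;              // sext i1 X to i32
    int32_t Lhs = SExt + 1;                 // add (sext i1 X), 1
    int32_t Rhs = static_cast<int32_t>(!X); // zext (not i1 X)
    assert(Lhs == Rhs);
  }
  return 0;
}
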
+ if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && + isOneConstantOrOneSplatConstant(N1)) { + SDValue X = N0.getOperand(0); + if ((!LegalOperations || + (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && + TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && + X.getScalarValueSizeInBits() == 1) { + SDValue Not = DAG.getNOT(DL, X, X.getValueType()); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } + } } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate add if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1)) return RADD; @@ -1774,6 +1961,19 @@ SDValue DAGCombiner::visitADD(SDNode *N) { VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); + if (SDValue Combined = visitADDLike(N0, N1, N)) + return Combined; + + if (SDValue Combined = visitADDLike(N1, N0, N)) + return Combined; + + return SDValue(); +} + +SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { + EVT VT = N0.getValueType(); + SDLoc DL(LocReference); + // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) @@ -1781,12 +1981,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) { DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); - if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB && - isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0))) - return DAG.getNode(ISD::SUB, DL, VT, N1, - DAG.getNode(ISD::SHL, DL, VT, - N0.getOperand(0).getOperand(1), - N0.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); @@ -1797,7 +1991,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && isOneConstantOrOneSplatConstant(N1->getOperand(1))) - return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0); + return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } // add (sext i1), X -> sub X, (zext i1) @@ -1818,6 +2012,11 @@ SDValue DAGCombiner::visitADD(SDNode *N) { } } + // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) + if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) + return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), + N0, N1.getOperand(0), N1.getOperand(2)); + return SDValue(); } @@ -1825,38 +2024,80 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) - return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); if (N0C && !N1C) - return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0); + return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); + DL, MVT::Glue)); - // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. 
- APInt LHSZero, LHSOne; - APInt RHSZero, RHSOne; - DAG.computeKnownBits(N0, LHSZero, LHSOne); + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); + + return SDValue(); +} + +SDValue DAGCombiner::visitUADDO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an ADD. + if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // canonicalize constant to RHS. + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); + + // fold (uaddo x, 0) -> x + no carry out + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + + // If it cannot overflow, transform into an add. + if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) + return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), + DAG.getConstant(0, DL, CarryVT)); + + if (SDValue Combined = visitUADDOLike(N0, N1, N)) + return Combined; - if (LHSZero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSZero, RHSOne); + if (SDValue Combined = visitUADDOLike(N1, N0, N)) + return Combined; + + return SDValue(); +} - // If all possibly-set bits on the LHS are clear on the RHS, return an OR. - // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) - return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, - SDLoc(N), MVT::Glue)); +SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { + // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) + // If Y + 1 cannot overflow. + if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { + SDValue Y = N1.getOperand(0); + SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); + if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, + N1.getOperand(2)); } return SDValue(); @@ -1881,6 +2122,54 @@ SDValue DAGCombiner::visitADDE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitADDCARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + SDLoc DL(N); + + // canonicalize constant to RHS + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); + + // fold (addcarry x, y, false) -> (uaddo x, y) + if (isNullConstant(CarryIn)) + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); + + // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. 
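A reference-semantics check of the (addcarry 0, 0, X) fold just stated, whose DAG implementation follows; the addcarry function below is a host-level model of ISD::ADDCARRY written for this note, not an LLVM API.

#include <cassert>
#include <cstdint>

// Reference semantics of ADDCARRY: Sum = X + Y + CarryIn, with the carry-out
// as a second result. With X == Y == 0, the sum is just the carry bit,
// i.e. (ext CarryIn) & 1, and no carry-out is possible.
static uint32_t addcarry(uint32_t X, uint32_t Y, bool CarryIn, bool &CarryOut) {
  uint64_t Wide = static_cast<uint64_t>(X) + Y + (CarryIn ? 1 : 0);
  CarryOut = (Wide >> 32) != 0;
  return static_cast<uint32_t>(Wide);
}

int main() {
  for (bool C : {false, true}) {
    bool CarryOut;
    uint32_t Sum = addcarry(0, 0, C, CarryOut);
    assert(Sum == (static_cast<uint32_t>(C) & 1u)); // the folded form
    assert(!CarryOut);                              // and no carry
  }
  return 0;
}
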
+ if (isNullConstant(N0) && isNullConstant(N1)) { + EVT VT = N0.getValueType(); + EVT CarryVT = CarryIn.getValueType(); + SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT); + AddToWorklist(CarryExt.getNode()); + return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt, + DAG.getConstant(1, DL, VT)), + DAG.getConstant(0, DL, CarryVT)); + } + + if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) + return Combined; + + if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N)) + return Combined; + + return SDValue(); +} + +SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, + SDNode *N) { + // Iff the flag result is dead: + // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) + if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::UADDO) && + isNullConstant(N1) && !N->hasAnyUseOfValue(1)) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), + N0.getOperand(0), N0.getOperand(1), CarryIn); + + return SDValue(); +} + // Since it may not be valid to emit a fold to zero for vector initializers // check if we can before folding. static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, @@ -1920,6 +2209,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { N1.getNode()); } + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (sub x, c) -> (add x, -c) @@ -1944,13 +2236,13 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } // 0 - X --> 0 if the sub is NUW. - if (N->getFlags()->hasNoUnsignedWrap()) + if (N->getFlags().hasNoUnsignedWrap()) return N0; - if (DAG.MaskedValueIsZero(N1, ~APInt::getSignBit(BitWidth))) { + if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { // N1 is either 0 or the minimum signed value. If the sub is NSW, then // N1 must be 0 because negating the minimum signed value is undefined. - if (N->getFlags()->hasNoSignedWrap()) + if (N->getFlags().hasNoSignedWrap()) return N0; // 0 - X --> X if X is 0 or the minimum signed value. @@ -2066,6 +2358,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitUSUBO(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + EVT CarryVT = N->getValueType(1); + SDLoc DL(N); + + // If the flag result is dead, turn this into an SUB. + if (!N->hasAnyUseOfValue(1)) + return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), + DAG.getUNDEF(CarryVT)); + + // fold (usubo x, x) -> 0 + no borrow + if (N0 == N1) + return CombineTo(N, DAG.getConstant(0, DL, VT), + DAG.getConstant(0, DL, CarryVT)); + + // fold (usubo x, 0) -> x + no borrow + if (isNullConstant(N1)) + return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); + + // Canonicalize (usubo -1, x) -> ~x, i.e. 
(xor x, -1) + no borrow + if (isAllOnesConstant(N0)) + return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), + DAG.getConstant(0, DL, CarryVT)); + + return SDValue(); +} + SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2078,6 +2402,18 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + + // fold (subcarry x, y, false) -> (usubo x, y) + if (isNullConstant(CarryIn)) + return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); + + return SDValue(); +} + SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2131,6 +2467,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // fold (mul x, 1) -> x if (N1IsConst && ConstValue1 == 1 && IsFullSplat) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) { SDLoc DL(N); @@ -2297,6 +2637,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { return combined; } +static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if (DAG.isUndef(N->getOpcode(), {N0, N1})) + return DAG.getUNDEF(VT); + + // undef / X -> 0 + // undef % X -> 0 + if (N0.isUndef()) + return DAG.getConstant(0, DL, VT); + + return SDValue(); +} + SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2319,8 +2676,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) - return DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), N0); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); + + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 @@ -2332,9 +2694,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && - !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() && - (N1C->getAPIntValue().isPowerOf2() || - (-N1C->getAPIntValue()).isPowerOf2())) { + !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() || + (-N1C->getAPIntValue()).isPowerOf2())) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; @@ -2372,7 +2733,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. 
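The (usubo -1, x) -> (xor x, -1) canonicalization added earlier in this hunk rests on the identity that subtracting x from the all-ones value flips every bit and can never borrow. A standalone exhaustive check at 16 bits:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    uint16_t V = static_cast<uint16_t>(X);
    uint16_t AllOnes = 0xFFFF;
    // -1 - x == ~x, and since x <= AllOnes the subtraction never borrows.
    assert(static_cast<uint16_t>(AllOnes - V) == static_cast<uint16_t>(~V));
    assert(AllOnes >= V); // no borrow possible
  }
  return 0;
}
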
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; @@ -2384,13 +2745,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2414,6 +2768,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (udiv x, (1 << c)) -> x >>u c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1)) { @@ -2444,7 +2804,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } // fold (udiv x, c) -> alternate - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; @@ -2456,13 +2816,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem; - // undef / X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X / undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2482,32 +2835,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; + if (SDValue V = simplifyDivRem(N, DAG)) + return V; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (isSigned) { // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { - // fold (urem x, pow2) -> (and x, pow2-1) + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, pow2) -> (and x, pow2-1) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { - APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits()); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT)); + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } } - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. 
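Two identities underpin the remainder folds above: X % C equals X - (X / C) * C, which lets a remainder reuse a cheap divide-by-constant expansion, and for a power of two the remainder is a plain mask of the low bits. A standalone check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 1000; ++X) {
    // X % C == X - (X / C) * C: the remainder falls out of the quotient.
    const uint32_t C = 7;
    assert(X % C == X - (X / C) * C);
    // For a power of two, the remainder is just the low bits.
    const uint32_t Pow2 = 16;
    assert(X % Pow2 == (X & (Pow2 - 1)));
  }
  return 0;
}
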
@@ -2536,13 +2892,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (SDValue DivRem = useDivRem(N)) return DivRem.getValue(1); - // undef % X -> 0 - if (N0.isUndef()) - return DAG.getConstant(0, DL, VT); - // X % undef -> undef - if (N1.isUndef()) - return N1; - return SDValue(); } @@ -2932,95 +3281,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { return SDValue(); } +/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. +SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, + const SDLoc &DL) { + SDValue LL, LR, RL, RR, N0CC, N1CC; + if (!isSetCCEquivalent(N0, LL, LR, N0CC) || + !isSetCCEquivalent(N1, RL, RR, N1CC)) + return SDValue(); + + assert(N0.getValueType() == N1.getValueType() && + "Unexpected operand types for bitwise logic op"); + assert(LL.getValueType() == LR.getValueType() && + RL.getValueType() == RR.getValueType() && + "Unexpected operand types for setcc"); + + // If we're here post-legalization or the logic op type is not i1, the logic + // op type must match a setcc result type. Also, all folds require new + // operations on the left and right operands, so those types must match. + EVT VT = N0.getValueType(); + EVT OpVT = LL.getValueType(); + if (LegalOperations || VT != MVT::i1) + if (VT != getSetCCResultType(OpVT)) + return SDValue(); + if (OpVT != RL.getValueType()) + return SDValue(); + + ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get(); + ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get(); + bool IsInteger = OpVT.isInteger(); + if (LR == RR && CC0 == CC1 && IsInteger) { + bool IsZero = isNullConstantOrNullSplatConstant(LR); + bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); + + // All bits clear? + bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; + // All sign bits clear? + bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; + // Any bits set? + bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; + // Any sign bits set? + bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; + + // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) + // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) + // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) + // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) + if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { + SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(Or.getNode()); + return DAG.getSetCC(DL, VT, Or, LR, CC1); + } + + // All bits set? + bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; + // All sign bits set? + bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; + // Any bits clear? + bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; + // Any sign bits clear? + bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; + + // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) + // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) + // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) + // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) + if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { + SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); + AddToWorklist(And.getNode()); + return DAG.getSetCC(DL, VT, And, LR, CC1); + } + } + + // TODO: What is the 'or' equivalent of this fold? 
+ // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) + if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE && + ((isNullConstant(LR) && isAllOnesConstant(RR)) || + (isAllOnesConstant(LR) && isNullConstant(RR)))) { + SDValue One = DAG.getConstant(1, DL, OpVT); + SDValue Two = DAG.getConstant(2, DL, OpVT); + SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); + AddToWorklist(Add.getNode()); + return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); + } + + // Try more general transforms if the predicates match and the only user of + // the compares is the 'and' or 'or'. + if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && + N0.hasOneUse() && N1.hasOneUse()) { + // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 + // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 + if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { + SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); + SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); + SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); + SDValue Zero = DAG.getConstant(0, DL, OpVT); + return DAG.getSetCC(DL, VT, Or, Zero, CC1); + } + } + + // Canonicalize equivalent operands to LL == RL. + if (LL == RR && LR == RL) { + CC1 = ISD::getSetCCSwappedOperands(CC1); + std::swap(RL, RR); + } + + // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) + if (LL == RL && LR == RR) { + ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) + : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); + if (NewCC != ISD::SETCC_INVALID && + (!LegalOperations || + (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && + TLI.isOperationLegal(ISD::SETCC, OpVT)))) + return DAG.getSetCC(DL, VT, LL, LR, NewCC); + } + + return SDValue(); +} + /// This contains all DAGCombine rules which reduce two values combined by /// an And operation to a single value. This makes them reusable in the context /// of visitSELECT(). Rules involving constants are not included as /// visitSELECT() already handles those cases. 
-SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, - SDNode *LocReference) { +SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); // fold (and x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) - return DAG.getConstant(0, SDLoc(LocReference), VT); - // fold (and (setcc x), (setcc y)) -> (setcc (and x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 && - LL.getValueType().isInteger()) { - // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0) - if (isNullConstant(LR) && Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - if (isAllOnesConstant(LR)) { - // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1) - if (Op1 == ISD::SETEQ) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1) - if (Op1 == ISD::SETGT) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - } - } - // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2) - if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) && - Op0 == Op1 && LL.getValueType().isInteger() && - Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || - (isAllOnesConstant(LR) && isNullConstant(RR)))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDLoc DL(N0); - SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(), - LL, DAG.getConstant(1, DL, - LL.getValueType())); - AddToWorklist(ADDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode, - DAG.getConstant(2, DL, LL.getValueType()), - ISD::SETUGE); - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + return DAG.getConstant(0, DL, VT); + + if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) + return V; if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && 
VT.getSizeInBits() <= 64) { @@ -3037,13 +3430,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { - SDLoc DL(N0); + SDLoc DL0(N0); SDValue NewAdd = - DAG.getNode(ISD::ADD, DL, VT, + DAG.getNode(ISD::ADD, DL0, VT, N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); CombineTo(N0.getNode(), NewAdd); // Return N so it doesn't get rechecked! - return SDValue(LocReference, 0); + return SDValue(N, 0); } } } @@ -3068,7 +3461,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, unsigned MaskBits = AndMask.countTrailingOnes(); EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); - if (APIntOps::isMask(AndMask) && + if (AndMask.isMask() && // Required bits must not span the two halves of the integer and // must fit in the half size type. (ShiftBits + MaskBits <= Size / 2) && @@ -3108,7 +3501,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, bool &NarrowLoad) { uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); - if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) + if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits)) return false; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); @@ -3191,6 +3584,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; @@ -3299,6 +3696,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); + + // Fold the AND away. NewLoad may get replaced immediately. + CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); + if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), @@ -3316,10 +3717,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - // Fold the AND away, taking care not to fold to the old load node if we - // replaced it. - CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); - return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -3412,7 +3809,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. 
- if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) @@ -3473,7 +3870,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) return SDValue(); - if (!TLI.isOperationLegal(ISD::BSWAP, VT)) + if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00) @@ -3585,27 +3982,36 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + SDValue N0 = N.getOperand(0); + unsigned Opc0 = N0.getOpcode(); + if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) + return false; + + ConstantSDNode *N1C = nullptr; + // SHL or SRL: look upstream for AND mask operand + if (Opc == ISD::AND) + N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + else if (Opc0 == ISD::AND) + N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N1C) return false; - unsigned Num; + unsigned MaskByteOffset; switch (N1C->getZExtValue()) { default: return false; - case 0xFF: Num = 0; break; - case 0xFF00: Num = 1; break; - case 0xFF0000: Num = 2; break; - case 0xFF000000: Num = 3; break; + case 0xFF: MaskByteOffset = 0; break; + case 0xFF00: MaskByteOffset = 1; break; + case 0xFF0000: MaskByteOffset = 2; break; + case 0xFF000000: MaskByteOffset = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). - SDValue N0 = N.getOperand(0); if (Opc == ISD::AND) { - if (Num == 0 || Num == 2) { + if (MaskByteOffset == 0 || MaskByteOffset == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 - if (N0.getOpcode() != ISD::SRL) + if (Opc0 != ISD::SRL) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3613,7 +4019,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 - if (N0.getOpcode() != ISD::SHL) + if (Opc0 != ISD::SHL) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3622,7 +4028,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 - if (Num != 0 && Num != 2) + if (MaskByteOffset != 0 && MaskByteOffset != 2) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3630,17 +4036,17 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 - if (Num != 1 && Num != 3) + if (MaskByteOffset != 1 && MaskByteOffset != 3) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } - if (Parts[Num]) + if (Parts[MaskByteOffset]) return false; - Parts[Num] = N0.getOperand(0).getNode(); + Parts[MaskByteOffset] = N0.getOperand(0).getNode(); return true; } @@ -3657,7 +4063,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); - if (!TLI.isOperationLegal(ISD::BSWAP, VT)) + if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Look for either 
@@ -3672,18 +4078,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) - SDValue N000 = N00.getOperand(0); - if (!isBSwapHWordElement(N000, Parts)) + if (!isBSwapHWordElement(N00, Parts)) return SDValue(); - SDValue N001 = N00.getOperand(1); - if (!isBSwapHWordElement(N001, Parts)) + if (!isBSwapHWordElement(N01, Parts)) return SDValue(); - SDValue N010 = N01.getOperand(0); - if (!isBSwapHWordElement(N010, Parts)) + SDValue N10 = N1.getOperand(0); + if (!isBSwapHWordElement(N10, Parts)) return SDValue(); - SDValue N011 = N01.getOperand(1); - if (!isBSwapHWordElement(N011, Parts)) + SDValue N11 = N1.getOperand(1); + if (!isBSwapHWordElement(N11, Parts)) return SDValue(); } else { // (or (or (or (and), (and)), (and)), (and)) @@ -3723,65 +4127,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { /// This contains all DAGCombine rules which reduce two values combined by /// an Or operation to a single value \see visitANDLike(). -SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { +SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); + SDLoc DL(N); + // fold (or x, undef) -> -1 - if (!LegalOperations && - (N0.isUndef() || N1.isUndef())) { - EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT; - return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), - SDLoc(LocReference), VT); - } - // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) - SDValue LL, LR, RL, RR, CC0, CC1; - if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){ - ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get(); - ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get(); - - if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) { - // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0) - // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0) - if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ORNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1); - } - } - // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1) - // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1) - if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) { - EVT CCVT = getSetCCResultType(LR.getValueType()); - if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) { - SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR), - LR.getValueType(), LL, RL); - AddToWorklist(ANDNode.getNode()); - return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1); - } - } - } - // canonicalize equivalent to ll == rl - if (LL == RR && LR == RL) { - Op1 = ISD::getSetCCSwappedOperands(Op1); - std::swap(RL, RR); - } - if (LL == RL && LR == RR) { - bool isInteger = LL.getValueType().isInteger(); - ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); - if (Result != ISD::SETCC_INVALID && - (!LegalOperations || - (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { - EVT CCVT = getSetCCResultType(LL.getValueType()); - if (N0.getValueType() == CCVT || - (!LegalOperations && 
N0.getValueType() == MVT::i1)) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); - } - } - } + if (!LegalOperations && (N0.isUndef() || N1.isUndef())) + return DAG.getAllOnesConstant(DL, VT); + + if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) + return V; // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && @@ -3802,7 +4157,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); - SDLoc DL(LocReference); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(LHSMask | RHSMask, DL, VT)); } @@ -3818,7 +4172,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); - return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X); + return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); } return SDValue(); @@ -3847,14 +4201,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 - return DAG.getConstant( - APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N), - N0.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 - return DAG.getConstant( - APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N), - N1.getValueType()); + return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) // Do this only if the resulting shuffle is legal. @@ -3867,7 +4217,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); // Ensure both shuffles have a zero input. - if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) { + if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0); @@ -3939,6 +4289,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1 if (isAllOnesConstant(N1)) return N1; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; @@ -3955,20 +4309,22 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // reassociate or if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; + // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) - // iff (c1 & c2) == 0. - if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && - isa<ConstantSDNode>(N0.getOperand(1))) { - ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1)); - if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { - if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, - N1C, C1)) - return DAG.getNode( - ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); - return SDValue(); + // iff (c1 & c2) != 0. 
+  if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) {
+    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) {
+        if (SDValue COR =
+                DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1))
+          return DAG.getNode(
+              ISD::AND, SDLoc(N), VT,
+              DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
+        return SDValue();
+      }
+    }
+  }
+
   // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
   if (N0.getOpcode() == N1.getOpcode())
     if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
@@ -3978,9 +4334,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
 
+  if (SDValue Load = MatchLoadCombine(N))
+    return Load;
+
   // Simplify the operands using demanded-bits information.
-  if (!VT.isVector() &&
-      SimplifyDemandedBits(SDValue(N, 0)))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   return SDValue();
@@ -4190,8 +4548,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
 
   // If there is an AND of either shifted operand, apply it to the result.
   if (LHSMask.getNode() || RHSMask.getNode()) {
-    APInt AllBits = APInt::getAllOnesValue(EltSizeInBits);
-    SDValue Mask = DAG.getConstant(AllBits, DL, VT);
+    SDValue Mask = DAG.getAllOnesConstant(DL, VT);
 
     if (LHSMask.getNode()) {
       APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
@@ -4349,6 +4706,299 @@ struct BaseIndexOffset {
 };
 } // namespace
 
+namespace {
+/// Represents known origin of an individual byte in load combine pattern. The
+/// value of the byte is either constant zero or comes from memory.
+struct ByteProvider {
+  // For constant zero providers Load is set to nullptr. For memory providers
+  // Load represents the node which loads the byte from memory.
+  // ByteOffset is the offset of the byte in the value produced by the load.
+  LoadSDNode *Load;
+  unsigned ByteOffset;
+
+  ByteProvider() : Load(nullptr), ByteOffset(0) {}
+
+  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
+    return ByteProvider(Load, ByteOffset);
+  }
+  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
+
+  bool isConstantZero() const { return !Load; }
+  bool isMemory() const { return Load; }
+
+  bool operator==(const ByteProvider &Other) const {
+    return Other.Load == Load && Other.ByteOffset == ByteOffset;
+  }
+
+private:
+  ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
+      : Load(Load), ByteOffset(ByteOffset) {}
+};
+
+/// Recursively traverses the expression, calculating the origin of the
+/// requested byte of the given value. Returns None if the provider can't be
+/// calculated.
+///
+/// For every value except the root of the expression, verifies that the value
+/// has exactly one use; if not, returns None. This way, when a byte's origin
+/// is returned, it is guaranteed that the values which contribute to the byte
+/// are not used outside of this expression.
+///
+/// Because the parts of the expression are not allowed to have more than one
+/// use, this function iterates over trees, not DAGs, and so never visits the
+/// same node more than once.
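+///
+/// For example (a sketch, assuming an i32 assembled from i8 loads on a
+/// little-endian target), asking for byte 1 of
+///   (or (zext i8 (load p) to i32), (shl (zext i8 (load p+1) to i32), 8))
+/// resolves to constant zero on the first zext side (the narrow value has only
+/// one byte), recurses through the SHL with the index reduced by the one-byte
+/// shift, and so returns byte 0 of the load from p+1.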
+const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
+                                                   unsigned Depth,
+                                                   bool Root = false) {
+  // A typical i64-by-i8 pattern requires recursion up to a depth of 8 calls.
+  if (Depth == 10)
+    return None;
+
+  if (!Root && !Op.hasOneUse())
+    return None;
+
+  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
+  unsigned BitWidth = Op.getValueSizeInBits();
+  if (BitWidth % 8 != 0)
+    return None;
+  unsigned ByteWidth = BitWidth / 8;
+  assert(Index < ByteWidth && "invalid index requested");
+  (void) ByteWidth;
+
+  switch (Op.getOpcode()) {
+  case ISD::OR: {
+    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
+    if (!LHS)
+      return None;
+    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
+    if (!RHS)
+      return None;
+
+    if (LHS->isConstantZero())
+      return RHS;
+    if (RHS->isConstantZero())
+      return LHS;
+    return None;
+  }
+  case ISD::SHL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return None;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8 != 0)
+      return None;
+    uint64_t ByteShift = BitShift / 8;
+
+    return Index < ByteShift
+               ? ByteProvider::getConstantZero()
+               : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
+                                       Depth + 1);
+  }
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: {
+    SDValue NarrowOp = Op->getOperand(0);
+    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return None;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    if (Index >= NarrowByteWidth)
+      return Op.getOpcode() == ISD::ZERO_EXTEND
+                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+                 : None;
+    return calculateByteProvider(NarrowOp, Index, Depth + 1);
+  }
+  case ISD::BSWAP:
+    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
+                                 Depth + 1);
+  case ISD::LOAD: {
+    auto L = cast<LoadSDNode>(Op.getNode());
+    if (L->isVolatile() || L->isIndexed())
+      return None;
+
+    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return None;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    if (Index >= NarrowByteWidth)
+      return L->getExtensionType() == ISD::ZEXTLOAD
+                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+                 : None;
+    return ByteProvider::getMemory(L, Index);
+  }
+  }
+
+  return None;
+}
+} // namespace
+
+/// Match a pattern where a wide type scalar value is loaded by several narrow
+/// loads and combined by shifts and ors. Fold it into a single load or a load
+/// and a BSWAP if the target supports it.
+///
+/// Assuming a little-endian target:
+///  i8 *a = ...
+///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+/// =>
+///  i32 val = *((i32)a)
+///
+///  i8 *a = ...
+///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+/// =>
+///  i32 val = BSWAP(*((i32)a))
+///
+/// TODO: This rule matches complex patterns with OR node roots and doesn't
+/// interact well with the worklist mechanism. When a part of the pattern is
+/// updated (e.g. one of the loads), its direct users are put into the
+/// worklist, but the root node of the pattern which triggers the load combine
+/// is not necessarily a direct user of the changed node.
+/// For example, once the address of the t28 load below is reassociated, the
+/// load combine won't be triggered:
+///   t25: i32 = add t4, Constant:i32<2>
+///   t26: i64 = sign_extend t25
+///   t27: i64 = add t2, t26
+///   t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
+///   t29: i32 = zero_extend t28
+///   t32: i32 = shl t29, Constant:i8<8>
+///   t33: i32 = or t23, t32
+/// As a possible fix, visitLoad could check whether the load can be a part of
+/// a load combine pattern and add the corresponding OR roots to the worklist.
+SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
+  assert(N->getOpcode() == ISD::OR &&
+         "Can only match load combining against OR nodes");
+
+  // Handles simple types only.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+  unsigned ByteWidth = VT.getSizeInBits() / 8;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Before legalization we can introduce overly wide illegal loads, which are
+  // later split into legal-sized loads. This lets us combine an i64-by-i8
+  // pattern into a couple of i32 loads on 32-bit targets.
+  if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
+    return SDValue();
+
+  std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = [](
+    unsigned BW, unsigned i) { return i; };
+  std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
+    unsigned BW, unsigned i) { return BW - i - 1; };
+
+  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
+  auto MemoryByteOffset = [&] (ByteProvider P) {
+    assert(P.isMemory() && "Must be a memory byte provider");
+    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+    assert(LoadBitWidth % 8 == 0 &&
+           "can only analyze providers for individual bytes, not bits");
+    unsigned LoadByteWidth = LoadBitWidth / 8;
+    return IsBigEndianTarget
+          ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
+          : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
+  };
+
+  Optional<BaseIndexOffset> Base;
+  SDValue Chain;
+
+  SmallSet<LoadSDNode *, 8> Loads;
+  Optional<ByteProvider> FirstByteProvider;
+  int64_t FirstOffset = INT64_MAX;
+
+  // Check if all the bytes of the OR we are looking at are loaded from the
+  // same base address. Collect byte offsets from the base address in
+  // ByteOffsets.
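+  // For instance (illustrative only): for an i32 OR tree over four i8 loads
+  // from p, p+1, p+2 and p+3, the offsets collected below are {0, 1, 2, 3}
+  // when the value is assembled little-endian and {3, 2, 1, 0} when it is
+  // assembled big-endian; the comparison against LittleEndianByteAt and
+  // BigEndianByteAt afterwards tells the two cases apart.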
+  SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
+  for (unsigned i = 0; i < ByteWidth; i++) {
+    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
+    if (!P || !P->isMemory()) // All the bytes must be loaded from memory.
+      return SDValue();
+
+    LoadSDNode *L = P->Load;
+    assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+           "Must be enforced by calculateByteProvider");
+    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
+
+    // All loads must share the same chain.
+    SDValue LChain = L->getChain();
+    if (!Chain)
+      Chain = LChain;
+    else if (Chain != LChain)
+      return SDValue();
+
+    // Loads must share the same base address.
+    BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+    if (!Base)
+      Base = Ptr;
+    else if (!Base->equalBaseIndex(Ptr))
+      return SDValue();
+
+    // Calculate the offset of the current byte from the base address.
+    int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P);
+    ByteOffsets[i] = ByteOffsetFromBase;
+
+    // Remember the first byte load.
+    if (ByteOffsetFromBase < FirstOffset) {
+      FirstByteProvider = P;
+      FirstOffset = ByteOffsetFromBase;
+    }
+
+    Loads.insert(L);
+  }
+  assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
+         "memory, so there must be at least one load which produces the value");
+  assert(Base && "Base address of the accessed memory location must be set");
+  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+
+  // Check if the bytes of the OR we are looking at match either a big or a
+  // little endian value load.
+  bool BigEndian = true, LittleEndian = true;
+  for (unsigned i = 0; i < ByteWidth; i++) {
+    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
+    BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
+    if (!BigEndian && !LittleEndian)
+      return SDValue();
+  }
+  assert((BigEndian != LittleEndian) && "must be either big or little endian");
+  assert(FirstByteProvider && "must be set");
+
+  // Ensure that the first byte is loaded from offset zero of the first load,
+  // so the combined value can be loaded from the first load's address.
+  if (MemoryByteOffset(*FirstByteProvider) != 0)
+    return SDValue();
+  LoadSDNode *FirstLoad = FirstByteProvider->Load;
+
+  // The node we are looking at matches the pattern; check if we can replace
+  // it with a single load and a bswap if needed.
+
+  // If the load needs a byte swap, check whether the target supports it.
+  bool NeedsBswap = IsBigEndianTarget != BigEndian;
+
+  // Before legalization we can introduce illegal bswaps, which are later
+  // converted to an explicit bswap sequence. This way we end up with a single
+  // load and byte shuffling instead of several loads and byte shuffling.
+  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  // Check that a load of the wide type is both allowed and fast on the target.
+  bool Fast = false;
+  bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                        VT, FirstLoad->getAddressSpace(),
+                                        FirstLoad->getAlignment(), &Fast);
+  if (!Allowed || !Fast)
+    return SDValue();
+
+  SDValue NewLoad =
+      DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
+                  FirstLoad->getPointerInfo(), FirstLoad->getAlignment());
+
+  // Transfer chain users from old loads to the new load.
+  for (LoadSDNode *L : Loads)
+    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+
+  return NeedsBswap ?
DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; +} + SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4386,6 +5036,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // reassociate xor if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1)) return RXOR; @@ -4403,9 +5057,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: - return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC); + return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: - return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2), + return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } @@ -4470,6 +5124,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { N01C->getAPIntValue(), DL, VT)); } } + + // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) && + TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { + if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) + if (C->getAPIntValue() == (OpSizeInBits - 1)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0)); + } + // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); @@ -4505,8 +5170,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return Tmp; // Simplify the expression using non-local knowledge. - if (!VT.isVector() && - SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); @@ -4662,7 +5326,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -4673,6 +5337,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl undef, x) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -4763,7 +5431,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && - cast<BinaryWithFlagsSDNode>(N0)->Flags.hasExact()) { + N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t C1 = N0C1->getZExtValue(); uint64_t C2 = N1C->getZExtValue(); @@ -4788,12 +5456,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; if (c2 > c1) { - Mask = Mask.shl(c2 - c1); + Mask <<= c2 - c1; SDLoc DL(N); Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(c2 - c1, DL, N1.getValueType())); } else { - Mask = Mask.lshr(c1 - c2); + Mask.lshrInPlace(c1 - c2); SDLoc DL(N); Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), 
DAG.getConstant(c1 - c2, DL, N1.getValueType())); @@ -4808,9 +5476,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && isConstantOrConstantVector(N1, /* No Opaques */ true)) { - unsigned BitSize = VT.getScalarSizeInBits(); SDLoc DL(N); - SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT); + SDValue AllBits = DAG.getAllOnesConstant(DL, VT); SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); } @@ -4851,6 +5518,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { unsigned OpSizeInBits = VT.getScalarSizeInBits(); // Arithmetic shifting an all-sign-bit value is a no-op. + // fold (sra 0, x) -> 0 + // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; @@ -4865,18 +5534,16 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); - // fold (sra 0, x) -> 0 - if (isNullConstant(N0)) - return N0; - // fold (sra -1, x) -> -1 - if (isAllOnesConstant(N0)) - return N0; // fold (sra x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { @@ -5016,7 +5683,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -5024,6 +5691,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) @@ -5049,24 +5720,24 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && - N0.getOperand(0).getOpcode() == ISD::SRL && - isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) { - uint64_t c1 = - cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - EVT InnerShiftVT = N0.getOperand(0).getValueType(); - EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType(); - uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); - // This is only valid if the OpSizeInBits + c1 = size of inner shift. 
-    if (c1 + OpSizeInBits == InnerShiftSize) {
-      SDLoc DL(N0);
-      if (c1 + c2 >= InnerShiftSize)
-        return DAG.getConstant(0, DL, VT);
-      return DAG.getNode(ISD::TRUNCATE, DL, VT,
-                         DAG.getNode(ISD::SRL, DL, InnerShiftVT,
-                                     N0.getOperand(0)->getOperand(0),
-                                     DAG.getConstant(c1 + c2, DL,
-                                                     ShiftCountVT)));
+      N0.getOperand(0).getOpcode() == ISD::SRL) {
+    if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
+      uint64_t c1 = N001C->getZExtValue();
+      uint64_t c2 = N1C->getZExtValue();
+      EVT InnerShiftVT = N0.getOperand(0).getValueType();
+      EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType();
+      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
+      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
+      if (c1 + OpSizeInBits == InnerShiftSize) {
+        SDLoc DL(N0);
+        if (c1 + c2 >= InnerShiftSize)
+          return DAG.getConstant(0, DL, VT);
+        return DAG.getNode(ISD::TRUNCATE, DL, VT,
+                           DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+                                       N0.getOperand(0).getOperand(0),
+                                       DAG.getConstant(c1 + c2, DL,
+                                                       ShiftCountVT)));
+      }
     }
   }
 
@@ -5074,9 +5745,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
     SDLoc DL(N);
-    APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
     SDValue Mask =
-        DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1);
+        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
     AddToWorklist(Mask.getNode());
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
   }
@@ -5097,7 +5767,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
                                        DAG.getConstant(ShiftAmt, DL0,
                                                     getShiftAmountTy(SmallVT)));
       AddToWorklist(SmallShift.getNode());
-      APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt);
+      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
       SDLoc DL(N);
       return DAG.getNode(ISD::AND, DL, VT,
                          DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
@@ -5115,20 +5785,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
   if (N1C && N0.getOpcode() == ISD::CTLZ &&
       N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
-    APInt KnownZero, KnownOne;
-    DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne);
+    KnownBits Known;
+    DAG.computeKnownBits(N0.getOperand(0), Known);
 
     // If any of the input bits are KnownOne, then the input couldn't be all
     // zeros, thus the result of the srl will always be zero.
-    if (KnownOne.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
+    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
 
     // If all of the bits input to the ctlz node are known to be zero, then
    // the result of the ctlz is "32" and the result of the shift is one.
-    APInt UnknownBits = ~KnownZero;
+    APInt UnknownBits = ~Known.Zero;
     if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
 
     // Otherwise, check to see if there is exactly one bit input to the ctlz.
-    if ((UnknownBits & (UnknownBits - 1)) == 0) {
+    if (UnknownBits.isPowerOf2()) {
       // Okay, we know that only the single bit specified by UnknownBits
       // could be set on input to the CTLZ node. If this bit is set, the SRL
      // will return 0; if it is clear, it returns 1.
Change the CTLZ/SRL pair @@ -5202,6 +5872,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitABS(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (abs c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); + // fold (abs (abs x)) -> (abs x) + if (N0.getOpcode() == ISD::ABS) + return N0; + // fold (abs x) -> x iff not-negative + if (DAG.SignBitIsZero(N0)) + return N0; + return SDValue(); +} + SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -5217,7 +5903,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + // fold (bitreverse c1) -> c2 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) + return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); // fold (bitreverse (bitreverse x)) -> x if (N0.getOpcode() == ISD::BITREVERSE) return N0.getOperand(0); @@ -5311,7 +6001,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, } } -// TODO: We should handle other cases of selecting between {-1,0,1} here. SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -5320,6 +6009,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { EVT CondVT = Cond.getValueType(); SDLoc DL(N); + if (!VT.isInteger()) + return SDValue(); + + auto *C1 = dyn_cast<ConstantSDNode>(N1); + auto *C2 = dyn_cast<ConstantSDNode>(N2); + if (!C1 || !C2) + return SDValue(); + + // Only do this before legalization to avoid conflicting with target-specific + // transforms in the other direction (create a select from a zext/sext). There + // is also a target-independent combine here in DAGCombiner in the other + // direction for (select Cond, -1, 0) when the condition is not i1. + if (CondVT == MVT::i1 && !LegalOperations) { + if (C1->isNullValue() && C2->isOne()) { + // select Cond, 0, 1 --> zext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isNullValue() && C2->isAllOnesValue()) { + // select Cond, 0, -1 --> sext (!Cond) + SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); + if (VT != MVT::i1) + NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); + return NotCond; + } + if (C1->isOne() && C2->isNullValue()) { + // select Cond, 1, 0 --> zext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return Cond; + } + if (C1->isAllOnesValue() && C2->isNullValue()) { + // select Cond, -1, 0 --> sext (Cond) + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return Cond; + } + + // For any constants that differ by 1, we can transform the select into an + // extend and add. Use a target hook because some targets may prefer to + // transform in the other direction. 
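+    // E.g. (with illustrative constants) select i1 %c, i32 5, i32 4 becomes
+    // add (zext %c to i32), 4: the zext contributes 1 on the true edge and 0
+    // on the false edge. The sext variant below covers C1 + 1 == C2 the same
+    // way, contributing -1 instead of 1.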
+ if (TLI.convertSelectOfConstantsToMath()) { + if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { + // select Cond, C1, C1-1 --> add (zext Cond), C1-1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { + // select Cond, C1, C1+1 --> add (sext Cond), C1+1 + if (VT != MVT::i1) + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); + } + } + + return SDValue(); + } + // fold (select Cond, 0, 1) -> (xor Cond, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we @@ -5329,15 +6079,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. - if (VT.isInteger() && - (CondVT == MVT::i1 || (CondVT.isInteger() && - TLI.getBooleanContents(false, true) == - TargetLowering::ZeroOrOneBooleanContent && - TLI.getBooleanContents(false, false) == - TargetLowering::ZeroOrOneBooleanContent)) && - isNullConstant(N1) && isOneConstant(N2)) { - SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, - DAG.getConstant(1, DL, CondVT)); + if (CondVT.isInteger() && + TLI.getBooleanContents(false, true) == + TargetLowering::ZeroOrOneBooleanContent && + TLI.getBooleanContents(false, false) == + TargetLowering::ZeroOrOneBooleanContent && + C1->isNullValue() && C2->isOne()) { + SDValue NotCond = + DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); if (VT.bitsEq(CondVT)) return NotCond; return DAG.getZExtOrTrunc(NotCond, DL, VT); @@ -5847,7 +6596,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { ISD::NON_EXTLOAD, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, - MLD->isExpandingLoad()); + MLD->isExpandingLoad()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -5908,6 +6657,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (isAbs) { EVT VT = LHS.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) + return DAG.getNode(ISD::ABS, DL, VT, LHS); + SDValue Shift = DAG.getNode( ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); @@ -5921,34 +6673,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. - // If the VSELECT result requires splitting and the mask is provided by a - // SETCC, then split both nodes and its operands before legalization. This - // prevents the type legalizer from unrolling SETCC into scalar comparisons - // and enables future optimizations (e.g. min/max pattern matching on X86). - if (N0.getOpcode() == ISD::SETCC) { - EVT VT = N->getValueType(0); - - // Check if any splitting is required. - if (TLI.getTypeAction(*DAG.getContext(), VT) != - TargetLowering::TypeSplitVector) - return SDValue(); - - SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH; - std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG); - std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1); - std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2); - - Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL); - Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH); - - // Add the new VSELECT nodes to the work list in case they need to be split - // again. 
- AddToWorklist(Lo.getNode()); - AddToWorklist(Hi.getNode()); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } - // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; @@ -6258,6 +6982,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); + // Simplify TF. + AddToWorklist(NewChain.getNode()); + CombineTo(N, NewValue); // Replace uses of the original load (before extension) @@ -6270,9 +6997,55 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } +/// If we're narrowing or widening the result of a vector select and the final +/// size is the same size as a setcc (compare) feeding the select, then try to +/// apply the cast operation to the select's operands because matching vector +/// sizes for a select condition and other operands should be more efficient. +SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { + unsigned CastOpcode = Cast->getOpcode(); + assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || + CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || + CastOpcode == ISD::FP_ROUND) && + "Unexpected opcode for vector select narrowing/widening"); + + // We only do this transform before legal ops because the pattern may be + // obfuscated by target-specific operations after legalization. Do not create + // an illegal select op, however, because that may be difficult to lower. + EVT VT = Cast->getValueType(0); + if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + SDValue VSel = Cast->getOperand(0); + if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || + VSel.getOperand(0).getOpcode() != ISD::SETCC) + return SDValue(); + + // Does the setcc have the same vector size as the casted select? + SDValue SetCC = VSel.getOperand(0); + EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); + if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + + // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) + SDValue A = VSel.getOperand(1); + SDValue B = VSel.getOperand(2); + SDValue CastA, CastB; + SDLoc DL(Cast); + if (CastOpcode == ISD::FP_ROUND) { + // FP_ROUND (fptrunc) has an extra flag operand to pass along. 
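+    // (The flag is FP_ROUND's "truncation is exact" operand: 1 means the
+    // round is known not to change the value, 0 means a normal rounding step.
+    // Forwarding it keeps both arms' rounding behavior identical to the
+    // original fptrunc of the select result.)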
+ CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); + CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); + } else { + CastA = DAG.getNode(CastOpcode, DL, VT, A); + CastB = DAG.getNode(CastOpcode, DL, VT, B); + } + return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); +} + SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) @@ -6281,8 +7054,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, - N0.getOperand(0)); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) @@ -6314,12 +7086,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). @@ -6329,7 +7101,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op, + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(N0.getValueType())); } } @@ -6349,16 +7121,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::SIGN_EXTEND); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} } @@ -6376,8 +7146,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); @@ -6411,7 +7180,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { LN0->getMemOperand()); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); - SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, @@ -6419,24 +7187,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { N0.getOperand(0).getValueType(), ExtLoad); CombineTo(N, And); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::SIGN_EXTEND); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } if (N0.getOpcode() == ISD::SETCC) { - EVT N0VT = N0.getOperand(0).getValueType(); + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + EVT N00VT = N0.getOperand(0).getValueType(); + // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && - TLI.getBooleanContents(N0VT) == + TLI.getBooleanContents(N00VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. - EVT SVT = getSetCCResultType(N0VT); + EVT SVT = getSetCCResultType(N00VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result @@ -6444,19 +7215,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == SVT.getSizeInBits()) - return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSetCC(DL, VT, N00, N01, CC); // If the desired elements are smaller or larger than the source - // elements we can use a matching integer vector type and then - // truncate/sign extend - EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); - if (SVT == MatchingVectorType) { - SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, - N0.getOperand(0), N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); + // elements, we can use a matching integer vector type and then + // truncate/sign extend. + EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); + if (SVT == MatchingVecType) { + SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); + return DAG.getSExtOrTrunc(VsetCC, DL, VT); } } @@ -6465,36 +7232,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // getBooleanContents(). 
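  // For example (an illustrative case): an i8 setcc under
  // ZeroOrNegativeOneBooleanContent yields i8 -1 for "true", so the select
  // built below must use the target's real "true" constant rather than
  // assuming 1.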
unsigned SetCCWidth = N0.getScalarValueSizeInBits(); - SDLoc DL(N); // To determine the "true" side of the select, we need to know the high bit // of the value returned by the setcc if it evaluates to true. // If the type of the setcc is i1, then the true case of the select is just // sext(i1 1), that is, -1. // If the type of the setcc is larger (say, i8) then the value of the high - // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // bit depends on getBooleanContents(), so ask TLI for a real "true" value // of the appropriate width. - SDValue ExtTrueVal = - (SetCCWidth == 1) - ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), - DL, VT) - : TLI.getConstTrueVal(DAG, VT, DL); - - if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, - DAG.getConstant(0, DL, VT), - cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) + SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + SDValue Zero = DAG.getConstant(0, DL, VT); + if (SDValue SCC = + SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; if (!VT.isVector()) { - EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { - SDLoc DL(N); - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = - DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, - DAG.getConstant(0, DL, VT)); + EVT SetCCVT = getSetCCResultType(N00VT); + // Don't do this transform for i1 because there's a select transform + // that would reverse it. + // TODO: We should not do this transform at all without a target hook + // because a sext is likely cheaper than a select? + if (SetCCVT.getScalarSizeInBits() != 1 && + (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { + SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); } } } @@ -6502,21 +7263,23 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext x) -> (zext x) if the sign bit is known zero. if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); + + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; return SDValue(); } // isTruncateOf - If N is a truncate of some other value, return true, record -// the value being truncated in Op and which of Op's bits are zero in KnownZero. -// This function computes KnownZero to avoid a duplicated call to +// the value being truncated in Op and which of Op's bits are zero/one in Known. +// This function computes KnownBits to avoid a duplicated call to // computeKnownBits in the caller. 
static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, - APInt &KnownZero) { - APInt KnownOne; + KnownBits &Known) { if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); return true; } @@ -6535,9 +7298,9 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, else return false; - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); - if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue()) + if (!(Known.Zero | 1).isAllOnesValue()) return false; return true; @@ -6562,8 +7325,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. SDValue Op; - APInt KnownZero; - if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) { + KnownBits Known; + if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) { APInt TruncatedBits = (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? APInt(Op.getValueSizeInBits(), 0) : @@ -6571,14 +7334,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); - if (TruncatedBits == (KnownZero & TruncatedBits)) { - if (VT.bitsGT(Op.getValueType())) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op); - if (VT.bitsLT(Op.getValueType())) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - - return Op; - } + if (TruncatedBits.isSubsetOf(Known.Zero)) + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -6625,14 +7382,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { - SDValue Op = N0.getOperand(0); - if (SrcVT.bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (SrcVT.bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } + SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + AddToWorklist(Op.getNode()); return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } } @@ -6646,11 +7397,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); - } + X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); @@ -6677,13 +7424,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); + CombineTo(N, ExtLoad); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} } @@ -6837,6 +7585,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { ShAmt); } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -6871,14 +7622,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { } // fold (aext (truncate x)) - if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue TruncOp = N0.getOperand(0); - if (TruncOp.getValueType() == VT) - return TruncOp; // x iff x size == zext size. - if (TruncOp.getValueType().bitsGT(VT)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp); - return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp); - } + if (N0.getOpcode() == ISD::TRUNCATE) + return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -6889,11 +7634,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - } + X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, @@ -6991,9 +7732,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitAssertZext(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT EVT = cast<VTSDNode>(N1)->getVT(); + + // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt) + if (N0.getOpcode() == ISD::AssertZext && + EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) + return N0; + + return SDValue(); +} + /// See if the specified operand can be simplified with the knowledge that only /// the bits specified by Mask are used. If so, return the simpler operand, /// otherwise return a null SDValue. +/// +/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can +/// simplify nodes with multiple uses more aggressively.) SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; @@ -7029,6 +7786,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } + break; + case ISD::AND: { + // X & -1 -> X (ignoring bits which aren't demanded). 
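+    // E.g. with an illustrative demanded mask of 0xFF and an AND constant of
+    // 0x1FF, every demanded bit survives the AND, so the AND can be bypassed.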
+ ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1)); + if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask) + return V.getOperand(0); + break; + } } return SDValue(); } @@ -7169,7 +7934,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, DL, PtrType), - &Flags); + Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -7244,6 +8009,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } + // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x) + if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && + N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { + if (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) + return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); + } + // fold (sext_in_reg (zext x)) -> (sext x) // iff we are extending the source sign bit. if (N0.getOpcode() == ISD::ZERO_EXTEND) { @@ -7254,7 +8029,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. - if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits))) + if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); // fold operands of sext_in_reg based on knowledge that the top bits are not @@ -7496,6 +8271,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { VT.getSizeInBits()))) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } + // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { @@ -7517,6 +8293,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } } + // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { @@ -7582,6 +8359,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) + // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) + // When the adde's carry is not used. + if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && + N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && + (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) { + SDLoc SL(N); + auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); + auto VTs = DAG.getVTList(VT, N0->getValueType(1)); + return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); + } + + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -7645,11 +8438,11 @@ static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, switch (N0.getOpcode()) { case ISD::AND: FPOpcode = ISD::FABS; - SignMask = ~APInt::getSignBit(SourceVT.getSizeInBits()); + SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits()); break; case ISD::XOR: FPOpcode = ISD::FNEG; - SignMask = APInt::getSignBit(SourceVT.getSizeInBits()); + SignMask = APInt::getSignMask(SourceVT.getSizeInBits()); break; // TODO: ISD::OR --> ISD::FNABS? 
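  // In IEEE-754 terms (a sketch for f32): fneg is bits ^ 0x80000000 and fabs
  // is bits & 0x7FFFFFFF, which is why an integer XOR or AND against the sign
  // mask of a bitcasted float folds to FNEG or FABS.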
default: @@ -7672,6 +8465,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (N0.isUndef()) + return DAG.getUNDEF(VT); + // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. @@ -7757,7 +8553,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { assert(VT.getSizeInBits() == 128); SDValue SignBit = DAG.getConstant( - APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); + APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); SDValue FlipBit; if (N0.getOpcode() == ISD::FNEG) { FlipBit = SignBit; @@ -7777,7 +8573,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); } - APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, NewConv, DAG.getConstant(SignBit, DL, VT)); @@ -7825,7 +8621,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { - APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(Cst.getNode()); SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); @@ -7846,7 +8642,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); } - APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); + APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); AddToWorklist(X.getNode()); @@ -8029,7 +8825,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { for (unsigned j = 0; j != NumOutputsPerInput; ++j) { APInt ThisVal = OpVal.trunc(DstBitSize); Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); - OpVal = OpVal.lshr(DstBitSize); + OpVal.lshrInPlace(DstBitSize); } // For big endian targets, swap the order of the pieces of each element. @@ -8040,6 +8836,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { return DAG.getBuildVector(VT, DL, Ops); } +static bool isContractable(SDNode *N) { + SDNodeFlags F = N->getFlags(); + return F.hasAllowContract() || F.hasUnsafeAlgebra(); +} + /// Try to perform FMA combining on a given FADD node. SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); @@ -8048,24 +8849,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. 
if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - ; - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. @@ -8073,35 +8877,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. + auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. - if (Aggressive && N0.getOpcode() == ISD::FMUL && - N1.getOpcode() == ISD::FMUL) { + if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { if (N0.getNode()->use_size() > N1.getNode()->use_size()) std::swap(N0, N1); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) { + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8113,7 +8921,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Note: Commutes FADD operands. 
if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), @@ -8154,7 +8962,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( @@ -8169,7 +8977,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1); @@ -8195,7 +9003,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1); @@ -8208,7 +9016,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); - if (N120.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N120)) return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0); @@ -8224,7 +9032,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); - if (N102.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N102)) return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0); @@ -8244,23 +9052,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool AllowFusion = - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); - // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); + bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD); + // If the subtraction is not contractable, do not combine. + if (!AllowFusionGlobally && !isContractable(N)) + return SDValue(); + const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); - if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel)) + if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. @@ -8268,9 +9079,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // Is the node an FMUL and contractable either due to global flags or + // SDNodeFlags. 
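+  // E.g. under the default FPOpFusion::Standard with no FMAD support, only an
+  // FMUL whose own node flags carry 'contract' (or unsafe-algebra) may be
+  // merged into this FSUB, while FPOpFusion::Fast (-ffp-contract=fast) or
+  // global UnsafeFPMath allows any FMUL, per AllowFusionGlobally above.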
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { + if (N.getOpcode() != ISD::FMUL) + return false; + return AllowFusionGlobally || isContractable(N.getNode()); + }; + // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) - if (N0.getOpcode() == ISD::FMUL && - (Aggressive || N0->hasOneUse())) { + if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1)); @@ -8278,16 +9096,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FMUL && - (Aggressive || N1->hasOneUse())) + if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) - if (N0.getOpcode() == ISD::FNEG && - N0.getOperand(0).getOpcode() == ISD::FMUL && + if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); @@ -8297,12 +9113,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N00)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8316,7 +9132,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N10)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8336,7 +9152,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8358,7 +9174,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); - if (N000.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N000)) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -8378,10 +9194,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma x, y (fma u, v, (fneg z))) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. 
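The gate these FMA hunks thread through visitFADDForFMACombine and visitFSUBForFMACombine is two-level: fusion is permitted everywhere (-ffp-contract=fast, unsafe FP math, or a target FMAD, which rounds once by definition), or per node via the contract fast-math flag on the FMUL. A minimal stand-alone sketch of that logic, with stand-in types rather than the real SelectionDAG API:

    #include <cstdio>

    // Stand-ins for TargetOptions::AllowFPOpFusion and SDNodeFlags.
    enum class FPOpFusionMode { Fast, Standard, Strict };
    struct NodeFlags { bool AllowContract = false; };

    // Global gate: -ffp-contract=fast, unsafe math, or a target FMAD
    // enables fusion for every node.
    bool allowFusionGlobally(FPOpFusionMode M, bool UnsafeFPMath, bool HasFMAD) {
      return M == FPOpFusionMode::Fast || UnsafeFPMath || HasFMAD;
    }

    // Per-node gate: an FMUL may fold into an FMA if fusion is allowed
    // globally or the node itself carries the contract flag.
    bool isContractableFMul(bool IsFMul, NodeFlags F, bool Global) {
      return IsFMul && (Global || F.AllowContract);
    }

    int main() {
      bool Global = allowFusionGlobally(FPOpFusionMode::Standard, false, false);
      NodeFlags F{true}; // this node opted in via "contract"
      std::printf("%d\n", isContractableFMul(true, F, Global)); // prints 1
    }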
- if (Options.UnsafeFPMath && - N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL && - N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { + if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && + N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8395,9 +9210,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma (fneg y), z, (fma (fneg u), v, x)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. - if (Options.UnsafeFPMath && - N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FMUL) { + if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && + isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8410,14 +9224,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (AllowFusion && LookThroughFPExt) { + if (LookThroughFPExt) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); - if (N020.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N020)) return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8440,7 +9254,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); - if (N002.getOpcode() == ISD::FMUL) + if (isContractableFMUL(N002)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -8461,7 +9275,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N1.getOperand(2).getOperand(0); - if (N120.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N120)) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8488,7 +9302,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N100 = N1.getOperand(0).getOperand(0); SDValue N101 = N1.getOperand(0).getOperand(1); SDValue N102 = N1.getOperand(0).getOperand(2); - if (N102.getOpcode() == ISD::FMUL) { + if (isContractableFMUL(N102)) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8601,6 +9415,14 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { return SDValue(); } +static bool isFMulNegTwo(SDValue &N) { + if (N.getOpcode() != ISD::FMUL) + return false; + if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) + return CFP->isExactlyValue(-2.0); + return false; +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8609,7 +9431,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = 
&cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -8624,6 +9446,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) @@ -8636,8 +9461,18 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); + // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) + // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) + if ((isFMulNegTwo(N0) && N0.hasOneUse()) || + (isFMulNegTwo(N1) && N1.hasOneUse())) { + bool N1IsFMul = isFMulNegTwo(N1); + SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags); + } + // FIXME: Auto-upgrade the target/function-level option. - if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // fold (fadd A, 0) -> A if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) if (N1C->isZero()) @@ -8760,7 +9595,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -8771,13 +9606,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. 
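The new isFMulNegTwo fold above rewrites a + b*(-2.0) as a - (b + b), and it can sit outside any fast-math guard: b + b yields bit-for-bit the same IEEE value as b * 2.0 (the exponent changes, the significand never rounds), and a + (-x) is the same operation as a - x. A quick stand-alone check of the identity:

    #include <cstdio>

    int main() {
      // a + b * -2.0 == a - (b + b) for IEEE inputs, including
      // infinities; NaNs propagate the same way on both sides.
      double a = 3.5, b = 1.25;
      std::printf("%g %g\n", a + b * -2.0, a - (b + b)); // prints: 1 1
    }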
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) { + if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) @@ -8828,7 +9666,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) { @@ -8850,6 +9688,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) @@ -8969,7 +9810,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1), - &Flags), &Flags); + Flags), Flags); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) @@ -8979,7 +9820,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1), - &Flags), + Flags), N2); } } @@ -9005,16 +9846,16 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(1.0, DL, VT), &Flags), - &Flags); + DAG.getConstantFP(1.0, DL, VT), Flags), + Flags); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(-1.0, DL, VT), &Flags), - &Flags); + DAG.getConstantFP(-1.0, DL, VT), Flags), + Flags); } } @@ -9030,8 +9871,8 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; - const SDNodeFlags *Flags = N->getFlags(); - if (!UnsafeMath && !Flags->hasAllowReciprocal()) + const SDNodeFlags Flags = N->getFlags(); + if (!UnsafeMath && !Flags.hasAllowReciprocal()) return SDValue(); // Skip if current node is a reciprocal. @@ -9054,7 +9895,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { // This division is eligible for optimization only if global unsafe math // is enabled or if this division allows reciprocal formation. - if (UnsafeMath || U->getFlags()->hasAllowReciprocal()) + if (UnsafeMath || U->getFlags().hasAllowReciprocal()) Users.insert(U); } } @@ -9093,7 +9934,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) @@ -9104,6 +9945,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; + if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. 
if (N1CFP) { @@ -9204,8 +10048,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, - &cast<BinaryWithFlagsSDNode>(N)->Flags); + return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); + + if (SDValue NewSel = foldBinOpIntoSelect(N)) + return NewSel; return SDValue(); } @@ -9222,7 +10068,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); - return buildSqrtEstimate(N0, &Flags); + return buildSqrtEstimate(N0, Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) @@ -9497,6 +10343,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { Tmp, N0.getOperand(1)); } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -9563,6 +10412,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } + if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) + return NewVSel; + return SDValue(); } @@ -9624,11 +10476,11 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x80... per scalar element // and splat it. - SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits()); + SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x80... - SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + SignMask = APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL0(N0); Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, @@ -9648,10 +10500,10 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (Level >= AfterLegalizeDAG && (TLI.isFPImmLegal(CVal, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, - N0.getOperand(1)), - &cast<BinaryWithFlagsSDNode>(N0)->Flags); + return DAG.getNode( + ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), + N0->getFlags()); } } @@ -9729,11 +10581,11 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x7f... per scalar element // and splat it. - SignMask = ~APInt::getSignBit(N0.getScalarValueSizeInBits()); + SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x7f... - SignMask = ~APInt::getSignBit(IntVT.getSizeInBits()); + SignMask = ~APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL(N0); Int = DAG.getNode(ISD::AND, DL, IntVT, Int, @@ -10361,7 +11213,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); - + AddUsersToWorklist(Chain.getNode()); if (N->use_empty()) deleteAndRecombine(N); @@ -10414,7 +11266,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { StoreSDNode *PrevST = cast<StoreSDNode>(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) - return CombineTo(N, Chain.getOperand(1), Chain); + return CombineTo(N, PrevST->getOperand(1), Chain); } } @@ -10432,14 +11284,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } } - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? 
CombinerAA - : DAG.getSubtarget().useAA(); -#ifndef NDEBUG - if (CombinerAAOnlyFunc.getNumOccurrences() && - CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) - UseAA = false; -#endif - if (UseAA && LD->isUnindexed()) { + if (LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); @@ -10959,7 +11804,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa<ConstantSDNode>(User->getOperand(1))) { - Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue(); + Shift = User->getConstantOperandVal(1); User = *User->use_begin(); } @@ -11021,6 +11866,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); + AddToWorklist(Chain.getNode()); return true; } @@ -11414,18 +12260,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, return false; } -SDValue DAGCombiner::getMergedConstantVectorStore( - SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores, - SmallVectorImpl<SDValue> &Chains, EVT Ty) const { - SmallVector<SDValue, 8> BuildVector; +SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, + unsigned NumStores) { + SmallVector<SDValue, 8> Chains; + SmallPtrSet<const SDNode *, 8> Visited; + SDLoc StoreDL(StoreNodes[0].MemNode); + + for (unsigned i = 0; i < NumStores; ++i) { + Visited.insert(StoreNodes[i].MemNode); + } - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { - StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); - Chains.push_back(St->getChain()); - BuildVector.push_back(St->getValue()); + // don't include nodes that are children + for (unsigned i = 0; i < NumStores; ++i) { + if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) + Chains.push_back(StoreNodes[i].MemNode->getChain()); } - return DAG.getBuildVector(Ty, SL, BuildVector); + assert(Chains.size() > 0 && "Chain should have generated a chain"); + return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( @@ -11436,22 +12288,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned LatestNodeUsed = 0; - - for (unsigned i=0; i < NumStores; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; - } - - SmallVector<SDValue, 8> Chains; // The latest Node in the DAG. 
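The new getMergeStoreChains above feeds the merged wide store a single TokenFactor built from the candidates' incoming chains, skipping any chain that is itself one of the merged stores (its chain output is being replaced anyway). The dedup logic, reduced to integer node ids (a toy model, not the SelectionDAG API):

    #include <algorithm>
    #include <vector>

    // Stores are node ids; ChainOf[s] is the id of s's incoming chain.
    std::vector<int> mergeStoreChains(const std::vector<int> &Stores,
                                      const std::vector<int> &ChainOf) {
      std::vector<int> Chains;
      for (int S : Stores) {
        int C = ChainOf[S];
        // Skip chains that are children of other candidate stores.
        if (std::find(Stores.begin(), Stores.end(), C) == Stores.end())
          Chains.push_back(C);
      }
      return Chains; // the real code wraps these in one TokenFactor
    }

    int main() {
      // Store 1 is chained to store 0; store 0 is chained to node 7.
      std::vector<int> ChainOf = {7, 0};
      return mergeStoreChains({0, 1}, ChainOf).size() == 1 ? 0 : 1; // 0
    }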
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; @@ -11467,7 +12305,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); if (IsConstantSrc) { - StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); + SmallVector<SDValue, 8> BuildVector; + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); + SDValue Val = St->getValue(); + if (MemVT.getScalarType().isInteger()) + if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) + Val = DAG.getConstant( + (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), + SDLoc(CFP), MemVT); + BuildVector.push_back(Val); + } + StoredVal = DAG.getBuildVector(Ty, DL, BuildVector); } else { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumStores; ++i) { @@ -11477,7 +12326,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); - Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. @@ -11497,14 +12345,13 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); - Chains.push_back(St->getChain()); SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { - StoreInt |= C->getAPIntValue().zext(SizeInBits); + StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { - StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits); + StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); } else { llvm_unreachable("Invalid constant element type"); } @@ -11515,54 +12362,27 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - assert(!Chains.empty()); - - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA) { - // Replace all merged stores with the new store. - for (unsigned i = 0; i < NumStores; ++i) - CombineTo(StoreNodes[i].MemNode, NewStore); - } else { - // Replace the last store with the new store. - CombineTo(LatestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumStores; ++i) { - if (StoreNodes[i].MemNode == LatestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - // ReplaceAllUsesWith will replace all uses that existed when it was - // called, but graph optimizations may cause new ones to appear. For - // example, the case in pr14333 looks like - // - // St's chain -> St -> another store -> X - // - // And the only difference from St to the other store is the chain. - // When we change it's chain to be St's chain they become identical, - // get CSEed and the net result is that X is now a use of St. - // Since we know that St is redundant, just iterate. 
- while (!St->use_empty()) - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); - } - } + // Replace all merged stores with the new store. + for (unsigned i = 0; i < NumStores; ++i) + CombineTo(StoreNodes[i].MemNode, NewStore); - StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end()); + AddToWorklist(NewChain.getNode()); return true; } -void DAGCombiner::getStoreMergeAndAliasCandidates( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, - SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { +void DAGCombiner::getStoreMergeCandidates( + StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. if (!BasePtr.Base.getNode()) @@ -11572,104 +12392,71 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( if (BasePtr.Base.isUndef()) return; - // Walk up the chain and look for nodes with offsets from the same - // base pointer. Stop when reaching an instruction with a different kind - // or instruction which has a different base pointer. - EVT MemVT = St->getMemoryVT(); - unsigned Seq = 0; - StoreSDNode *Index = St; - - - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - - if (UseAA) { - // Look at other users of the same chain. Stores on the same chain do not - // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized - // to be on the same chain, so don't bother looking at adjacent chains. - - SDValue Chain = St->getChain(); - for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { - if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { - if (I.getOperandNo() != 0) - continue; - - if (OtherST->isVolatile() || OtherST->isIndexed()) - continue; - - if (OtherST->getMemoryVT() != MemVT) - continue; - - BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG); - - if (Ptr.equalBaseIndex(BasePtr)) - StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); - } - } - - return; - } - - while (Index) { - // If the chain has more than one use, then we can't reorder the mem ops. - if (Index != St && !SDValue(Index, 0)->hasOneUse()) - break; - - // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); - - // Check that the base pointer is the same as the original one. - if (!Ptr.equalBaseIndex(BasePtr)) - break; - - // The memory operands must not be volatile. - if (Index->isVolatile() || Index->isIndexed()) - break; - - // No truncation. - if (Index->isTruncatingStore()) - break; - - // The stored memory type must be the same. - if (Index->getMemoryVT() != MemVT) - break; - - // We do not allow under-aligned stores in order to prevent - // overriding stores. NOTE: this is a bad hack. Alignment SHOULD - // be irrelevant here; what MATTERS is that we not move memory - // operations that potentially overlap past each-other. - if (Index->getAlignment() < MemVT.getStoreSize()) - break; - - // We found a potential memory operand to merge. - StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); - - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. 
If the next operand is a load save it and use alias
-    // information to check if it interferes with anything.
-    SDNode *NextInChain = Index->getChain().getNode();
-    while (1) {
-      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
-        // We found a store node. Use it for the next iteration.
-        Index = STn;
-        break;
-      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
-        if (Ldn->isVolatile()) {
-          Index = nullptr;
-          break;
+  bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
+                       isa<ConstantFPSDNode>(St->getValue());
+  bool IsExtractVecSrc =
+      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
+  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool {
+    if (Other->isVolatile() || Other->isIndexed())
+      return false;
+    // We can merge constant floats to equivalent integers
+    if (Other->getMemoryVT() != MemVT)
+      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
+            isa<ConstantFPSDNode>(Other->getValue())))
+        return false;
+    if (IsLoadSrc)
+      if (!isa<LoadSDNode>(Other->getValue()))
+        return false;
+    if (IsConstantSrc)
+      if (!(isa<ConstantSDNode>(Other->getValue()) ||
+            isa<ConstantFPSDNode>(Other->getValue())))
+        return false;
+    if (IsExtractVecSrc)
+      if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+            Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+        return false;
+    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+    return (Ptr.equalBaseIndex(BasePtr));
+  };
+  // We are looking for a root node which is an ancestor to all mergeable
+  // stores. We search up through a load, to our root and then down
+  // through all children. For instance we will find Store{1,2,3} if
+  // St is Store1, Store2, or Store3 where the root is not a load,
+  // which is always true for nonvolatile ops. TODO: Expand
+  // the search to find all valid candidates through multiple layers of loads.
+  //
+  // Root
+  // |-------|-------|
+  // Load    Load    Store3
+  // |       |
+  // Store1 Store2
+  //
+  // FIXME: We should be able to climb and
+  // descend TokenFactors to find candidates as well.
+
+  SDNode *RootNode = (St->getChain()).getNode();
+
+  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
+    RootNode = Ldn->getChain().getNode();
+    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
+        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
+          if (I2.getOperandNo() == 0)
+            if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
+              BaseIndexOffset Ptr;
+              if (CandidateMatch(OtherST, Ptr))
+                StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
+            }
+  } else
+    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+      if (I.getOperandNo() == 0)
+        if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
+          BaseIndexOffset Ptr;
+          if (CandidateMatch(OtherST, Ptr))
+            StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
        }
-
-        // Save the load node for later. Continue the scan.
-        AliasLoadNodes.push_back(Ldn);
-        NextInChain = Ldn->getChain().getNode();
-        continue;
-      } else {
-        Index = nullptr;
-        break;
-      }
-    }
-  }
 }

 // We need to check that merging these stores does not cause a loop
@@ -11678,31 +12465,34 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
 // through the chain). Check in parallel by searching up from
 // non-chain operands of candidates.
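That dependency check, defined in the function that follows, guards against merges that would make the wide store an indirect input of itself. In isolation it is a DFS over value operands with a visited set; a sketch with stand-in node types, not the SelectionDAG API:

    #include <unordered_set>
    #include <vector>

    struct Node {
      std::vector<Node *> Ops;      // Ops[0] is the chain operand
      bool IsCandidateStore = false;
    };

    // Seed the worklist with every candidate's non-chain operands, then
    // walk predecessors; reaching a candidate store means the merged
    // store would feed itself, so the merge must be rejected.
    bool candidatesAreCycleFree(const std::vector<Node *> &Candidates) {
      std::unordered_set<const Node *> Visited;
      std::vector<const Node *> Worklist;
      for (const Node *N : Candidates)
        for (size_t i = 1; i < N->Ops.size(); ++i) // skip the chain
          Worklist.push_back(N->Ops[i]);
      while (!Worklist.empty()) {
        const Node *N = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(N).second)
          continue;
        if (N->IsCandidateStore)
          return false;
        for (const Node *Op : N->Ops)
          Worklist.push_back(Op);
      }
      return true;
    }

    int main() {
      Node Chain, Val;
      Node St{{&Chain, &Val}, true}; // value doesn't reach a candidate
      return candidatesAreCycleFree({&St}) ? 0 : 1; // 0: merge is safe
    }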
bool DAGCombiner::checkMergeStoreCandidatesForDependencies( - SmallVectorImpl<MemOpLink> &StoreNodes) { + SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { SmallPtrSet<const SDNode *, 16> Visited; SmallVector<const SDNode *, 8> Worklist; // search ops of store candidates - for (unsigned i = 0; i < StoreNodes.size(); ++i) { + for (unsigned i = 0; i < NumStores; ++i) { SDNode *n = StoreNodes[i].MemNode; // Potential loops may happen only through non-chain operands for (unsigned j = 1; j < n->getNumOperands(); ++j) Worklist.push_back(n->getOperand(j).getNode()); } // search through DAG. We can stop early if we find a storenode - for (unsigned i = 0; i < StoreNodes.size(); ++i) { + for (unsigned i = 0; i < NumStores; ++i) { if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist)) return false; } return true; } -bool DAGCombiner::MergeConsecutiveStores( - StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) { +bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + + if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) + return false; + bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( Attribute::NoImplicitFloat); @@ -11731,376 +12521,400 @@ bool DAGCombiner::MergeConsecutiveStores( if (MemVT.isVector() && IsLoadSrc) return false; - // Only look at ends of store sequences. - SDValue Chain = SDValue(St, 0); - if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) - return false; - - // Save the LoadSDNodes that we find in the chain. - // We need to make sure that these nodes do not interfere with - // any of the store nodes. - SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - - getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); + SmallVector<MemOpLink, 8> StoreNodes; + // Find potential store merge candidates by searching through chain sub-DAG + getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; - // only do dependence check in AA case - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA - : DAG.getSubtarget().useAA(); - if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes)) - return false; - // Sort the memory operands according to their distance from the - // base pointer. As a secondary criteria: make sure stores coming - // later in the code come first in the list. This is important for - // the non-UseAA case, because we're merging stores into the FINAL - // store along a chain which potentially contains aliasing stores. - // Thus, if there are multiple stores to the same address, the last - // one can be considered for merging but not the others. + // base pointer. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { - return LHS.OffsetFromBase < RHS.OffsetFromBase || - (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum < RHS.SequenceNum); - }); - - // Scan the memory operations on the chain and find the first non-consecutive - // store memory address. - unsigned LastConsecutiveStore = 0; - int64_t StartAddress = StoreNodes[0].OffsetFromBase; - for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) { - + return LHS.OffsetFromBase < RHS.OffsetFromBase; + }); + + // Store Merge attempts to merge the lowest stores. 
This generally + // works out as if successful, as the remaining stores are checked + // after the first collection of stores is merged. However, in the + // case that a non-mergeable store is found first, e.g., {p[-2], + // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent + // mergeable cases. To prevent this, we prune such stores from the + // front of StoreNodes here. + + bool RV = false; + while (StoreNodes.size() > 1) { + unsigned StartIdx = 0; + while ((StartIdx + 1 < StoreNodes.size()) && + StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != + StoreNodes[StartIdx + 1].OffsetFromBase) + ++StartIdx; + + // Bail if we don't have enough candidates to merge. + if (StartIdx + 1 >= StoreNodes.size()) + return RV; + + if (StartIdx) + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); + + // Scan the memory operations on the chain and find the first + // non-consecutive store memory address. + unsigned NumConsecutiveStores = 1; + int64_t StartAddress = StoreNodes[0].OffsetFromBase; // Check that the addresses are consecutive starting from the second // element in the list of stores. - if (i > 0) { + for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { int64_t CurrAddress = StoreNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; + NumConsecutiveStores = i + 1; } - // Check if this store interferes with any of the loads that we found. - // If we find a load that alias with this store. Stop the sequence. - if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) { - return isAlias(Ldn, StoreNodes[i].MemNode); - })) - break; + if (NumConsecutiveStores < 2) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumConsecutiveStores); + continue; + } - // Mark this node as useful. - LastConsecutiveStore = i; - } + // Check that we can merge these candidates without causing a cycle + if (!checkMergeStoreCandidatesForDependencies(StoreNodes, + NumConsecutiveStores)) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumConsecutiveStores); + continue; + } - // The node with the lowest store address. - LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; - unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); - LLVMContext &Context = *DAG.getContext(); - const DataLayout &DL = DAG.getDataLayout(); - - // Store the constants into memory as one consecutive store. - if (IsConstantSrc) { - unsigned LastLegalType = 0; - unsigned LastLegalVectorType = 0; - bool NonZero = false; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); - - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { - NonZero |= !C->isNullValue(); - } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) { - NonZero |= !C->getConstantFPValue()->isNullValue(); - } else { - // Non-constant. - break; - } + // The node with the lowest store address. + LLVMContext &Context = *DAG.getContext(); + const DataLayout &DL = DAG.getDataLayout(); - // Find a legal type for the constant store. - unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); - bool IsFast; - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) { - LastLegalType = i+1; - // Or check whether a truncstore is legal. 
- } else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { - EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFast) && + // Store the constants into memory as one consecutive store. + if (IsConstantSrc) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + unsigned LastLegalType = 0; + unsigned LastLegalVectorType = 0; + bool NonZero = false; + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); + SDValue StoredVal = ST->getValue(); + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { + NonZero |= !C->isNullValue(); + } else if (ConstantFPSDNode *C = + dyn_cast<ConstantFPSDNode>(StoredVal)) { + NonZero |= !C->getConstantFPValue()->isNullValue(); + } else { + // Non-constant. + break; + } + + // Find a legal type for the constant store. + unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; + EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast = false; + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i + 1; + // Or check whether a truncstore is legal. + } else if (!LegalTypes && + TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { + EVT LegalizedStoredValueTy = + TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); + if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { + LastLegalType = i + 1; + } + } + + // We only use vectors if the constant is known to be zero or the target + // allows it and the function is not marked with the noimplicitfloat + // attribute. + if ((!NonZero || + TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && + !NoVectors) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && + IsFast) + LastLegalVectorType = i + 1; } } - // We only use vectors if the constant is known to be zero or the target - // allows it and the function is not marked with the noimplicitfloat - // attribute. - if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, - FirstStoreAS)) && - !NoVectors) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); - if (TLI.isTypeLegal(Ty) && - TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) - LastLegalVectorType = i + 1; + // Check if we found a legal integer type that creates a meaningful merge. + if (LastLegalType < 2 && LastLegalVectorType < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; } - } - // Check if we found a legal integer type to store. 
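The constant-store legality scan above (in both its deleted and rewritten forms) keeps, for each prefix of consecutive stores, the widest combined integer type the target accepts. Stripped of the TargetLowering queries, the prefix logic looks like this (isTypeLegal stands in for the real legality and fast-memory-access hooks):

    #include <cstdio>

    // Stand-in for TLI.isTypeLegal / TLI.allowsMemoryAccess.
    bool isTypeLegal(unsigned Bits) {
      return Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64;
    }

    // For each prefix of consecutive constant stores, form the combined
    // integer width and remember the longest prefix that is still legal.
    unsigned lastLegalPrefix(unsigned NumStores, unsigned ElementSizeBytes) {
      unsigned LastLegal = 0;
      for (unsigned i = 0; i < NumStores; ++i) {
        unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
        if (isTypeLegal(SizeInBits))
          LastLegal = i + 1;
      }
      return LastLegal;
    }

    int main() {
      // Eight consecutive i8 stores merge best as one i64 store.
      std::printf("%u\n", lastLegalPrefix(8, 1)); // prints 8
    }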
- if (LastLegalType == 0 && LastLegalVectorType == 0) - return false; + bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; + unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; - unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; + bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + true, UseVector); + if (!Merged) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + continue; + } + // Remove merged stores for next iteration. + RV = true; + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); + continue; + } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, - true, UseVector); - } + // When extracting multiple vector elements, try to store them + // in one vector store rather than a sequence of scalar stores. + if (IsExtractVecSrc) { + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + unsigned NumStoresToMerge = 1; + bool IsVec = MemVT.isVector(); + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + unsigned StoreValOpcode = St->getValue().getOpcode(); + // This restriction could be loosened. + // Bail out if any stored values are not elements extracted from a + // vector. It should be possible to handle mixed sources, but load + // sources need more careful handling (see the block of code below that + // handles consecutive loads). + if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && + StoreValOpcode != ISD::EXTRACT_SUBVECTOR) + return RV; - // When extracting multiple vector elements, try to store them - // in one vector store rather than a sequence of scalar stores. - if (IsExtractVecSrc) { - unsigned NumStoresToMerge = 0; - bool IsVec = MemVT.isVector(); - for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - unsigned StoreValOpcode = St->getValue().getOpcode(); - // This restriction could be loosened. - // Bail out if any stored values are not elements extracted from a vector. - // It should be possible to handle mixed sources, but load sources need - // more careful handling (see the block of code below that handles - // consecutive loads). - if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && - StoreValOpcode != ISD::EXTRACT_SUBVECTOR) - return false; + // Find a legal type for the vector store. + unsigned Elts = i + 1; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); + bool IsFast; + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && + IsFast) + NumStoresToMerge = i + 1; + } - // Find a legal type for the vector store. - unsigned Elts = i + 1; - if (IsVec) { - // When merging vector stores, get the total number of elements. 
- Elts *= MemVT.getVectorNumElements(); + bool Merged = MergeStoresOfConstantsOrVecElts( + StoreNodes, MemVT, NumStoresToMerge, false, true); + if (!Merged) { + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumStoresToMerge); + continue; } - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); - bool IsFast; - if (TLI.isTypeLegal(Ty) && - TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, - FirstStoreAlign, &IsFast) && IsFast) - NumStoresToMerge = i + 1; + // Remove merged stores for next iteration. + StoreNodes.erase(StoreNodes.begin(), + StoreNodes.begin() + NumStoresToMerge); + RV = true; + continue; } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge, - false, true); - } + // Below we handle the case of multiple consecutive stores that + // come from multiple consecutive loads. We merge them into a single + // wide load and a single wide store. - // Below we handle the case of multiple consecutive stores that - // come from multiple consecutive loads. We merge them into a single - // wide load and a single wide store. + // Look for load nodes which are used by the stored values. + SmallVector<MemOpLink, 8> LoadNodes; - // Look for load nodes which are used by the stored values. - SmallVector<MemOpLink, 8> LoadNodes; + // Find acceptable loads. Loads need to have the same chain (token factor), + // must not be zext, volatile, indexed, and they must be consecutive. + BaseIndexOffset LdBasePtr; + for (unsigned i = 0; i < NumConsecutiveStores; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); + if (!Ld) + break; - // Find acceptable loads. Loads need to have the same chain (token factor), - // must not be zext, volatile, indexed, and they must be consecutive. - BaseIndexOffset LdBasePtr; - for (unsigned i=0; i<LastConsecutiveStore+1; ++i) { - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); - if (!Ld) break; + // Loads must only have one use. + if (!Ld->hasNUsesOfValue(1, 0)) + break; - // Loads must only have one use. - if (!Ld->hasNUsesOfValue(1, 0)) - break; + // The memory operands must not be volatile. + if (Ld->isVolatile() || Ld->isIndexed()) + break; - // The memory operands must not be volatile. - if (Ld->isVolatile() || Ld->isIndexed()) - break; + // We do not accept ext loads. + if (Ld->getExtensionType() != ISD::NON_EXTLOAD) + break; - // We do not accept ext loads. - if (Ld->getExtensionType() != ISD::NON_EXTLOAD) - break; + // The stored memory type must be the same. + if (Ld->getMemoryVT() != MemVT) + break; - // The stored memory type must be the same. - if (Ld->getMemoryVT() != MemVT) - break; + BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); + // If this is not the first ptr that we check. + if (LdBasePtr.Base.getNode()) { + // The base ptr must be the same. + if (!LdPtr.equalBaseIndex(LdBasePtr)) + break; + } else { + // Check that all other base pointers are the same as this one. + LdBasePtr = LdPtr; + } - BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); - // If this is not the first ptr that we check. - if (LdBasePtr.Base.getNode()) { - // The base ptr must be the same. - if (!LdPtr.equalBaseIndex(LdBasePtr)) - break; - } else { - // Check that all other base pointers are the same as this one. - LdBasePtr = LdPtr; + // We found a potential memory operand to merge. 
+ LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset)); } - // We found a potential memory operand to merge. - LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0)); - } - - if (LoadNodes.size() < 2) - return false; + if (LoadNodes.size() < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; + } - // If we have load/store pair instructions and we only have two values, - // don't bother. - unsigned RequiredAlignment; - if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && - St->getAlignment() >= RequiredAlignment) - return false; + // If we have load/store pair instructions and we only have two values, + // don't bother. + unsigned RequiredAlignment; + if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && + St->getAlignment() >= RequiredAlignment) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); + continue; + } + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); + LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); + unsigned FirstLoadAS = FirstLoad->getAddressSpace(); + unsigned FirstLoadAlign = FirstLoad->getAlignment(); + + // Scan the memory operations on the chain and find the first + // non-consecutive load memory address. These variables hold the index in + // the store node array. + unsigned LastConsecutiveLoad = 0; + // This variable refers to the size and not index in the array. + unsigned LastLegalVectorType = 0; + unsigned LastLegalIntegerType = 0; + StartAddress = LoadNodes[0].OffsetFromBase; + SDValue FirstChain = FirstLoad->getChain(); + for (unsigned i = 1; i < LoadNodes.size(); ++i) { + // All loads must share the same chain. + if (LoadNodes[i].MemNode->getChain() != FirstChain) + break; - LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); - unsigned FirstLoadAS = FirstLoad->getAddressSpace(); - unsigned FirstLoadAlign = FirstLoad->getAlignment(); - - // Scan the memory operations on the chain and find the first non-consecutive - // load memory address. These variables hold the index in the store node - // array. - unsigned LastConsecutiveLoad = 0; - // This variable refers to the size and not index in the array. - unsigned LastLegalVectorType = 0; - unsigned LastLegalIntegerType = 0; - StartAddress = LoadNodes[0].OffsetFromBase; - SDValue FirstChain = FirstLoad->getChain(); - for (unsigned i = 1; i < LoadNodes.size(); ++i) { - // All loads must share the same chain. - if (LoadNodes[i].MemNode->getChain() != FirstChain) - break; + int64_t CurrAddress = LoadNodes[i].OffsetFromBase; + if (CurrAddress - StartAddress != (ElementSizeBytes * i)) + break; + LastConsecutiveLoad = i; + // Find a legal type for the vector store. + EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); + bool IsFastSt, IsFastLd; + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && + IsFastLd) { + LastLegalVectorType = i + 1; + } - int64_t CurrAddress = LoadNodes[i].OffsetFromBase; - if (CurrAddress - StartAddress != (ElementSizeBytes * i)) - break; - LastConsecutiveLoad = i; - // Find a legal type for the vector store. 
- EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1); - bool IsFastSt, IsFastLd; - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFastSt) && IsFastSt && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign, &IsFastLd) && IsFastLd) { - LastLegalVectorType = i + 1; - } - - // Find a legal type for the integer store. - unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - StoreTy = EVT::getIntegerVT(Context, SizeInBits); - if (TLI.isTypeLegal(StoreTy) && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, - FirstStoreAlign, &IsFastSt) && IsFastSt && - TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, - FirstLoadAlign, &IsFastLd) && IsFastLd) - LastLegalIntegerType = i + 1; - // Or check whether a truncstore and extload is legal. - else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { - EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(Context, StoreTy); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFastSt) && + // Find a legal type for the integer store. + unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; + StoreTy = EVT::getIntegerVT(Context, SizeInBits); + if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && IsFastSt && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstLoadAS, FirstLoadAlign, &IsFastLd) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && IsFastLd) - LastLegalIntegerType = i+1; + LastLegalIntegerType = i + 1; + // Or check whether a truncstore and extload is legal. + else if (TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { + EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); + if (!LegalTypes && + TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, + StoreTy) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, + StoreTy) && + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstLoadAS, FirstLoadAlign, &IsFastLd) && + IsFastLd) + LastLegalIntegerType = i + 1; + } } - } - // Only use vector types if the vector type is larger than the integer type. - // If they are the same, use integers. - bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; - unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); + // Only use vector types if the vector type is larger than the integer type. + // If they are the same, use integers. 
+ bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; + unsigned LastLegalType = + std::max(LastLegalVectorType, LastLegalIntegerType); - // We add +1 here because the LastXXX variables refer to location while - // the NumElem refers to array/index size. - unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1; - NumElem = std::min(LastLegalType, NumElem); - - if (NumElem < 2) - return false; - - // Collect the chains from all merged stores. - SmallVector<SDValue, 8> MergeStoreChains; - MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); - - // The latest Node in the DAG. - unsigned LatestNodeUsed = 0; - for (unsigned i=1; i<NumElem; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // latest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) - LatestNodeUsed = i; + // We add +1 here because the LastXXX variables refer to location while + // the NumElem refers to array/index size. + unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); + NumElem = std::min(LastLegalType, NumElem); - MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); - } + if (NumElem < 2) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + continue; + } - LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; + // Find if it is better to use vectors or integers to load and store + // to memory. + EVT JointMemOpVT; + if (UseVectorTy) { + JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); + } else { + unsigned SizeInBits = NumElem * ElementSizeBytes * 8; + JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); + } - // Find if it is better to use vectors or integers to load and store - // to memory. - EVT JointMemOpVT; - if (UseVectorTy) { - JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); - } else { - unsigned SizeInBits = NumElem * ElementSizeBytes * 8; - JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); - } + SDLoc LoadDL(LoadNodes[0].MemNode); + SDLoc StoreDL(StoreNodes[0].MemNode); - SDLoc LoadDL(LoadNodes[0].MemNode); - SDLoc StoreDL(StoreNodes[0].MemNode); + // The merged loads are required to have the same incoming chain, so + // using the first's chain is acceptable. + SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), + FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoadAlign); - // The merged loads are required to have the same incoming chain, so - // using the first's chain is acceptable. - SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), - FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoadAlign); + SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); - SDValue NewStoreChain = - DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + AddToWorklist(NewStoreChain.getNode()); - SDValue NewStore = - DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + SDValue NewStore = DAG.getStore( + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), FirstStoreAlign); - // Transfer chain users from old loads to the new load. 
-    for (unsigned i = 0; i < NumElem; ++i) {
-      LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
-      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
-                                    SDValue(NewLoad.getNode(), 1));
-    }
+    // Transfer chain users from old loads to the new load.
+    for (unsigned i = 0; i < NumElem; ++i) {
+      LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                    SDValue(NewLoad.getNode(), 1));
+    }

-  if (UseAA) {
     // Replace all the stores with the new store.
     for (unsigned i = 0; i < NumElem; ++i)
       CombineTo(StoreNodes[i].MemNode, NewStore);
-  } else {
-    // Replace the last store with the new store.
-    CombineTo(LatestOp, NewStore);
-    // Erase all other stores.
-    for (unsigned i = 0; i < NumElem; ++i) {
-      // Remove all Store nodes.
-      if (StoreNodes[i].MemNode == LatestOp)
-        continue;
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
-      deleteAndRecombine(St);
-    }
+    RV = true;
+    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+    continue;
   }
-
-  StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end());
-  return true;
+  return RV;
 }

 SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
@@ -12256,19 +13070,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     if (SDValue NewST = TransformFPLoadStorePair(N))
       return NewST;

-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
-  if (CombinerAAOnlyFunc.getNumOccurrences() &&
-      CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
-    UseAA = false;
-#endif
-  if (UseAA && ST->isUnindexed()) {
-    // FIXME: We should do this even without AA enabled. AA will just allow
-    // FindBetterChain to work in more situations. The problem with this is that
-    // any combine that expects memory operations to be on consecutive chains
-    // first needs to be updated to look for users of the same chain.
-
+  if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
     if (findBetterNeighborChains(ST)) {
@@ -12302,8 +13104,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     if (SimplifyDemandedBits(
             Value,
             APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
-                                 ST->getMemoryVT().getScalarSizeInBits())))
+                                 ST->getMemoryVT().getScalarSizeInBits()))) {
+      // Re-visit the store if anything changed and the store hasn't been
+      // merged with another node (in which case N is deleted).
+      // SimplifyDemandedBits will add Value's node back to the worklist if
+      // necessary, but we also need to re-visit the Store node itself.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
       return SDValue(N, 0);
+    }
   }

   // If this is a load followed by a store to the same location, then the store
@@ -12319,14 +13128,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     }
   }

-  // If this is a store followed by a store with the same value to the same
-  // location, then the store is dead/noop.
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
-    if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
-        ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
-        ST1->isUnindexed() && !ST1->isVolatile()) {
-      // The store is dead, remove it.
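The visitSTORE change that follows adds a second dead-store case: besides a store that rewrites the identical value, a store whose immediately preceding store writes the same width to the same address can elide that predecessor outright, provided nothing else uses its chain. The predicate, reduced to plain data (simplified; the real code also skips undef base pointers and -O0):

    struct StoreDesc {
      const void *Base;   // base pointer identity
      unsigned Bytes;     // store width from the memory VT
      bool Unindexed, Volatile;
      unsigned ChainUses; // users of the earlier store's chain result
    };

    // ST overwrites ST1 completely and ST1 is otherwise unobserved, so
    // ST1 can be replaced by its own incoming chain.
    bool canElidePrecedingStore(const StoreDesc &ST, const StoreDesc &ST1) {
      return ST.Unindexed && !ST.Volatile && ST1.Unindexed && !ST1.Volatile &&
             ST.Base == ST1.Base && ST.Bytes == ST1.Bytes &&
             ST1.ChainUses == 1;
    }

    int main() {
      static int A;
      StoreDesc ST{&A, 4, true, false, 1}, ST1{&A, 4, true, false, 1};
      return canElidePrecedingStore(ST, ST1) ? 0 : 1; // 0: elidable
    }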
- return Chain;
+ if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
+ !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
+ ST->getMemoryVT() == ST1->getMemoryVT()) {
+ // If this is a store followed by a store with the same value to the same
+ // location, then the store is dead/noop.
+ if (ST1->getValue() == Value) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+
+ // If this store is preceded by a store to the same location and no other
+ // node is chained to that store, we can effectively drop that store.
+ // Do not remove stores to undef as they may be used as data sinks.
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef()) {
+ // ST1 is fully overwritten and can be elided. Combine with its chain
+ // value.
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
 }
 }

@@ -12347,15 +13170,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 // There can be multiple store sequences on the same chain.
 // Keep trying to merge store sequences until we are unable to do so
 // or until we merge the last store on the chain.
- SmallVector<MemOpLink, 8> StoreNodes;
- bool Changed = MergeConsecutiveStores(ST, StoreNodes);
+ bool Changed = MergeConsecutiveStores(ST);
 if (!Changed) break;
-
- if (any_of(StoreNodes,
- [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) {
- // ST has been merged and no longer exists.
+ // Return N, as the merge only uses CombineTo and no worklist cleanup
+ // is necessary.
+ if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
 return SDValue(N, 0);
- }
 }
 }

@@ -12364,7 +13184,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 // Make sure to do this only after attempting to merge stores in order to
 // avoid changing the types of some subset of stores due to visit order,
 // preventing their merging.
- if (isa<ConstantFPSDNode>(Value)) {
+ if (isa<ConstantFPSDNode>(ST->getValue())) {
 if (SDValue NewSt = replaceStoreOfFPConstant(ST))
 return NewSt;
 }

@@ -12493,10 +13313,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 EVT VT = InVec.getValueType();

- // If we can't generate a legal BUILD_VECTOR, exit
- if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
- return SDValue();
-
 // Check that we know which element is being inserted
 if (!isa<ConstantSDNode>(EltNo))
 return SDValue();

@@ -12511,8 +13327,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 // do this only if indices are both constants and Idx1 < Idx0.
 if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() &&
 isa<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned OtherElt =
- cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+ unsigned OtherElt = InVec.getConstantOperandVal(2);
 if (Elt < OtherElt) {
 // Swap nodes.
 SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
@@ -12523,6 +13338,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 }
 }

+ // If we can't generate a legal BUILD_VECTOR, exit
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return SDValue();
+
 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
 // vector elements.
@@ -12544,11 +13363,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 // All the operands of BUILD_VECTOR must have the same type;
 // we enforce that here.
 EVT OpVT = Ops[0].getValueType();
- if (InVal.getValueType() != OpVT)
- InVal = OpVT.bitsGT(InVal.getValueType()) ?
- DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal); - Ops[Elt] = InVal; + Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; } // Return the new vector @@ -12568,6 +13383,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); + ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? + ISD::NON_EXTLOAD : ISD::EXTLOAD; + if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) + return SDValue(); + Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); @@ -12639,6 +13459,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); + if (InVec.isUndef()) + return DAG.getUNDEF(NVT); + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the @@ -13022,7 +13845,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { return DAG.getNode(Opcode, DL, VT, BV); } -SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N, +SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef<int> VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx) { @@ -13300,6 +14123,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); + // Check if we can express BUILD VECTOR via subvector extract. + if (!LegalTypes && (N->getNumOperands() > 1)) { + SDValue Op0 = N->getOperand(0); + auto checkElem = [&](SDValue Op) -> uint64_t { + if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && + (Op0.getOperand(0) == Op.getOperand(0))) + if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) + return CNode->getZExtValue(); + return -1; + }; + + int Offset = checkElem(Op0); + for (unsigned i = 0; i < N->getNumOperands(); ++i) { + if (Offset + i != checkElem(N->getOperand(i))) { + Offset = -1; + break; + } + } + + if ((Offset == 0) && + (Op0.getOperand(0).getValueType() == N->getValueType(0))) + return Op0.getOperand(0); + if ((Offset != -1) && + ((Offset % N->getValueType(0).getVectorNumElements()) == + 0)) // IDX must be multiple of output size. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), + Op0.getOperand(0), Op0.getOperand(1)); + } + if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) return V; @@ -13419,7 +14271,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDValue(); - int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + int ExtIdx = Op.getConstantOperandVal(1); // Ensure that we are extracting a subvector from a vector the same // size as the result. 
@@ -13491,8 +14343,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
 if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
 return SDValue();

- EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
- VT.getSizeInBits() / SclTy.getSizeInBits());
+ unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
+ if (VNTNumElms < 2)
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
 if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
 return SDValue();

@@ -13607,19 +14462,166 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
 return SDValue();
 }

+/// If we are extracting a subvector produced by a wide binary operator with
+/// at least one operand that was the result of a vector concatenation, then
+/// try to use the narrow vector operands directly to avoid the concatenation
+/// and extraction.
+static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
+ // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
+ // some of these bailouts with other transforms.
+
+ // The extract index must be a constant, so we can map it to a concat operand.
+ auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+ if (!ExtractIndex)
+ return SDValue();
+
+ // Only handle the case where we are doubling and then halving. A larger ratio
+ // may require more than two narrow binops to replace the wide binop.
+ EVT VT = Extract->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ assert((ExtractIndex->getZExtValue() % NumElems) == 0 &&
+ "Extract index is not a multiple of the vector length.");
+ if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+ return SDValue();
+
+ // We are looking for an optionally bitcasted wide vector binary operator
+ // feeding an extract subvector.
+ SDValue BinOp = Extract->getOperand(0);
+ if (BinOp.getOpcode() == ISD::BITCAST)
+ BinOp = BinOp.getOperand(0);
+
+ // TODO: The motivating case for this transform is an x86 AVX1 target. That
+ // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+ // flavors, but no other 256-bit integer support. This could be extended to
+ // handle any binop, but that may require fixing/adding other folds to avoid
+ // codegen regressions.
+ unsigned BOpcode = BinOp.getOpcode();
+ if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+ return SDValue();
+
+ // The binop must be a vector type, so we can chop it in half.
+ EVT WideBVT = BinOp.getValueType();
+ if (!WideBVT.isVector())
+ return SDValue();
+
+ // Bail out if the target does not support a narrower version of the binop.
+ EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
+ WideBVT.getVectorNumElements() / 2);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+ return SDValue();
+
+ // Peek through bitcasts of the binary operator operands if needed.
+ SDValue LHS = BinOp.getOperand(0);
+ if (LHS.getOpcode() == ISD::BITCAST)
+ LHS = LHS.getOperand(0);
+
+ SDValue RHS = BinOp.getOperand(1);
+ if (RHS.getOpcode() == ISD::BITCAST)
+ RHS = RHS.getOperand(0);
+
+ // We need at least one concatenation operation of a binop operand to make
+ // this transform worthwhile. The concat must double the input vector sizes.
+ // TODO: Should we also handle INSERT_SUBVECTOR patterns?
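// [Editor's illustration, not part of the patch: narrowExtractedVectorBinOp
// above relies on lane-wise binary ops commuting with concatenation and
// extraction, i.e. extract (binop (concat X1, X2), (concat Y1, Y2)), N
// --> binop XN, YN. The self-contained sketch below checks that identity for
// AND on plain arrays standing in for the halves of a wide vector; all names
// here are hypothetical, and only the arithmetic mirrors the transform.]
#include <array>
#include <cassert>

using Half = std::array<unsigned, 4>; // one narrow vector (4 lanes)
using Wide = std::array<unsigned, 8>; // concatenation of two halves

static Wide concat(const Half &Lo, const Half &Hi) {
  Wide W{};
  for (int i = 0; i < 4; ++i) { W[i] = Lo[i]; W[i + 4] = Hi[i]; }
  return W;
}

static Half extractHalf(const Wide &W, int N) { // N = 0 (low) or 1 (high)
  Half H{};
  for (int i = 0; i < 4; ++i) H[i] = W[N * 4 + i];
  return H;
}

int main() {
  Half X1{1, 2, 3, 4}, X2{5, 6, 7, 8}, Y1{9, 9, 9, 9}, Y2{3, 3, 3, 3};
  Wide A = concat(X1, X2), B = concat(Y1, Y2), WideAnd{};
  for (int i = 0; i < 8; ++i) WideAnd[i] = A[i] & B[i]; // the wide binop
  for (int N = 0; N < 2; ++N) {
    Half Narrow = extractHalf(WideAnd, N); // extract (binop ...), N
    const Half &XN = N ? X2 : X1, &YN = N ? Y2 : Y1;
    for (int i = 0; i < 4; ++i)
      assert(Narrow[i] == (XN[i] & YN[i])); // equals binop XN, YN
  }
}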
+ bool ConcatL = + LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; + bool ConcatR = + RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; + if (!ConcatL && !ConcatR) + return SDValue(); + + // If one of the binop operands was not the result of a concat, we must + // extract a half-sized operand for our new narrow binop. We can't just reuse + // the original extract index operand because we may have bitcasted. + unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; + unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); + EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + SDLoc DL(Extract); + + // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN + // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) + // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN + SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(0), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(1), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); + return DAG.getBitcast(VT, NarrowBinOp); +} + +/// If we are extracting a subvector from a wide vector load, convert to a +/// narrow load to eliminate the extraction: +/// (extract_subvector (load wide vector)) --> (load narrow vector) +static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Add support for big-endian. The offset calculation must be adjusted. + if (DAG.getDataLayout().isBigEndian()) + return SDValue(); + + // TODO: The one-use check is overly conservative. Check the cost of the + // extract instead or remove that condition entirely. + auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0)); + auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); + if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx) + return SDValue(); + + // The narrow load will be offset from the base address of the old load if + // we are extracting from something besides index 0 (little-endian). + EVT VT = Extract->getValueType(0); + SDLoc DL(Extract); + SDValue BaseAddr = Ld->getOperand(1); + unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + + // TODO: Use "BaseIndexOffset" to make this more effective. + SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, + VT.getStoreSize()); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); + + // The new load must have the same position as the old load in terms of memory + // dependency. Create a TokenFactor for Ld and NewLd and update uses of Ld's + // output chain to use that TokenFactor. + // TODO: This code is based on a similar sequence in x86 lowering. It should + // be moved to a helper function, so it can be shared and reused. 
+ if (Ld->hasAnyUseOfValue(1)) { + SDValue OldChain = SDValue(Ld, 1); + SDValue NewChain = SDValue(NewLd.getNode(), 1); + SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + OldChain, NewChain); + DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor); + DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain); + } + + return NewLd; +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); - if (V->getOpcode() == ISD::CONCAT_VECTORS) { - // Combine: - // (extract_subvec (concat V1, V2, ...), i) - // Into: - // Vi if possible - // Only operand 0 is checked as 'concat' assumes all inputs of the same - // type. - if (V->getOperand(0).getValueType() != NVT) - return SDValue(); + // Extract from UNDEF is UNDEF. + if (V.isUndef()) + return DAG.getUNDEF(NVT); + + if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) + if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) + return NarrowLoad; + + // Combine: + // (extract_subvec (concat V1, V2, ...), i) + // Into: + // Vi if possible + // Only operand 0 is checked as 'concat' assumes all inputs of the same + // type. + if (V->getOpcode() == ISD::CONCAT_VECTORS && + isa<ConstantSDNode>(N->getOperand(1)) && + V->getOperand(0).getValueType() == NVT) { unsigned Idx = N->getConstantOperandVal(1); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && @@ -13633,19 +14635,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { // Handle only simple case where vector being inserted and vector - // being extracted are of same type, and are half size of larger vectors. - EVT BigVT = V->getOperand(0).getValueType(); + // being extracted are of same size. EVT SmallVT = V->getOperand(1).getValueType(); - if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits()) + if (!NVT.bitsEq(SmallVT)) return SDValue(); - // Only handle cases where both indexes are constants with the same type. + // Only handle cases where both indexes are constants. ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2)); - if (InsIdx && ExtIdx && - InsIdx->getValueType(0).getSizeInBits() <= 64 && - ExtIdx->getValueType(0).getSizeInBits() <= 64) { + if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: @@ -13661,6 +14660,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { } } + if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) + return NarrowBOp; + return SDValue(); } @@ -13892,6 +14894,163 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } +// Match shuffles that can be converted to any_vector_extend_in_reg. +// This is often generated during legalization. +// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) +// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. +static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + EVT VT = SVN->getValueType(0); + bool IsBigEndian = DAG.getDataLayout().isBigEndian(); + + // TODO Add support for big-endian when we have a test case. 
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue N0 = SVN->getOperand(0);
+
+ // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
+ auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
+ // power-of-2 extensions as they are the most likely.
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+ if (!isAnyExtend(Scale))
+ continue;
+
+ EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+ return DAG.getBitcast(VT,
+ DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+ }
+
+ return SDValue();
+}
+
+// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
+// each source element of a large type into the lowest elements of a smaller
+// destination type. This is often generated during legalization.
+// If the source node itself was a '*_extend_vector_inreg' node then we should
+// be able to remove it.
+static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SDValue N0 = SVN->getOperand(0);
+ while (N0.getOpcode() == ISD::BITCAST)
+ N0 = N0.getOperand(0);
+
+ unsigned Opcode = N0.getOpcode();
+ if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ ArrayRef<int> Mask = SVN->getMask();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
+
+ // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
+ // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
+ // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
+ auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // At the moment we just handle the case where we've truncated back to the
+ // same size as before the extension.
+ // TODO: handle more extension/truncation cases as cases arise.
+ if (EltSizeInBits != ExtSrcSizeInBits)
+ return SDValue();
+
+ // Attempt to match a 'truncate_vector_inreg' shuffle; we just search for
+ // power-of-2 truncations as they are the most likely.
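// [Editor's illustration, not part of the patch: standalone versions of the
// isAnyExtend/isTruncate mask predicates defined above, exercised on the
// masks quoted in the comments (e.g. <0,-1,1,-1> is a Scale=2 any-extend and
// <0,2,-1,-1> a Scale=2 truncate). The function names are hypothetical.]
#include <cassert>
#include <vector>

static bool isAnyExtendMask(const std::vector<int> &Mask, unsigned Scale) {
  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match anything
    if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
      continue; // every Scale-th lane reads source lane i/Scale
    return false;
  }
  return true;
}

static bool isTruncateMask(const std::vector<int> &Mask, unsigned Scale) {
  unsigned NumElts = Mask.size();
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
      continue; // lane i reads source lane i*Scale
    return false;
  }
  return true;
}

int main() {
  assert(isAnyExtendMask({0, -1, 1, -1}, 2)); // v2i64 any-extend of v4i32
  assert(isTruncateMask({0, 2, -1, -1}, 2));  // v4i32 truncate from v2i64
  assert(!isTruncateMask({0, 1, 2, 3}, 2));   // identity mask is no truncate
}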
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2)
+ if (isTruncate(Scale))
+ return DAG.getBitcast(VT, N00);
+
+ return SDValue();
+}
+
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+// introducing undefs in the folded mask which are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
+ ShuffleVectorSDNode *Splat,
+ SelectionDAG &DAG) {
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+ // may be exposing an undef element at index 1 to the users of the shuffle,
+ // one that was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(UserMask, SplatMask))
+ return SDValue(Splat, 0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : UserMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
 EVT VT = N->getValueType(0);
 unsigned NumElts = VT.getVectorNumElements();
@@ -13938,6 +15097,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
 return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
 }

+ // A shuffle of a single vector that is a splat can always be folded.
+ if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
+ if (N1->isUndef() && N0Shuf->isSplat())
+ return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
+
 // If it is a splat, check if the argument vector is another splat or a
 // build_vector.
 if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
+ if (SDValue V = combineTruncationShuffle(SVN, DAG)) + return V; + if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.isUndef() || @@ -14253,6 +15425,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); + // If inserting an UNDEF, just return the original vector. + if (N1.isUndef()) + return N0; + + // If this is an insert of an extracted vector into an undef vector, we can + // just use the input to the extract. + if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) + return N1.getOperand(0); + // Combine INSERT_SUBVECTORs where we are inserting to the same index. // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) @@ -14262,26 +15444,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); - if (N0.getValueType() != N1.getValueType()) + if (!isa<ConstantSDNode>(N2)) return SDValue(); + unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); + + // Canonicalize insert_subvector dag nodes. + // Example: + // (insert_subvector (insert_subvector A, Idx0), Idx1) + // -> (insert_subvector (insert_subvector A, Idx1), Idx0) + if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && + N1.getValueType() == N0.getOperand(1).getValueType() && + isa<ConstantSDNode>(N0.getOperand(2))) { + unsigned OtherIdx = N0.getConstantOperandVal(2); + if (InsIdx < OtherIdx) { + // Swap nodes. + SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, + N0.getOperand(0), N1, N2); + AddToWorklist(NewOp.getNode()); + return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), + VT, NewOp, N0.getOperand(1), N0.getOperand(2)); + } + } + // If the input vector is a concatenation, and the insert replaces - // one of the halves, we can optimize into a single concat_vectors. - if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 && - N2.getOpcode() == ISD::Constant) { - APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue(); - - // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors Z, Y) - if (InsIdx == 0) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1, - N0.getOperand(1)); + // one of the pieces, we can optimize into a single concat_vectors. + if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && + N0.getOperand(0).getValueType() == N1.getValueType()) { + unsigned Factor = N1.getValueType().getVectorNumElements(); - // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) -> - // (concat_vectors X, Z) - if (InsIdx == VT.getVectorNumElements() / 2) - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0), - N1); + SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); + Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } return SDValue(); @@ -14366,9 +15561,9 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { // Extract the sub element from the constant bit mask. 
if (DAG.getDataLayout().isBigEndian()) { - Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits); + Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits); } else { - Bits = Bits.lshr(SubIdx * NumSubBits); + Bits.lshrInPlace(SubIdx * NumSubBits); } if (Split > 1) @@ -15041,7 +16236,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -15096,7 +16291,7 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { /// As a result, we precompute A/2 prior to the iteration loop. SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal) { + SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); @@ -15140,7 +16335,7 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, - SDNodeFlags *Flags, bool Reciprocal) { + SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); @@ -15185,7 +16380,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if /// Op can be zero. -SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, +SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Reciprocal) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -15238,17 +16433,17 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, return SDValue(); } -SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, true); } -SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) { +SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, false); } /// Return true if base is a frame index, which is known not to alias with /// anything but itself. Provides base object and offset as results. -static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, +static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, const GlobalValue *&GV, const void *&CV) { // Assume it is a primitive operation. 
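// [Editor's illustration, not part of the patch: the Newton-Raphson step
// quoted above for BuildReciprocalEstimate, X_{i+1} = X_i * (2 - A * X_i),
// roughly doubles the number of correct digits per iteration. A scalar
// demonstration with a rough initial estimate of 1/3; names are hypothetical.]
#include <cassert>
#include <cmath>

static double refineRecip(double A, double X) { return X * (2.0 - A * X); }

int main() {
  double A = 3.0, X = 0.3; // crude estimate of 1/A = 1/3
  for (int i = 0; i < 4; ++i)
    X = refineRecip(A, X); // relative error squares each step
  assert(std::fabs(X - 1.0 / 3.0) < 1e-12);
}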
Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr;
@@ -15257,7 +16452,7 @@ static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
 if (Base.getOpcode() == ISD::ADD) {
 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
 Base = Base.getOperand(0);
- Offset += C->getZExtValue();
+ Offset += C->getSExtValue();
 }
 }

@@ -15300,54 +16495,68 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
 if (Op1->isInvariant() && Op0->writeMem())
 return false;

+ unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3;
+ unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3;
+
+ // Check for BaseIndexOffset matching.
+ BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
+ BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
+ if (BasePtr0.equalBaseIndex(BasePtr1))
+ return !((BasePtr0.Offset + NumBytes0 <= BasePtr1.Offset) ||
+ (BasePtr1.Offset + NumBytes1 <= BasePtr0.Offset));
+
+ // FIXME: findBaseOffset and the ConstantValue/GlobalValue/FrameIndex
+ // analysis should be modified to use BaseIndexOffset.
+
 // Gather base node and offset information.
- SDValue Base1, Base2;
- int64_t Offset1, Offset2;
- const GlobalValue *GV1, *GV2;
- const void *CV1, *CV2;
- bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(),
+ SDValue Base0, Base1;
+ int64_t Offset0, Offset1;
+ const GlobalValue *GV0, *GV1;
+ const void *CV0, *CV1;
+ bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(),
+ Base0, Offset0, GV0, CV0);
+ bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(),
 Base1, Offset1, GV1, CV1);
- bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(),
- Base2, Offset2, GV2, CV2);

- // If they have a same base address then check to see if they overlap.
- if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2)))
- return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
- (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+ // If they have the same base address, then check to see if they overlap.
+ if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1)))
+ return !((Offset0 + NumBytes0) <= Offset1 ||
+ (Offset1 + NumBytes1) <= Offset0);

 // It is possible for different frame indices to alias each other, mostly
 // when tail call optimization reuses return address slots for arguments.
 // To catch this case, look up the actual index of frame indices to compute
 // the real alias relationship.
- if (isFrameIndex1 && isFrameIndex2) {
+ if (IsFrameIndex0 && IsFrameIndex1) {
 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex());
 Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex());
- Offset2 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base2)->getIndex());
- return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
- (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+ return !((Offset0 + NumBytes0) <= Offset1 ||
+ (Offset1 + NumBytes1) <= Offset0);
 }

 // Otherwise, if we know what the bases are, and they aren't identical, then
 // we know they cannot alias.
- if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2))
+ if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1))
 return false;

 // If we know required SrcValue1 and SrcValue2 have relatively large alignment
 // compared to the size and offset of the access, we may be able to prove they
- // do not alias. 
This check is conservative for now to catch cases created by + // do not alias. This check is conservative for now to catch cases created by // splitting vector types. - if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) && - (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) && - (Op0->getMemoryVT().getSizeInBits() >> 3 == - Op1->getMemoryVT().getSizeInBits() >> 3) && - (Op0->getOriginalAlignment() > (Op0->getMemoryVT().getSizeInBits() >> 3))) { - int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment(); - int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment(); + int64_t SrcValOffset0 = Op0->getSrcValueOffset(); + int64_t SrcValOffset1 = Op1->getSrcValueOffset(); + unsigned OrigAlignment0 = Op0->getOriginalAlignment(); + unsigned OrigAlignment1 = Op1->getOriginalAlignment(); + if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && + NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) { + int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0; + int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1; // There is no overlap between these relatively aligned accesses of similar - // size, return no alias. - if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 || - (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1) + // size. Return no alias. + if ((OffAlign0 + NumBytes0) <= OffAlign1 || + (OffAlign1 + NumBytes1) <= OffAlign0) return false; } @@ -15359,20 +16568,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif - if (UseAA && + + if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. - int64_t MinOffset = std::min(Op0->getSrcValueOffset(), - Op1->getSrcValueOffset()); - int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) + - Op0->getSrcValueOffset() - MinOffset; - int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) + - Op1->getSrcValueOffset() - MinOffset; + int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); + int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; + int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = - AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap1, - UseTBAA ? Op0->getAAInfo() : AAMDNodes()), - MemoryLocation(Op1->getMemOperand()->getValue(), Overlap2, - UseTBAA ? Op1->getAAInfo() : AAMDNodes())); + AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, + UseTBAA ? Op0->getAAInfo() : AAMDNodes()), + MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, + UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } @@ -15454,6 +16661,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, ++Depth; break; + case ISD::CopyFromReg: + // Forward past CopyFromReg. + Chains.push_back(Chain.getOperand(0)); + ++Depth; + break; + default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); @@ -15482,6 +16695,18 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +// This function tries to collect a bunch of potentially interesting +// nodes to improve the chains of, all at once. 
This might seem +// redundant, as this function gets called when visiting every store +// node, so why not let the work be done on each store as it's visited? +// +// I believe this is mainly important because MergeConsecutiveStores +// is unable to deal with merging stores of different sizes, so unless +// we improve the chains of all the potential candidates up-front +// before running MergeConsecutiveStores, it might only see some of +// the nodes that will eventually be candidates, and then not be able +// to go from a partially-merged state to the desired final +// fully-merged state. bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. @@ -15517,10 +16742,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { if (!Ptr.equalBaseIndex(BasePtr)) break; - // Find the next memory operand in the chain. If the next operand in the - // chain is a store then move up and continue the scan with the next - // memory operand. If the next operand is a load save it and use alias - // information to check if it interferes with anything. + // Walk up the chain to find the next store node, ignoring any + // intermediate loads. Any other kind of node will halt the loop. SDNode *NextInChain = Index->getChain().getNode(); while (true) { if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { @@ -15539,9 +16762,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { Index = nullptr; break; } - } + } // end while } + // At this point, ChainedStores lists all of the Store nodes + // reachable by iterating up through chain nodes matching the above + // conditions. For each such store identified, try to find an + // earlier chain to attach the store to which won't violate the + // required ordering. bool MadeChangeToSt = false; SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; @@ -15565,7 +16793,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { } /// This is the entry point for the file. -void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, +void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. 
DAGCombiner(*this, AA, OptLevel).Run(Level); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index e2f33bb433ba..5003b79974eb 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1,4 +1,4 @@ -//===-- FastISel.cpp - Implementation of the FastISel class ---------------===// +//===- FastISel.cpp - Implementation of the FastISel class ----------------===// // // The LLVM Compiler Infrastructure // @@ -39,35 +39,76 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "isel" @@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " "target-specific selector"); STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); -void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - IsZExt = CS->paramHasAttr(AttrIdx, 
Attribute::ZExt);
- IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
- IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
- IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
- IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
- IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
- IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
- IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
- IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
- Alignment = CS->getParamAlignment(AttrIdx);
-}
-
 /// Set the current block to which generated machine instructions will be
 /// appended, and clear the local CSE map.
 void FastISel::startNewBlock() {
@@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
 // Try to emit the constant by using an integer constant with a cast.
 const APFloat &Flt = CF->getValueAPF();
 EVT IntVT = TLI.getPointerTy(DL);
-
- uint64_t x[2];
 uint32_t IntBitWidth = IntVT.getSizeInBits();
+ APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);
 bool isExact;
- (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
- APFloat::rmTowardZero, &isExact);
+ (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
 if (isExact) {
- APInt IntVal(IntBitWidth, x);
 unsigned IntegerReg =
- getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+ getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
 if (IntegerReg != 0)
 Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
 /*Kill=*/false);
@@ -600,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
 // have to worry about calling conventions and target-specific lowering code.
 // Instead we perform the call lowering right here.
 //
- // CALLSEQ_START(0...)
+ // CALLSEQ_START(0, 0...)
 // STACKMAP(id, nbytes, ...)
 // CALLSEQ_END(0, 0)
 //
@@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
 MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 TII.get(TargetOpcode::STACKMAP));
 for (auto const &MO : Ops)
- MIB.addOperand(MO);
+ MIB.add(MO);

 // Issue CALLSEQ_END
 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
@@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
 Args.reserve(NumArgs);

 // Populate the argument list.
- // Attributes for args start at offset 1, after the return attribute.
ImmutableCallSite CS(CI); - for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; - ArgI != ArgE; ++ArgI) { + for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) { Value *V = CI->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); @@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, AttrI); + Entry.setAttributes(&CS, ArgIdx); Args.push_back(Entry); } @@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) { TII.get(TargetOpcode::PATCHPOINT)); for (auto &MO : Ops) - MIB.addOperand(MO); + MIB.add(MO); MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI); @@ -841,9 +861,28 @@ bool FastISel::selectPatchpoint(const CallInst *I) { return true; } -/// Returns an AttributeSet representing the attributes applied to the return +bool FastISel::selectXRayCustomEvent(const CallInst *I) { + const auto &Triple = TM.getTargetTriple(); + if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) + return true; // don't do anything to this instruction. + SmallVector<MachineOperand, 8> Ops; + Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)), + /*IsDef=*/false)); + Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)), + /*IsDef=*/false)); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::PATCHABLE_EVENT_CALL)); + for (auto &MO : Ops) + MIB.add(MO); + // Insert the Patchable Event Call instruction, that gets lowered properly. + return true; +} + + +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. -static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -852,8 +891,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName, @@ -885,9 +924,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol, ArgListEntry Entry; Entry.Val = V; Entry.Ty = V->getType(); - Entry.setAttributes(&CS, ArgI + 1); + Entry.setAttributes(&CS, ArgI); Args.push_back(Entry); } + TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args); CallLoweringInfo CLI; CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs); @@ -1021,7 +1061,7 @@ bool FastISel::lowerCall(const CallInst *CI) { Entry.Ty = V->getType(); // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); Args.push_back(Entry); } @@ -1110,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } - unsigned Offset = 0; + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. 
+ const auto *Arg = + dyn_cast<Argument>(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return true; + Optional<MachineOperand> Op; - if (const auto *Arg = dyn_cast<Argument>(Address)) - // Some arguments' frame index is recorded during argument lowering. - Offset = FuncInfo.getArgumentFrameIndex(Arg); - if (Offset) - Op = MachineOperand::CreateFI(Offset); - if (!Op) - if (unsigned Reg = lookUpRegForValue(Address)) - Op = MachineOperand::CreateReg(Reg, false); + if (unsigned Reg = lookUpRegForValue(Address)) + Op = MachineOperand::CreateReg(Reg, false); // If we have a VLA that has a "use" in a metadata node that's then used // here but it has no other uses, then we have a problem. E.g., @@ -1143,13 +1183,15 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { "Expected inlined-at fields to agree"); if (Op->isReg()) { Op->setIsDebug(true); + // A dbg.declare describes the address of a source variable, so lower it + // into an indirect DBG_VALUE. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0, - DI->getVariable(), DI->getExpression()); + TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true, + Op->getReg(), 0, DI->getVariable(), DI->getExpression()); } else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(0) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); @@ -1229,6 +1271,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: return selectPatchpoint(II); + + case Intrinsic::xray_customevent: + return selectXRayCustomEvent(II); } return fastLowerIntrinsicCall(II); @@ -1362,7 +1407,7 @@ bool FastISel::selectInstruction(const Instruction *I) { if (const auto *Call = dyn_cast<CallInst>(I)) { const Function *F = Call->getCalledFunction(); - LibFunc::Func Func; + LibFunc Func; // As a special case, don't handle calls to builtin library functions that // may be translated directly to target instructions. @@ -1665,7 +1710,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo, TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo), SkipTargetIndependentISel(SkipTargetIndependentISel) {} -FastISel::~FastISel() {} +FastISel::~FastISel() = default; bool FastISel::fastLowerArguments() { return false; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 377a5237f15a..606b8952f3c1 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF = &mf; TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - MachineModuleInfo &MMI = MF->getMMI(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); unsigned StackAlign = TFI->getStackAlignment(); @@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I))) InitializeRegForValue(&I); - // Collect llvm.dbg.declare information. This is done now instead of - // during the initial isel pass through the IR so that it is done - // in a predictable order. 
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) { - assert(DI->getVariable() && "Missing variable"); - assert(DI->getDebugLoc() && "Missing location"); - if (MMI.hasDebugInfo()) { - // Don't handle byval struct arguments or VLAs, for example. - // Non-byval arguments are handled here (they refer to the stack - // temporary alloca at this point). - const Value *Address = DI->getAddress(); - if (Address) { - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) - Address = BCI->getOperand(0); - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { - DenseMap<const AllocaInst *, int>::iterator SI = - StaticAllocaMap.find(AI); - if (SI != StaticAllocaMap.end()) { // Check for VLAs. - int FI = SI->second; - MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(), - FI, DI->getDebugLoc()); - } - } - } - } - } - // Decide the preferred extend type for a value. PreferredExtendType[&I] = getPreferredExtendForValue(&I); } @@ -400,10 +372,9 @@ FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) { if (!LOI->IsValid) return nullptr; - if (BitWidth > LOI->KnownZero.getBitWidth()) { + if (BitWidth > LOI->Known.getBitWidth()) { LOI->NumSignBits = 1; - LOI->KnownZero = LOI->KnownZero.zextOrTrunc(BitWidth); - LOI->KnownOne = LOI->KnownOne.zextOrTrunc(BitWidth); + LOI->Known = LOI->Known.zextOrTrunc(BitWidth); } return LOI; @@ -436,17 +407,15 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { Value *V = PN->getIncomingValue(0); if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) { DestLOI.NumSignBits = 1; - APInt Zero(BitWidth, 0); - DestLOI.KnownZero = Zero; - DestLOI.KnownOne = Zero; + DestLOI.Known = KnownBits(BitWidth); return; } if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { APInt Val = CI->getValue().zextOrTrunc(BitWidth); DestLOI.NumSignBits = Val.getNumSignBits(); - DestLOI.KnownZero = ~Val; - DestLOI.KnownOne = Val; + DestLOI.Known.Zero = ~Val; + DestLOI.Known.One = Val; } else { assert(ValueMap.count(V) && "V should have been placed in ValueMap when its" "CopyToReg node was created."); @@ -463,25 +432,23 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { DestLOI = *SrcLOI; } - assert(DestLOI.KnownZero.getBitWidth() == BitWidth && - DestLOI.KnownOne.getBitWidth() == BitWidth && + assert(DestLOI.Known.Zero.getBitWidth() == BitWidth && + DestLOI.Known.One.getBitWidth() == BitWidth && "Masks should have the same bit width as the type."); for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = PN->getIncomingValue(i); if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) { DestLOI.NumSignBits = 1; - APInt Zero(BitWidth, 0); - DestLOI.KnownZero = Zero; - DestLOI.KnownOne = Zero; + DestLOI.Known = KnownBits(BitWidth); return; } if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { APInt Val = CI->getValue().zextOrTrunc(BitWidth); DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits()); - DestLOI.KnownZero &= ~Val; - DestLOI.KnownOne &= Val; + DestLOI.Known.Zero &= ~Val; + DestLOI.Known.One &= Val; continue; } @@ -498,8 +465,8 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { return; } DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits); - DestLOI.KnownZero &= SrcLOI->KnownZero; - DestLOI.KnownOne &= SrcLOI->KnownOne; + DestLOI.Known.Zero &= SrcLOI->Known.Zero; + DestLOI.Known.One &= SrcLOI->Known.One; } } @@ -515,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A, /// If the argument does not have 
any assigned frame index then 0 is /// returned. int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { - DenseMap<const Argument *, int>::iterator I = - ByValArgFrameIndexMap.find(A); + auto I = ByValArgFrameIndexMap.find(A); if (I != ByValArgFrameIndexMap.end()) return I->second; DEBUG(dbgs() << "Argument does not have assigned frame index!\n"); - return 0; + return INT_MAX; } unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 4a9042cfb3f4..b235e19aaab2 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -161,7 +161,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, if (VRBase) { DstRC = MRI->getRegClass(VRBase); } else if (UseRC) { - assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!"); + assert(TRI->isTypeLegalForClass(*UseRC, VT) && + "Incompatible phys register def and uses!"); DstRC = UseRC; } else { DstRC = TLI->getRegClassFor(VT); @@ -235,7 +236,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, if (II.OpInfo[i].isOptionalDef()) { // Optional def must be a physical register. - unsigned NumResults = CountResults(Node); VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg(); assert(TargetRegisterInfo::isPhysicalRegister(VRBase)); MIB.addReg(VRBase, RegState::Define); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index b0028252836a..d0a8b34c69c6 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -899,6 +899,39 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { } } +static TargetLowering::LegalizeAction +getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) { + unsigned EqOpc; + switch (Opcode) { + default: llvm_unreachable("Unexpected FP pseudo-opcode"); + case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; + case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; + case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; + case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; + case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break; + case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; + case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; + case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; + case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; + } + + auto Action = TLI.getOperationAction(EqOpc, VT); + + // We don't currently handle Custom or Promote for strict FP pseudo-ops. + // For now, we just expand for those cases. + if (Action != TargetLowering::Legal) + Action = TargetLowering::Expand; + + // ISD::FPOWI returns 'Legal' even though it should be expanded. + if (Opcode == ISD::STRICT_FPOWI && Action == TargetLowering::Legal) + Action = TargetLowering::Expand; + + return Action; +} + /// Return a legal replacement for the given operation, with all legal operands. 
void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); @@ -1043,6 +1076,25 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + // These pseudo-ops get legalized as if they were their non-strict + // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT + // is also legal, but if ISD::FSQRT requires expansion then so does + // ISD::STRICT_FSQRT. + Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(), + Node->getValueType(0)); + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { @@ -1192,8 +1244,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain - // we will make the index dependent on the load). - if (SDNode::hasPredecessorHelper(ST, Visited, Worklist)) + // we will make the index dependent on the load). Also, the store might be + // dependent on the extractelement and introduce a cycle when creating + // the load. + if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || + ST->hasPredecessor(Op.getNode())) continue; StackPtr = ST->getBasePtr(); @@ -1340,7 +1395,7 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, // Convert to an integer of the same size. if (TLI.isTypeLegal(IVT)) { State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value); - State.SignMask = APInt::getSignBit(NumBits); + State.SignMask = APInt::getSignMask(NumBits); State.SignBit = NumBits - 1; return; } @@ -1490,7 +1545,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); @@ -1909,8 +1964,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1935,9 +1990,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setTailCall(isTailCall) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1960,8 +2019,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, for (unsigned i = 0; i != NumOps; ++i) { Entry.Node = Ops[i]; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1970,9 +2029,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -1994,8 +2056,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2004,9 +2066,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2019,6 +2084,9 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + 
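// Editor's note (illustrative commentary, not part of the patch): at this
// point a strict node such as STRICT_FSQRT(chain, x) has been rewritten in
// place to the plain FSQRT(x), with users of its chain result spliced onto
// the incoming chain. That lets the switch below keep selecting the libcall
// purely by value type (f32 -> SQRT_F32, f64 -> SQRT_F64, and so on),
// unchanged from the pre-strict-FP code.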
RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -2081,8 +2149,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } @@ -2090,8 +2158,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDValue FIPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = FIPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2099,9 +2167,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2185,24 +2256,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, // Pass the argument. Entry.Node = Node->getOperand(0); Entry.Ty = RetTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Pass the return address of sin. SDValue SinPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = SinPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); // Also pass the return address of the cos. SDValue CosPtr = DAG.CreateStackTemporary(RetVT); Entry.Node = CosPtr; Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -2210,9 +2281,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), - Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); + CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( + TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, + std::move(Args)); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -2529,12 +2600,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0); APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0); for (unsigned J = 0; J != Sz; J += 8) { - MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J)); - MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J)); - MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J)); - MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J)); - MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J)); - MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J)); + MaskHi4 = MaskHi4 | (0xF0ull << J); + MaskLo4 = MaskLo4 | (0x0Full << J); + MaskHi2 = MaskHi2 | (0xCCull << J); + MaskLo2 = MaskLo2 | (0x33ull << J); + MaskHi1 = MaskHi1 | (0xAAull << J); + MaskLo1 = MaskLo1 | (0x55ull << J); } // BSWAP if the type is wider than a single byte. 
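// Editor's note: a self-contained sketch of the mask-and-shift technique the
// ExpandBITREVERSE hunk above builds as DAG nodes, written as plain C++ for a
// fixed 32-bit value. The byte swap may be done before or after the in-byte
// swaps; the result is the same.
#include <cstdint>
constexpr uint32_t reverseBits32(uint32_t V) {
  V = ((V & 0xF0F0F0F0u) >> 4) | ((V & 0x0F0F0F0Fu) << 4); // swap nibbles
  V = ((V & 0xCCCCCCCCu) >> 2) | ((V & 0x33333333u) << 2); // swap bit pairs
  V = ((V & 0xAAAAAAAAu) >> 1) | ((V & 0x55555555u) << 1); // swap single bits
  // The BSWAP step: reverse the bytes themselves.
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) |
         ((V << 8) & 0x00FF0000u) | (V << 24);
}
static_assert(reverseBits32(0x00000001u) == 0x80000000u,
              "LSB must map to MSB");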
@@ -2573,7 +2644,7 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) { DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); APInt Shift(Sz, 1); - Shift = Shift.shl(J); + Shift <<= J; Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); } @@ -2968,7 +3039,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT NVT = Node->getValueType(0); APFloat apf(DAG.EVTToAPFloatSemantics(VT), APInt::getNullValue(VT.getSizeInBits())); - APInt x = APInt::getSignBit(NVT.getSizeInBits()); + APInt x = APInt::getSignMask(NVT.getSizeInBits()); (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven); Tmp1 = DAG.getConstantFP(apf, dl, VT); Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT), @@ -3091,7 +3162,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getVectorIdxTy(DAG.getDataLayout())))); } - Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + Tmp1 = DAG.getBuildVector(VT, dl, Ops); // We may have changed the BUILD_VECTOR type. Cast it back to the Node type. Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1); Results.push_back(Tmp1); @@ -3237,7 +3308,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags; + const SDNodeFlags Flags = Node->getFlags(); Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags); Results.push_back(Tmp1); @@ -3481,11 +3552,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // part. unsigned LoSize = VT.getSizeInBits(); SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getNode(ISD::SRA, dl, VT, LHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getNode(ISD::SRA, dl, VT, RHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); @@ -3790,8 +3861,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Scalars.push_back(DAG.getNode(Node->getOpcode(), dl, VT.getScalarType(), Ex, Sh)); } - SDValue Result = - DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars); + + SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -3830,10 +3901,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee( + CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -3870,10 +3942,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("abort", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol( + "abort", 
TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); Results.push_back(CallResult.second); @@ -3890,16 +3962,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMAX_PPCF128)); break; case ISD::FSQRT: + case ISD::STRICT_FSQRT: Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128)); break; case ISD::FSIN: + case ISD::STRICT_FSIN: Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128)); break; case ISD::FCOS: + case ISD::STRICT_FCOS: Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128)); @@ -3909,26 +3984,31 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: + case ISD::STRICT_FLOG: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128)); break; case ISD::FLOG2: + case ISD::STRICT_FLOG2: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128)); break; case ISD::FLOG10: + case ISD::STRICT_FLOG10: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128)); break; case ISD::FEXP: + case ISD::STRICT_FEXP: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128)); break; case ISD::FEXP2: + case ISD::STRICT_FEXP2: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128)); @@ -3949,11 +4029,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::CEIL_PPCF128)); break; case ISD::FRINT: + case ISD::STRICT_FRINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128)); break; case ISD::FNEARBYINT: + case ISD::STRICT_FNEARBYINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, @@ -3968,11 +4050,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::ROUND_PPCF128)); break; case ISD::FPOWI: + case ISD::STRICT_FPOWI: Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128)); break; case ISD::FPOW: + case ISD::STRICT_FPOW: Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); @@ -4170,6 +4254,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { ReplacedNode(Node); break; } + case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::UDIV: @@ -4424,8 +4509,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { NewOps.push_back(Elt); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); - + SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps); Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); break; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 72b56d84d945..c1cb5d9b5235 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ 
-72,7 +72,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: - R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; + R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break; case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; @@ -171,7 +171,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { } } -SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, keep the extracted value in register. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NewOp.getValueType().getVectorElementType(), @@ -459,7 +462,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) - SoftenFloatResult(Op.getNode(), 0); + AddToWorklist(Op.getNode()); } if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { @@ -472,8 +475,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); - if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) - Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } @@ -1054,15 +1055,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - assert(NVT.getSizeInBits() == integerPartWidth && + assert(NVT.getSizeInBits() == 64 && "Do not know how to expand this float constant!"); APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt(); SDLoc dl(N); Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - APInt(integerPartWidth, C.getRawData()[1])), + APInt(64, C.getRawData()[1])), dl, NVT); Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), - APInt(integerPartWidth, C.getRawData()[0])), + APInt(64, C.getRawData()[0])), dl, NVT); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index dc436ce04514..92b0d2ae4015 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -21,6 +21,7 @@ #include "LegalizeTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -134,6 +135,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SMULO: case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break; + case ISD::ADDCARRY: + case ISD::SUBCARRY: Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break; + case ISD::ATOMIC_LOAD: Res = 
PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break; @@ -510,9 +514,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); EVT ValueVTs[] = { N->getValueType(0), NVT }; - SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; + SDValue Ops[3] = { N->getOperand(0), N->getOperand(1) }; + unsigned NumOps = N->getNumOperands(); + assert(NumOps <= 3 && "Too many operands"); + if (NumOps == 3) + Ops[2] = N->getOperand(2); + SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), - DAG.getVTList(ValueVTs), Ops); + DAG.getVTList(ValueVTs), makeArrayRef(Ops, NumOps)); // Modified the sum result - switch anything that used the old sum to use // the new one. @@ -690,7 +699,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; - case TargetLowering::TypeSplitVector: + case TargetLowering::TypeSplitVector: { EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); @@ -709,6 +718,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } + case TargetLowering::TypeWidenVector: { + SDValue WideInOp = GetWidenedVector(InOp); + + // Truncate widened InOp. + unsigned NumElem = WideInOp.getValueType().getVectorNumElements(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), + N->getValueType(0).getScalarType(), NumElem); + SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp); + + // Zero extend so that the elements are of same type as those of NVT + EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(), + NumElem); + SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc); + + // Extract the low NVT subvector. + MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); + SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx); + } + } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); @@ -742,6 +771,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo) { + if (ResNo == 1) + return PromoteIntRes_Overflow(N); + llvm_unreachable("Not implemented"); +} + SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { // Promote the overflow bit trivially. if (ResNo == 1) @@ -904,6 +939,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; + + case ISD::ADDCARRY: + case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break; } // If the result is null, the sub-method took care of registering results etc. @@ -1089,6 +1127,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); + if (N->getOpcode() == ISD::VSELECT) + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? 
OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); @@ -1252,6 +1294,30 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { N->getOperand(0).getValueType().getScalarType()); } +SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { + assert(OpNo == 2 && "Don't know how to promote this operand!"); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDLoc DL(N); + + auto VT = getSetCCResultType(LHS.getValueType()); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(VT); + switch (BoolType) { + case TargetLoweringBase::UndefinedBooleanContent: + Carry = DAG.getAnyExtOrTrunc(Carry, DL, VT); + break; + case TargetLoweringBase::ZeroOrOneBooleanContent: + Carry = DAG.getZExtOrTrunc(Carry, DL, VT); + break; + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + Carry = DAG.getSExtOrTrunc(Carry, DL, VT); + break; + } + + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); +} //===----------------------------------------------------------------------===// // Integer Result Expansion @@ -1371,6 +1437,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ADDE: case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break; + case ISD::ADDCARRY: + case ISD::SUBCARRY: ExpandIntRes_ADDSUBCARRY(N, Lo, Hi); break; + case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; @@ -1501,11 +1570,11 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); - APInt KnownZero, KnownOne; - DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne); + KnownBits Known; + DAG.computeKnownBits(N->getOperand(1), Known); // If we don't know anything about the high bits, exit. - if (((KnownZero|KnownOne) & HighBitMask) == 0) + if (((Known.Zero|Known.One) & HighBitMask) == 0) return false; // Get the incoming operand to be shifted. @@ -1514,7 +1583,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { // If we know that any of the high bits of the shift amount are one, then we // can do this as a couple of simple shifts. - if (KnownOne.intersects(HighBitMask)) { + if (Known.One.intersects(HighBitMask)) { // Mask out the high bit, which we know is set. Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt, DAG.getConstant(~HighBitMask, dl, ShTy)); @@ -1539,7 +1608,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { // If we know that all of the high bits of the shift amount are zero, then we // can do this as a couple of simple shifts. - if ((KnownZero & HighBitMask) == HighBitMask) { + if (HighBitMask.isSubsetOf(Known.Zero)) { // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined // shift if x is zero. We can use XOR here because x is known to be smaller // than 32. @@ -1714,6 +1783,23 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; + bool HasOpCarry = TLI.isOperationLegalOrCustom( + N->getOpcode() == ISD::ADD ? 
ISD::ADDCARRY : ISD::SUBCARRY, + TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + if (HasOpCarry) { + SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT)); + if (N->getOpcode() == ISD::ADD) { + Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps); + } else { + Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps); + } + return; + } + // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support // them. TODO: Teach operation legalization how to expand unsupported // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate @@ -1743,7 +1829,8 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (hasOVF) { - SDVTList VTList = DAG.getVTList(NVT, NVT); + EVT OvfVT = getSetCCResultType(NVT); + SDVTList VTList = DAG.getVTList(NVT, OvfVT); TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { @@ -1759,12 +1846,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, switch (BoolType) { case TargetLoweringBase::UndefinedBooleanContent: - OVF = DAG.getNode(ISD::AND, dl, NVT, DAG.getConstant(1, dl, NVT), OVF); + OVF = DAG.getNode(ISD::AND, dl, OvfVT, DAG.getConstant(1, dl, OvfVT), OVF); LLVM_FALLTHROUGH; case TargetLoweringBase::ZeroOrOneBooleanContent: + OVF = DAG.getZExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF); break; case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + OVF = DAG.getSExtOrTrunc(OVF, dl, NVT); Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF); } return; @@ -1842,6 +1931,71 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } +void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDLoc dl(N); + + SDValue Ovf; + + bool HasOpCarry = TLI.isOperationLegalOrCustom( + N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY, + TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType())); + + if (HasOpCarry) { + // Expand the subcomponents. + SDValue LHSL, LHSH, RHSL, RHSH; + GetExpandedInteger(LHS, LHSL, LHSH); + GetExpandedInteger(RHS, RHSL, RHSH); + SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); + SDValue LoOps[2] = { LHSL, RHSL }; + SDValue HiOps[3] = { LHSH, RHSH }; + + unsigned Opc = N->getOpcode() == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; + Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(Opc, dl, VTList, HiOps); + + Ovf = Hi.getValue(1); + } else { + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + auto Opc = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB; + SDValue Sum = DAG.getNode(Opc, dl, LHS.getValueType(), LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Calculate the overflow: addition overflows iff a + b < a, and subtraction + // overflows iff a - b > a. + auto Cond = N->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT; + Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond); + } + + // Legalized the flag result - switch anything that used the old flag to + // use the new one. 
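// Editor's note (illustrative commentary, not part of the patch): on a target
// with legal ADDCARRY, the HasOpCarry path above expands a double-wide UADDO
// into a carry chain, roughly:
//   {LoSum, c}   = UADDO(LHSL, RHSL)       // low halves, carry out in c
//   {HiSum, Ovf} = ADDCARRY(LHSH, RHSH, c) // high halves consume the carry
// The final carry-out Ovf is exactly the overflow bit, which is what gets
// switched over to below.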
+ ReplaceValueWith(SDValue(N, 1), Ovf); +} + +void DAGTypeLegalizer::ExpandIntRes_ADDSUBCARRY(SDNode *N, + SDValue &Lo, SDValue &Hi) { + // Expand the subcomponents. + SDValue LHSL, LHSH, RHSL, RHSH; + SDLoc dl(N); + GetExpandedInteger(N->getOperand(0), LHSL, LHSH); + GetExpandedInteger(N->getOperand(1), RHSL, RHSH); + SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); + SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; + SDValue HiOps[3] = { LHSH, RHSH, SDValue() }; + + Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); + HiOps[2] = Lo.getValue(1); + Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); + + // Legalized the flag result - switch anything that used the old flag to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); +} + void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -2508,29 +2662,6 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } -void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, - SDValue &Lo, SDValue &Hi) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDLoc dl(N); - - // Expand the result by simply replacing it with the equivalent - // non-overflow-checking operation. - SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? - ISD::ADD : ISD::SUB, dl, LHS.getValueType(), - LHS, RHS); - SplitInteger(Sum, Lo, Hi); - - // Calculate the overflow: addition overflows iff a + b < a, and subtraction - // overflows iff a - b > a. - SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, - N->getOpcode () == ISD::UADDO ? - ISD::SETULT : ISD::SETUGT); - - // Use the calculated overflow everywhere. - ReplaceValueWith(SDValue(N, 1), Ofl); -} - void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -2586,24 +2717,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. 
Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); - Entry.isSExt = true; - Entry.isZExt = false; + Entry.IsSExt = true; + Entry.IsZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) - .setSExtResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) + .setSExtResult(); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); @@ -3226,7 +3358,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } @@ -3269,7 +3401,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { @@ -3317,7 +3449,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -3420,5 +3552,5 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps); + return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cf19d75676cd..154af46c9446 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -199,8 +199,7 @@ bool DAGTypeLegalizer::run() { // non-leaves. for (SDNode &Node : DAG.allnodes()) { if (Node.getNumOperands() == 0) { - Node.setNodeId(ReadyToProcess); - Worklist.push_back(&Node); + AddToWorklist(&Node); } else { Node.setNodeId(Unanalyzed); } @@ -331,6 +330,12 @@ ScanOperands: // to the worklist etc. if (NeedsReanalyzing) { assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?"); + + // Remove any result values from SoftenedFloats as N will be revisited + // again. + for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) + SoftenedFloats.erase(SDValue(N, i)); + N->setNodeId(NewNode); // Recompute the NodeId and correct processed operands, adding the node to // the worklist if ready. @@ -749,6 +754,8 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { // new uses of From due to CSE. If this happens, replace the new uses of // From with To. 
} while (!From.use_empty()); + + SoftenedFloats.erase(From); } void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { @@ -918,9 +925,9 @@ SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) { assert(Op.getValueType().isVector() && "Only applies to vectors!"); unsigned EltWidth = Op.getScalarValueSizeInBits(); EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth); - unsigned NumElts = Op.getValueType().getVectorNumElements(); + auto EltCnt = Op.getValueType().getVectorElementCount(); return DAG.getNode(ISD::BITCAST, SDLoc(Op), - EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op); + EVT::getVectorVT(*DAG.getContext(), EltNVT, EltCnt), Op); } SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op, @@ -1077,8 +1084,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; + Entry.IsSExt = isSigned; + Entry.IsZExt = !isSigned; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), @@ -1087,9 +1094,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) - .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(SDLoc(Node)) + .setChain(InChain) + .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, + std::move(Args)) + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index ec55662d75c0..4c3b514856b7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -191,6 +191,11 @@ private: void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, SDValue &Lo, SDValue &Hi); + void AddToWorklist(SDNode *N) { + N->setNodeId(ReadyToProcess); + Worklist.push_back(N); + } + //===--------------------------------------------------------------------===// // Integer Promotion Support: LegalizeIntegerTypes.cpp //===--------------------------------------------------------------------===// @@ -274,6 +279,7 @@ private: SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); + SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); @@ -306,6 +312,7 @@ private: SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -345,6 +352,7 @@ private: void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BITREVERSE (SDNode 
*N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -423,7 +431,7 @@ private: SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); @@ -597,6 +605,7 @@ private: SDValue ScalarizeVecRes_TernaryOp(SDNode *N); SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); + SDValue ScalarizeVecRes_VecInregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); @@ -666,12 +675,14 @@ private: // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); + SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); SDValue SplitVecOp_BITCAST(SDNode *N); SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); @@ -713,6 +724,7 @@ private: SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); + SDValue WidenVSELECTAndMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); SDValue WidenVecRes_UNDEF(SDNode *N); @@ -782,6 +794,13 @@ private: /// By default, the vector will be widened with undefined values. SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); + /// Return a mask of vector type MaskVT to replace InMask. Also adjust + /// MaskVT to ToMaskVT if needed with vector extension or truncation. + SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT); + + /// Get the target mask VT, and widen if needed. 
+ EVT getSETCCWidenedResultTy(SDValue SetCC); + //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp //===--------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 3682c32460c6..aa69e0e2adfc 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -362,8 +362,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) { SmallVector<SDValue, 8> Ops; IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType()); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, - makeArrayRef(Ops.data(), NumElts)); + SDValue Vec = + DAG.getBuildVector(NVT, dl, makeArrayRef(Ops.data(), NumElts)); return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec); } @@ -396,10 +396,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { NewElts.push_back(Hi); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, - EVT::getVectorVT(*DAG.getContext(), - NewVT, NewElts.size()), - NewElts); + EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size()); + SDValue NewVec = DAG.getBuildVector(NewVecVT, dl, NewElts); // Convert the new vector to the old vector type. return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec); @@ -458,7 +456,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType()); for (unsigned i = 1; i < NumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { @@ -512,8 +510,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, GetSplitOp(Op, Lo, Hi); } -void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, - SDValue &Hi) { +static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + // Split the inputs. + SDValue Lo, Hi, LL, LH, RL, RH; + std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); + std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); + + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); + + return std::make_pair(Lo, Hi); +} + +void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH, CL, CH; SDLoc dl(N); GetSplitOp(N->getOperand(1), LL, LH); @@ -522,9 +536,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue Cond = N->getOperand(0); CL = CH = Cond; if (Cond.getValueType().isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. + else if (Cond.getOpcode() == ISD::SETCC) + std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. 
- if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) + else if (getTypeAction(Cond.getValueType()) == + TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d4fa20f35274..5f167f8de1cf 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); @@ -621,8 +622,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { } NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - Value = DAG.getNode(ISD::BUILD_VECTOR, dl, - Op.getNode()->getValueType(0), Vals); + Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals); } else { SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG); @@ -692,6 +692,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandUINT_TO_FLOAT(Op); case ISD::FNEG: return ExpandFNEG(Op); + case ISD::FSUB: + return ExpandFSUB(Op); case ISD::SETCC: return UnrollVSETCC(Op); case ISD::BITREVERSE: @@ -720,8 +722,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { assert(VT.isVector() && !Mask.getValueType().isVector() && Op1.getValueType() == Op2.getValueType() && "Invalid type"); - unsigned NumElem = VT.getVectorNumElements(); - // If we can't even use the basic vector operations of // AND,OR,XOR, we will have to scalarize the op. // Notice that the operation may be 'promoted' which means that it is @@ -745,8 +745,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { DAG.getConstant(0, DL, BitTy)); // Broadcast the mask so that the entire vector is all-one or all zero. - SmallVector<SDValue, 8> Ops(NumElem, Mask); - Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops); + Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask); // Bitcast the operands to be the same type as the mask. // This is needed when we select between FP types because @@ -1025,6 +1024,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) { return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandFSUB(SDValue Op) { + // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal, + // we can defer this to operation legalization where it will be lowered as + // a+(-b). 
+ EVT VT = Op.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && + TLI.isOperationLegalOrCustom(ISD::FADD, VT)) + return Op; // Defer to LegalizeDAG + + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); @@ -1102,7 +1113,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { (EltVT.getSizeInBits()), dl, EltVT), DAG.getConstant(0, dl, EltVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6906f67ebacb..ff0e609803d8 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + R = ScalarizeVecRes_VecInregOp(N); + break; case ISD::ANY_EXTEND: case ISD::BITREVERSE: case ISD::BSWAP: @@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::ZERO_EXTEND: + case ISD::FCANONICALIZE: R = ScalarizeVecRes_UnaryOp(N); break; @@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) { LHS, DAG.getValueType(ExtVT)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) { + SDLoc DL(N); + SDValue Op = N->getOperand(0); + + EVT OpVT = Op.getValueType(); + EVT OpEltVT = OpVT.getVectorElementType(); + EVT EltVT = N->getValueType(0).getVectorElementType(); + + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + Op = GetScalarizedVector(Op); + } else { + Op = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op, + DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); + } + + switch (N->getOpcode()) { + case ISD::ANY_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op); + } + + llvm_unreachable("Illegal extend_vector_inreg opcode"); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { // If the operand is wider than the vector element type then it is implicitly // truncated. Make that explicit here. @@ -478,7 +512,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op); } /// The vectors to concatenate have length one - use a BUILD_VECTOR instead. 
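// Editor's note: a standalone sketch of what the *_EXTEND_VECTOR_INREG nodes
// handled by ScalarizeVecRes_VecInregOp above compute. The result keeps the
// input's total width but has fewer, wider lanes, each extended from the
// corresponding low lane of the input; the high input lanes are never read.
// Types and lane counts here are an illustrative fixed choice.
#include <array>
#include <cstdint>
std::array<int32_t, 2> signExtendVectorInReg(const std::array<int16_t, 4> &In) {
  // v4i16 -> v2i32: only lanes 0 and 1 participate. Because the high half of
  // the input is ignored, splitting such a node only ever needs the low half
  // of the operand (see SplitVecOp_ExtVecInRegOp further below).
  return {int32_t(In[0]), int32_t(In[1])};
}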
@@ -486,20 +520,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector<SDValue, 8> Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops); } /// If the input is a vector that needs to be scalarized, it must be <1 x ty>, /// so just return the element, ignoring the index. SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { + EVT VT = N->getValueType(0); SDValue Res = GetScalarizedVector(N->getOperand(0)); - if (Res.getValueType() != N->getValueType(0)) - Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), - Res); + if (Res.getValueType() != VT) + Res = VT.isFloatingPoint() + ? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res) + : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res); return Res; } - /// If the input condition is a vector that needs to be scalarized, it must be /// <1 x i1>, so just convert to a normal ISD::SELECT /// (still with vector output type since that was acceptable if we got here). @@ -637,6 +672,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -695,7 +731,7 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); - const SDNodeFlags *Flags = N->getFlags(); + const SDNodeFlags Flags = N->getFlags(); unsigned Opcode = N->getOpcode(); Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); @@ -781,10 +817,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts); - Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); + Lo = DAG.getBuildVector(LoVT, dl, LoOps); SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end()); - Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); + Hi = DAG.getBuildVector(HiVT, dl, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, @@ -928,7 +964,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDLoc dl(N); SDValue InLo, InHi; - GetSplitVector(N0, InLo, InHi); + + if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(N0, InLo, InHi); + else + std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0); + EVT InLoVT = InLo.getValueType(); unsigned InNumElements = InLoVT.getVectorNumElements(); @@ -1253,12 +1294,9 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, if ((NumElements & 1) == 0 && SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) { LLVMContext &Ctx = *DAG.getContext(); - EVT NewSrcVT = EVT::getVectorVT( - Ctx, EVT::getIntegerVT( - Ctx, SrcVT.getScalarSizeInBits() * 2), - NumElements); - EVT SplitSrcVT = - EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2); + EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx); + EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx); + EVT SplitLoVT, SplitHiVT; std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT); if (TLI.isTypeLegal(SrcVT) && 
!TLI.isTypeLegal(SplitSrcVT) && @@ -1372,7 +1410,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, } // Construct the Lo/Hi output using a BUILD_VECTOR. - Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); + Output = DAG.getBuildVector(NewVT, dl, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. Output = DAG.getUNDEF(NewVT); @@ -1466,8 +1504,31 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::FTRUNC: + case ISD::FCANONICALIZE: Res = SplitVecOp_UnaryOp(N); break; + + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: + Res = SplitVecOp_ExtVecInRegOp(N); + break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = SplitVecOp_VECREDUCE(N, OpNo); + break; } } @@ -1520,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } +SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + EVT LoOpVT, HiOpVT; + std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); + + bool NoNaN = N->getFlags().hasNoNaNs(); + unsigned CombineOpc = 0; + switch (N->getOpcode()) { + case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; + case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; + case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; + case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; + case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; + break; + case ISD::VECREDUCE_FMIN: + CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; + break; + default: + llvm_unreachable("Unexpected reduce ISD node"); + } + + // Use the appropriate scalar instruction on the split subvectors before + // reducing the now partially reduced smaller vector. + SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi); + return DAG.getNode(N->getOpcode(), dl, ResVT, Partial); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); @@ -1615,7 +1718,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { EltVT = MVT::i8; VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, VecVT.getVectorNumElements()); - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps); + Vec = DAG.getBuildVector(VecVT, dl, ElementOps); } // Store the vector to the stack. 
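// Editor's note: a scalar-loop sketch of the strategy SplitVecOp_VECREDUCE
// uses above when a reduction's vector operand must be split: combine the two
// half-vectors lane-wise with the reduction's binary op, then reduce the
// narrower partial vector. Shown for an integer add reduction; the element
// count is assumed even, as it is for the types the legalizer splits.
#include <cstdint>
#include <vector>
uint32_t reduceAddBySplitting(const std::vector<uint32_t> &V) {
  size_t Half = V.size() / 2;
  std::vector<uint32_t> Partial(Half);
  for (size_t i = 0; i < Half; ++i)
    Partial[i] = V[i] + V[i + Half]; // CombineOpc applied lane-wise (Lo, Hi)
  uint32_t Acc = 0;                  // reduce the now half-sized vector
  for (uint32_t X : Partial)
    Acc += X;
  return Acc;
}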
@@ -1629,6 +1732,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
                            MachinePointerInfo(), EltVT);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
+  SDValue Lo, Hi;
+
+  // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so
+  // splitting the result has the same effect as splitting the input operand.
+  SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
                                              unsigned OpNo) {
   EVT LoVT, HiVT;
@@ -1881,7 +1994,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
     }
   }
 
-  return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts);
+  return DAG.getBuildVector(N->getValueType(0), DL, Elts);
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
@@ -2165,7 +2278,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   EVT VT = WidenVT;
   unsigned NumElts = VT.getVectorNumElements();
-  const SDNodeFlags *Flags = N->getFlags();
+  const SDNodeFlags Flags = N->getFlags();
   while (!TLI.isTypeLegal(VT) && NumElts != 1) {
     NumElts = NumElts / 2;
     VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
@@ -2313,7 +2426,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   unsigned Opcode = N->getOpcode();
   unsigned InVTNumElts = InVT.getVectorNumElements();
-  const SDNodeFlags *Flags = N->getFlags();
+  const SDNodeFlags Flags = N->getFlags();
   if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
     InOp = GetWidenedVector(N->getOperand(0));
     InVT = InOp.getValueType();
@@ -2323,6 +2436,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
         return DAG.getNode(Opcode, DL, WidenVT, InOp);
       return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
     }
+    if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
+      // If both input and result vector types are of the same width, extend
+      // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
+      // accepts fewer elements in the result than in the input.
+      if (Opcode == ISD::SIGN_EXTEND)
+        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+      if (Opcode == ISD::ZERO_EXTEND)
+        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+    }
   }
 
   if (TLI.isTypeLegal(InWidenVT)) {
@@ -2375,7 +2497,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   for (; i < WidenNumElts; ++i)
     Ops[i] = UndefVal;
 
-  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+  return DAG.getBuildVector(WidenVT, DL, Ops);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
@@ -2430,7 +2552,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
   while (Ops.size() != WidenNumElts)
     Ops.push_back(DAG.getUNDEF(WidenSVT));
 
-  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+  return DAG.getBuildVector(WidenVT, DL, Ops);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
@@ -2568,7 +2690,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
       if (InVT.isVector())
         NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
       else
-        NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+        NewVec = DAG.getBuildVector(NewInVT, dl, Ops);
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
     }
   }
@@ -2593,7 +2715,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
   assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
   NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
 
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps);
+  return DAG.getBuildVector(WidenVT, dl, NewOps);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
@@ -2663,7 +2785,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
   SDValue UndefVal = DAG.getUNDEF(EltVT);
   for (; Idx < WidenNumElts; ++Idx)
     Ops[Idx] = UndefVal;
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+  return DAG.getBuildVector(WidenVT, dl, Ops);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
@@ -2704,7 +2826,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   SDValue UndefVal = DAG.getUNDEF(EltVT);
   for (; i < WidenNumElts; ++i)
     Ops[i] = UndefVal;
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+  return DAG.getBuildVector(WidenVT, dl, Ops);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
@@ -2814,6 +2936,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
                      WidenVT, N->getOperand(0));
 }
 
+// Return true if this is a node that could have two SETCCs as operands.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return true;
+  }
+  return false;
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC previously handled by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    N = N.getOperand(0);
+  else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+    for (unsigned i = 1; i < N->getNumOperands(); ++i)
+      if (!N->getOperand(i)->isUndef())
+        return false;
+    N = N.getOperand(0);
+  }
+
+  if (N.getOpcode() == ISD::TRUNCATE)
+    N = N.getOperand(0);
+  else if (N.getOpcode() == ISD::SIGN_EXTEND)
+    N = N.getOperand(0);
+
+  return (N.getOpcode() == ISD::SETCC);
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
+// to ToMaskVT if needed with vector extension or truncation.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+                                      EVT ToMaskVT) {
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Currently a SETCC or an AND/OR/XOR with two SETCCs is handled.
+  unsigned InMaskOpc = InMask->getOpcode();
+  assert((InMaskOpc == ISD::SETCC ||
+          (isLogicalMaskOp(InMaskOpc) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
+         "Unexpected mask argument.");
+
+  // Make a new Mask node, with a legal result VT.
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+    Ops.push_back(InMask->getOperand(i));
+  SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);
+
+  // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+  // extend or truncate is needed.
+  unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+  unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+  if (MaskScalarBits < ToMaskScalBits) {
+    EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                 MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+  } else if (MaskScalarBits > ToMaskScalBits) {
+    EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                   MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+  }
+
+  assert(Mask->getValueType(0).getScalarSizeInBits() ==
+             ToMaskVT.getScalarSizeInBits() &&
+         "Mask should have the right element size by now.");
+
+  // Adjust Mask to the right number of elements.
+  unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+  if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+    Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+                       ZeroIdx);
+  } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+    unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+    EVT SubVT = Mask->getValueType(0);
+    SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+    SubConcatOps[0] = Mask;
+    for (unsigned i = 1; i < NumSubVecs; ++i)
+      SubConcatOps[i] = DAG.getUNDEF(SubVT);
+    Mask =
+        DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+  }
+
+  assert((Mask->getValueType(0) == ToMaskVT) &&
+         "A mask of ToMaskVT should have been produced by now.");
+
+  return Mask;
+}
+
+// Get the target mask VT, and widen if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+  assert(SetCC->getOpcode() == ISD::SETCC);
+  LLVMContext &Ctx = *DAG.getContext();
+  EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
+  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
+  return MaskVT;
+}
+
+// This method tries to handle VSELECT and its mask by legalizing operands
+// (which may require widening) and if needed adjusting the mask vector type
+// to match that of the VSELECT. Without it, many cases end up with
+// scalarization of the SETCC, with many unnecessary instructions.
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Cond = N->getOperand(0);
+
+  if (N->getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+    return SDValue();
+
+  // If this is a split VSELECT that was previously already handled, do
+  // nothing.
+ if (Cond->getValueType(0).getScalarSizeInBits() != 1) + return SDValue(); + + EVT VSelVT = N->getValueType(0); + // Only handle vector types which are a power of 2. + if (!isPowerOf2_64(VSelVT.getSizeInBits())) + return SDValue(); + + // Don't touch if this will be scalarized. + EVT FinalVT = VSelVT; + while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector) + FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx); + + if (FinalVT.getVectorNumElements() == 1) + return SDValue(); + + // If there is support for an i1 vector mask, don't touch. + if (Cond.getOpcode() == ISD::SETCC) { + EVT SetCCOpVT = Cond->getOperand(0).getValueType(); + while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal) + SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT); + EVT SetCCResVT = getSetCCResultType(SetCCOpVT); + if (SetCCResVT.getScalarSizeInBits() == 1) + return SDValue(); + } + + // Get the VT and operands for VSELECT, and widen if needed. + SDValue VSelOp1 = N->getOperand(1); + SDValue VSelOp2 = N->getOperand(2); + if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) { + VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT); + VSelOp1 = GetWidenedVector(VSelOp1); + VSelOp2 = GetWidenedVector(VSelOp2); + } + + // The mask of the VSELECT should have integer elements. + EVT ToMaskVT = VSelVT; + if (!ToMaskVT.getScalarType().isInteger()) + ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger(); + + SDValue Mask; + if (Cond->getOpcode() == ISD::SETCC) { + EVT MaskVT = getSETCCWidenedResultTy(Cond); + Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else if (isLogicalMaskOp(Cond->getOpcode()) && + Cond->getOperand(0).getOpcode() == ISD::SETCC && + Cond->getOperand(1).getOpcode() == ISD::SETCC) { + // Cond is (AND/OR/XOR (SETCC, SETCC)) + SDValue SETCC0 = Cond->getOperand(0); + SDValue SETCC1 = Cond->getOperand(1); + EVT VT0 = getSETCCWidenedResultTy(SETCC0); + EVT VT1 = getSETCCWidenedResultTy(SETCC1); + unsigned ScalarBits0 = VT0.getScalarSizeInBits(); + unsigned ScalarBits1 = VT1.getScalarSizeInBits(); + unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits(); + EVT MaskVT; + // If the two SETCCs have different VTs, either extend/truncate one of + // them to the other "towards" ToMaskVT, or truncate one and extend the + // other to ToMaskVT. + if (ScalarBits0 != ScalarBits1) { + EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1); + EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0); + if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits()) + MaskVT = WideVT; + else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits()) + MaskVT = NarrowVT; + else + MaskVT = ToMaskVT; + } else + // If the two SETCCs have the same VT, don't change it. + MaskVT = VT0; + + // Make new SETCCs and logical nodes. + SETCC0 = convertMask(SETCC0, VT0, MaskVT); + SETCC1 = convertMask(SETCC1, VT1, MaskVT); + Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1); + + // Convert the logical op for VSELECT if needed. 
+ Mask = convertMask(Cond, MaskVT, ToMaskVT); + } else + return SDValue(); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2); +} + SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned WidenNumElts = WidenVT.getVectorNumElements(); @@ -2821,6 +3149,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { SDValue Cond1 = N->getOperand(0); EVT CondVT = Cond1.getValueType(); if (CondVT.isVector()) { + if (SDValue Res = WidenVSELECTAndMask(N)) + return Res; + EVT CondEltVT = CondVT.getVectorElementType(); EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenNumElts); @@ -3093,7 +3424,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())))); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { @@ -3144,7 +3475,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { @@ -3565,10 +3896,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); + return DAG.getBuildVector(WidenVT, dl, Ops); } - void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST) { // The strategy assumes that we can efficiently store power-of-two widths. 
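Two patterns in the hunks above are easy to miss. First, judging purely from the mechanical substitutions throughout this file, DAG.getBuildVector(VT, DL, Ops) reads as a convenience spelling of getNode(ISD::BUILD_VECTOR, DL, VT, Ops) with the type and location arguments swapped. Second, convertMask() adjusts a boolean lane mask along two independent axes: per-lane width (ISD::SIGN_EXTEND / ISD::TRUNCATE) and lane count (ISD::EXTRACT_SUBVECTOR to drop lanes, ISD::CONCAT_VECTORS with UNDEF operands to pad). A stand-alone C++ sketch of those two adjustments, with int8_t lanes standing in for the narrow mask type (illustrative names, not LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

// Widen each mask lane from i8 to i32 by sign extension (an all-ones lane
// stays all-ones), then fix up the lane count: resize() drops trailing lanes
// like EXTRACT_SUBVECTOR at index 0, or pads with zero lanes where the DAG
// code concatenates UNDEF subvectors.
std::vector<int32_t> convertMaskSketch(const std::vector<int8_t> &In,
                                       size_t ToNumLanes) {
  std::vector<int32_t> Mask(In.begin(), In.end()); // per-lane SIGN_EXTEND
  Mask.resize(ToNumLanes, 0);                      // adjust the lane count
  return Mask;
}

int main() {
  assert((convertMaskSketch({-1, 0, -1, 0}, 2) == std::vector<int32_t>{-1, 0}));
  assert((convertMaskSketch({-1, 0}, 4) == std::vector<int32_t>{-1, 0, 0, 0}));
  return 0;
}

So a four-lane mask shrunk to two lanes keeps its leading lanes, and a two-lane mask grown to four lanes is padded, matching the extract and concat cases in the DAG version.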
@@ -3737,5 +4067,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) Ops[Idx] = FillVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + return DAG.getBuildVector(NVT, dl, Ops); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index ded8e68fcbce..a21b4c733254 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); - I != E; ++I) - RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF); ParallelLiveRanges = 0; HorizontalVerticalBalance = 0; @@ -69,12 +67,11 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) unsigned ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) continue; - SUnit *PredSU = I->getSUnit(); + SUnit *PredSU = Pred.getSUnit(); const SDNode *ScegN = PredSU->getNode(); if (!ScegN) @@ -107,12 +104,11 @@ ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU, unsigned RCId) { unsigned NumberDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isCtrl()) + for (const SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); const SDNode *ScegN = SuccSU->getNode(); if (!ScegN) continue; @@ -144,9 +140,8 @@ unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU, static unsigned numberCtrlDepsInSU(SUnit *SU) { unsigned NumberDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) - if (I->isCtrl()) + for (const SDep &Succ : SU->Succs) + if (Succ.isCtrl()) NumberDeps++; return NumberDeps; @@ -154,9 +149,8 @@ static unsigned numberCtrlDepsInSU(SUnit *SU) { static unsigned numberCtrlPredInSU(SUnit *SU) { unsigned NumberDeps = 0; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) - if (I->isCtrl()) + for (SDep &Pred : SU->Preds) + if (Pred.isCtrl()) NumberDeps++; return NumberDeps; @@ -214,15 +208,14 @@ bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { /// of SU, return it, otherwise return null. SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) { SUnit *OnlyAvailablePred = nullptr; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - SUnit &Pred = *I->getSUnit(); - if (!Pred.isScheduled) { + for (const SDep &Pred : SU->Preds) { + SUnit &PredSU = *Pred.getSUnit(); + if (!PredSU.isScheduled) { // We found an available, but not scheduled, predecessor. If it's the // only one we have found, keep track of it... otherwise give up. 
- if (OnlyAvailablePred && OnlyAvailablePred != &Pred) + if (OnlyAvailablePred && OnlyAvailablePred != &PredSU) return nullptr; - OnlyAvailablePred = &Pred; + OnlyAvailablePred = &PredSU; } } return OnlyAvailablePred; @@ -232,9 +225,8 @@ void ResourcePriorityQueue::push(SUnit *SU) { // Look at all of the successors of this node. Count the number of nodes that // this node is the sole unscheduled node for. unsigned NumNodesBlocking = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) - if (getSingleUnscheduledPred(I->getSUnit()) == SU) + for (const SDep &Succ : SU->Succs) + if (getSingleUnscheduledPred(Succ.getSUnit()) == SU) ++NumNodesBlocking; NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking; @@ -271,14 +263,13 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { // Now see if there are no other dependencies // to instructions already in the packet. for (unsigned i = 0, e = Packet.size(); i != e; ++i) - for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), - E = Packet[i]->Succs.end(); I != E; ++I) { + for (const SDep &Succ : Packet[i]->Succs) { // Since we do not add pseudos to packets, might as well // ignore order deps. - if (I->isCtrl()) + if (Succ.isCtrl()) continue; - if (I->getSUnit() == SU) + if (Succ.getSUnit() == SU) return false; } @@ -364,16 +355,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) { return RegBalance; if (RawPressure) { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) RegBalance += rawRegPressureDelta(SU, RC->getID()); - } } else { - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; + for (const TargetRegisterClass *RC : TRI->regclasses()) { if ((RegPressure[RC->getID()] + rawRegPressureDelta(SU, RC->getID()) > 0) && (RegPressure[RC->getID()] + @@ -506,11 +492,10 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { } } } - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0)) + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl() || (Pred.getSUnit()->NumRegDefsLeft == 0)) continue; - --I->getSUnit()->NumRegDefsLeft; + --Pred.getSUnit()->NumRegDefsLeft; } } @@ -522,10 +507,9 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { // number of live ranges. All others, increase it. unsigned NumberNonControlDeps = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - adjustPriorityOfUnscheduledPreds(I->getSUnit()); - if (!I->isCtrl()) + for (const SDep &Succ : SU->Succs) { + adjustPriorityOfUnscheduledPreds(Succ.getSUnit()); + if (!Succ.isCtrl()) NumberNonControlDeps++; } @@ -602,8 +586,7 @@ SUnit *ResourcePriorityQueue::pop() { std::vector<SUnit *>::iterator Best = Queue.begin(); if (!DisableDFASched) { int BestCost = SUSchedulingCost(*Best); - for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()), - E = Queue.end(); I != E; ++I) { + for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) { if (SUSchedulingCost(*I) > BestCost) { BestCost = SUSchedulingCost(*I); @@ -613,8 +596,7 @@ SUnit *ResourcePriorityQueue::pop() { } // Use default TD scheduling mechanism. 
else { - for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()), - E = Queue.end(); I != E; ++I) + for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 62e7733ecd2b..d80a281279b6 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -160,18 +160,17 @@ void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) { void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) { // Bottom up: release predecessors - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - ReleasePred(SU, &*I); - if (I->isAssignedRegDep()) { + for (SDep &Pred : SU->Preds) { + ReleasePred(SU, &Pred); + if (Pred.isAssignedRegDep()) { // This is a physical register dependency and it's impossible or // expensive to copy the register. Make sure nothing that can // clobber the register is scheduled between the predecessor and // this node. - if (!LiveRegDefs[I->getReg()]) { + if (!LiveRegDefs[Pred.getReg()]) { ++NumLiveRegs; - LiveRegDefs[I->getReg()] = I->getSUnit(); - LiveRegCycles[I->getReg()] = CurCycle; + LiveRegDefs[Pred.getReg()] = Pred.getSUnit(); + LiveRegCycles[Pred.getReg()] = CurCycle; } } } @@ -191,16 +190,15 @@ void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { ReleasePredecessors(SU, CurCycle); // Release all the implicit physical register defs that are live. - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isAssignedRegDep()) { - if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) { + for (SDep &Succ : SU->Succs) { + if (Succ.isAssignedRegDep()) { + if (LiveRegCycles[Succ.getReg()] == Succ.getSUnit()->getHeight()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); - assert(LiveRegDefs[I->getReg()] == SU && + assert(LiveRegDefs[Succ.getReg()] == SU && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegCycles[I->getReg()] = 0; + LiveRegDefs[Succ.getReg()] = nullptr; + LiveRegCycles[Succ.getReg()] = 0; } } } @@ -282,22 +280,20 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { SmallVector<SDep, 4> LoadPreds; SmallVector<SDep, 4> NodePreds; SmallVector<SDep, 4> NodeSuccs; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) - ChainPred = *I; - else if (I->getSUnit()->getNode() && - I->getSUnit()->getNode()->isOperandOf(LoadNode)) - LoadPreds.push_back(*I); + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) + ChainPred = Pred; + else if (Pred.getSUnit()->getNode() && + Pred.getSUnit()->getNode()->isOperandOf(LoadNode)) + LoadPreds.push_back(Pred); else - NodePreds.push_back(*I); + NodePreds.push_back(Pred); } - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isCtrl()) - ChainSuccs.push_back(*I); + for (SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) + ChainSuccs.push_back(Succ); else - NodeSuccs.push_back(*I); + NodeSuccs.push_back(Succ); } if (ChainPred.getSUnit()) { @@ -354,21 +350,19 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { NewSU = Clone(SU); // New SUnit has the exact same predecessors. 
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) - if (!I->isArtificial()) - AddPred(NewSU, *I); + for (SDep &Pred : SU->Preds) + if (!Pred.isArtificial()) + AddPred(NewSU, Pred); // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps; - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isArtificial()) + for (SDep &Succ : SU->Succs) { + if (Succ.isArtificial()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); if (SuccSU->isScheduled) { - SDep D = *I; + SDep D = Succ; D.setSUnit(NewSU); AddPred(SuccSU, D); D.setSUnit(SU); @@ -399,16 +393,15 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps; - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isArtificial()) + for (SDep &Succ : SU->Succs) { + if (Succ.isArtificial()) continue; - SUnit *SuccSU = I->getSUnit(); + SUnit *SuccSU = Succ.getSUnit(); if (SuccSU->isScheduled) { - SDep D = *I; + SDep D = Succ; D.setSUnit(CopyToSU); AddPred(SuccSU, D); - DelDeps.push_back(std::make_pair(SuccSU, *I)); + DelDeps.push_back(std::make_pair(SuccSU, Succ)); } } for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) { @@ -479,10 +472,9 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, SmallSet<unsigned, 4> RegAdded; // If this node would clobber any "live" register, then it's not ready. - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isAssignedRegDep()) { - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + for (SDep &Pred : SU->Preds) { + if (Pred.isAssignedRegDep()) { + CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs, RegAdded, LRegs, TRI); } } @@ -755,9 +747,8 @@ void ScheduleDAGLinearize::Schedule() { // Glue user must be scheduled together with the glue operand. So other // users of the glue operand must be treated as its users. SDNode *ImmGUser = Glue->getGluedUser(); - for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end(); - ui != ue; ++ui) - if (*ui == ImmGUser) + for (const SDNode *U : Glue->uses()) + if (U == ImmGUser) --Degree; GUser->setNodeId(UDegree + Degree); Glue->setNodeId(1); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 3549ccd9e345..4f4025d8ae6a 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -422,11 +422,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. if (N->isMachineOpcode()) { - if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameDestroyOpcode()) { + if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { ++NestLevel; - } else if (N->getMachineOpcode() == - (unsigned)TII->getCallFrameSetupOpcode()) { + } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { if (NestLevel == 0) return false; --NestLevel; @@ -480,12 +478,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, } // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END. 
    if (N->isMachineOpcode()) {
-      if (N->getMachineOpcode() ==
-          (unsigned)TII->getCallFrameDestroyOpcode()) {
+      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         ++NestLevel;
         MaxNest = std::max(MaxNest, NestLevel);
-      } else if (N->getMachineOpcode() ==
-                 (unsigned)TII->getCallFrameSetupOpcode()) {
+      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
         assert(NestLevel != 0);
         --NestLevel;
         if (NestLevel == 0)
@@ -524,21 +520,20 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
 /// interference on flags.
 void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
   // Bottom up: release predecessors
-  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
-       I != E; ++I) {
-    ReleasePred(SU, &*I);
-    if (I->isAssignedRegDep()) {
+  for (SDep &Pred : SU->Preds) {
+    ReleasePred(SU, &Pred);
+    if (Pred.isAssignedRegDep()) {
       // This is a physical register dependency and it's impossible or
       // expensive to copy the register. Make sure nothing that can
       // clobber the register is scheduled between the predecessor and
       // this node.
-      SUnit *RegDef = LiveRegDefs[I->getReg()]; (void)RegDef;
-      assert((!RegDef || RegDef == SU || RegDef == I->getSUnit()) &&
+      SUnit *RegDef = LiveRegDefs[Pred.getReg()]; (void)RegDef;
+      assert((!RegDef || RegDef == SU || RegDef == Pred.getSUnit()) &&
              "interference on register dependence");
-      LiveRegDefs[I->getReg()] = I->getSUnit();
-      if (!LiveRegGens[I->getReg()]) {
+      LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
+      if (!LiveRegGens[Pred.getReg()]) {
         ++NumLiveRegs;
-        LiveRegGens[I->getReg()] = SU;
+        LiveRegGens[Pred.getReg()] = SU;
       }
     }
   }
@@ -550,7 +545,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
   if (!LiveRegDefs[CallResource])
     for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
       if (Node->isMachineOpcode() &&
-          Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+          Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         unsigned NestLevel = 0;
         unsigned MaxNest = 0;
         SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
@@ -737,15 +732,14 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
   ReleasePredecessors(SU);
 
   // Release all the implicit physical register defs that are live.
-  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
-       I != E; ++I) {
-    // LiveRegDegs[I->getReg()] != SU when SU is a two-address node.
+  for (SDep &Succ : SU->Succs) {
+    // LiveRegDefs[Succ.getReg()] != SU when SU is a two-address node.
+ if (Succ.isAssignedRegDep() && LiveRegDefs[Succ.getReg()] == SU) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegGens[I->getReg()] = nullptr; - releaseInterferences(I->getReg()); + LiveRegDefs[Succ.getReg()] = nullptr; + LiveRegGens[Succ.getReg()] = nullptr; + releaseInterferences(Succ.getReg()); } } // Release the special call resource dependence, if this is the beginning @@ -755,7 +749,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -806,17 +800,16 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: "); DEBUG(SU->dump(this)); - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - CapturePred(&*I); - if (I->isAssignedRegDep() && SU == LiveRegGens[I->getReg()]){ + for (SDep &Pred : SU->Preds) { + CapturePred(&Pred); + if (Pred.isAssignedRegDep() && SU == LiveRegGens[Pred.getReg()]){ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); - assert(LiveRegDefs[I->getReg()] == I->getSUnit() && + assert(LiveRegDefs[Pred.getReg()] == Pred.getSUnit() && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = nullptr; - LiveRegGens[I->getReg()] = nullptr; - releaseInterferences(I->getReg()); + LiveRegDefs[Pred.getReg()] = nullptr; + LiveRegGens[Pred.getReg()] = nullptr; + releaseInterferences(Pred.getReg()); } } @@ -826,7 +819,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) { ++NumLiveRegs; LiveRegDefs[CallResource] = SU; LiveRegGens[CallResource] = CallSeqEndForStart[SU]; @@ -839,7 +832,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (SUNode->isMachineOpcode() && - SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; LiveRegDefs[CallResource] = nullptr; @@ -899,7 +892,7 @@ void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() { std::vector<SUnit*>::const_iterator I = (Sequence.end() - LookAhead); unsigned HazardCycle = (*I)->getHeight(); - for (std::vector<SUnit*>::const_iterator E = Sequence.end(); I != E; ++I) { + for (auto E = Sequence.end(); I != E; ++I) { SUnit *SU = *I; for (; SU->getHeight() > HazardCycle; ++HazardCycle) { HazardRec->RecedeCycle(); @@ -1265,10 +1258,9 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { // // If SU is the currently live definition of the same register that it uses, // then we are free to schedule it. 
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), + for (SDep &Pred : SU->Preds) { + if (Pred.isAssignedRegDep() && LiveRegDefs[Pred.getReg()] != SU) + CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1305,7 +1297,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { // If we're in the middle of scheduling a call, don't begin scheduling // another call. Also, don't allow any physical registers to be live across // the call. - if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { + if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) || + (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) { // Check the special calling-sequence resource. unsigned CallResource = TRI->getNumRegs(); if (LiveRegDefs[CallResource]) { @@ -1323,6 +1316,18 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); + if (MCID.hasOptionalDef()) { + // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit. + // This operand can be either a def of CPSR, if the S bit is set; or a use + // of %noreg. When the OptionalDef is set to a valid register, we need to + // handle it in the same way as an ImplicitDef. + for (unsigned i = 0; i < MCID.getNumDefs(); ++i) + if (MCID.OpInfo[i].isOptionalDef()) { + const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues()); + unsigned Reg = cast<RegisterSDNode>(OptionalDef)->getReg(); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); + } + } if (!MCID.ImplicitDefs) continue; for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) @@ -1659,9 +1664,8 @@ public: RegPressure.resize(NumRC); std::fill(RegLimit.begin(), RegLimit.end(), 0); std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) - RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF); + for (const TargetRegisterClass *RC : TRI->regclasses()) + RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF); } } @@ -1735,8 +1739,7 @@ protected: template<class SF> static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) { std::vector<SUnit *>::iterator Best = Q.begin(); - for (std::vector<SUnit *>::iterator I = std::next(Q.begin()), - E = Q.end(); I != E; ++I) + for (auto I = std::next(Q.begin()), E = Q.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; SUnit *V = *Best; @@ -1788,7 +1791,7 @@ public: } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(ScheduleDAG *DAG) const override { + LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override { // Emulate pop() without clobbering NodeQueueIds. 
std::vector<SUnit*> DumpQueue = Queue; SF DumpPicker = Picker; @@ -1924,19 +1927,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const { // Register Pressure Tracking //===----------------------------------------------------------------------===// -void RegReductionPQBase::dumpRegPressure() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) { - const TargetRegisterClass *RC = *I; +LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const { + for (const TargetRegisterClass *RC : TRI->regclasses()) { unsigned Id = RC->getID(); unsigned RP = RegPressure[Id]; if (!RP) continue; DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / " << RegLimit[Id] << '\n'); } -#endif } +#endif bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const { if (!TLI) @@ -2092,7 +2093,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) { RegPressure[RCId] -= Cost; } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } void RegReductionPQBase::unscheduledNode(SUnit *SU) { @@ -2172,7 +2173,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { } } - dumpRegPressure(); + DEBUG(dumpRegPressure()); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3be622f8c179..3c8526ebb702 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, } void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { + // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) if (!SU->getNode()) { dbgs() << "PHYS REG COPY\n"; @@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (!N->getHasDebugValue()) return; - // Opportunistically insert immediate dbg_value uses, i.e. those with source - // order number right after the N. + // Opportunistically insert immediate dbg_value uses, i.e. those with the same + // source order number as N. MachineBasicBlock *BB = Emitter.getBlock(); MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos(); ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N); @@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, if (DVs[i]->isInvalidated()) continue; unsigned DVOrder = DVs[i]->getOrder(); - if (!Order || DVOrder == ++Order) { + if (!Order || DVOrder == Order) { MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap); if (DbgMI) { Orders.push_back(std::make_pair(DVOrder, DbgMI)); @@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); - Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned, - VRBaseMap); + Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap); // Remember the source order of the inserted instruction. 
if (HasDbg) ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e225ba8703b7..177898e1e950 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" @@ -289,28 +290,28 @@ static int isSignedOp(ISD::CondCode Opcode) { } ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed integer setcc with an unsigned integer setcc. return ISD::SETCC_INVALID; unsigned Op = Op1 | Op2; // Combine all of the condition bits. - // If the N and U bits get set then the resultant comparison DOES suddenly - // care about orderedness, and is true when ordered. + // If the N and U bits get set, then the resultant comparison DOES suddenly + // care about orderedness, and it is true when ordered. if (Op > ISD::SETTRUE2) Op &= ~16; // Clear the U bit if the N bit is set. // Canonicalize illegal integer setcc's. - if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT + if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT Op = ISD::SETNE; return ISD::CondCode(Op); } ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, - bool isInteger) { - if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) + bool IsInteger) { + if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3) // Cannot fold a signed setcc with an unsigned setcc. return ISD::SETCC_INVALID; @@ -318,7 +319,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2, ISD::CondCode Result = ISD::CondCode(Op1 & Op2); // Canonicalize illegal integer setcc's. - if (isInteger) { + if (IsInteger) { switch (Result) { default: break; case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT @@ -639,12 +640,15 @@ void SelectionDAG::DeallocateNode(SDNode *N) { // If we have operands, deallocate them. removeOperands(N); + NodeAllocator.Deallocate(AllNodes.remove(N)); + // Set the opcode to DELETED_NODE to help catch bugs when node // memory is reallocated. + // FIXME: There are places in SDag that have grown a dependency on the opcode + // value in the released node. + __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType)); N->NodeType = ISD::DELETED_NODE; - NodeAllocator.Deallocate(AllNodes.remove(N)); - // If any of the SDDbgValue nodes refer to this SDNode, invalidate // them and forget about that node. 
DbgInfo->erase(N); @@ -807,8 +811,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } @@ -828,8 +831,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } @@ -848,8 +850,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops, AddNodeIDCustom(ID, N); SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos); if (Node) - if (const SDNodeFlags *Flags = N->getFlags()) - Node->intersectFlagsWith(Flags); + Node->intersectFlagsWith(N->getFlags()); return Node; } @@ -871,11 +872,13 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) DbgInfo = new SDDbgInfo(); } -void SelectionDAG::init(MachineFunction &mf) { - MF = &mf; +void SelectionDAG::init(MachineFunction &NewMF, + OptimizationRemarkEmitter &NewORE) { + MF = &NewMF; + ORE = &NewORE; TLI = getSubtarget().getTargetLowering(); TSI = getSubtarget().getSelectionDAGInfo(); - Context = &mf.getFunction()->getContext(); + Context = &MF->getFunction()->getContext(); } SelectionDAG::~SelectionDAG() { @@ -895,29 +898,6 @@ void SelectionDAG::allnodes_clear() { #endif } -SDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, const SDLoc &DL, - SDVTList VTs, SDValue N1, SDValue N2, - const SDNodeFlags *Flags) { - SDValue Ops[] = {N1, N2}; - - if (isBinOpWithFlags(Opcode)) { - // If no flags were passed in, use a default flags object. - SDNodeFlags F; - if (Flags == nullptr) - Flags = &F; - - auto *FN = newSDNode<BinaryWithFlagsSDNode>(Opcode, DL.getIROrder(), - DL.getDebugLoc(), VTs, *Flags); - createOperands(FN, Ops); - - return FN; - } - - auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); - createOperands(N, Ops); - return N; -} - SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos); @@ -979,6 +959,12 @@ void SelectionDAG::clear() { DbgInfo->clear(); } +SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) { + return VT.bitsGT(Op.getValueType()) + ? getNode(ISD::FP_EXTEND, DL, VT, Op) + : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? 
getNode(ISD::ANY_EXTEND, DL, VT, Op) : @@ -1824,7 +1810,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); + return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { @@ -1837,7 +1823,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { MachineFrameInfo &MFI = getMachineFunction().getFrameInfo(); int FrameIdx = MFI.CreateStackObject(Bytes, Align, false); - return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout())); + return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout())); } SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, @@ -1953,7 +1939,7 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, /// use this predicate to simplify operations downstream. bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); - return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth); + return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth); } /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use @@ -1961,9 +1947,9 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { /// for bits that V cannot have. bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { - APInt KnownZero, KnownOne; - computeKnownBits(Op, KnownZero, KnownOne, Depth); - return (KnownZero & Mask) == Mask; + KnownBits Known; + computeKnownBits(Op, Known, Depth); + return Mask.isSubsetOf(Known.Zero); } /// If a SHL/SRA/SRL node has a constant or splat constant shift amount that @@ -1979,33 +1965,30 @@ static const APInt *getValidShiftAmountConstant(SDValue V) { } /// Determine which bits of Op are known to be either zero or one and return -/// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are -/// those that are shared by every vector element. -void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, - APInt &KnownOne, unsigned Depth) const { +/// them in Known. For vectors, the known bits are those that are shared by +/// every vector element. +void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, + unsigned Depth) const { EVT VT = Op.getValueType(); APInt DemandedElts = VT.isVector() ? APInt::getAllOnesValue(VT.getVectorNumElements()) : APInt(1, 1); - computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth); + computeKnownBits(Op, Known, DemandedElts, Depth); } /// Determine which bits of Op are known to be either zero or one and return -/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows -/// us to only collect the known bits that are shared by the requested vector -/// elements. -/// TODO: We only support DemandedElts on a few opcodes so far, the remainder -/// should be added when they become necessary. -void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, - APInt &KnownOne, const APInt &DemandedElts, +/// them in Known. The DemandedElts argument allows us to only collect the known +/// bits that are shared by the requested vector elements. 
+void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, + const APInt &DemandedElts, unsigned Depth) const { unsigned BitWidth = Op.getScalarValueSizeInBits(); - KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + Known = KnownBits(BitWidth); // Don't know anything. if (Depth == 6) return; // Limit search depth. - APInt KnownZero2, KnownOne2; + KnownBits Known2; unsigned NumElts = DemandedElts.getBitWidth(); if (!DemandedElts) @@ -2015,35 +1998,34 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, switch (Opcode) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); - KnownZero = ~KnownOne; + Known.One = cast<ConstantSDNode>(Op)->getAPIntValue(); + Known.Zero = ~Known.One; break; case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every demanded vector element. assert(NumElts == Op.getValueType().getVectorNumElements() && "Unexpected vector size"); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { if (!DemandedElts[i]) continue; SDValue SrcOp = Op.getOperand(i); - computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1); + computeKnownBits(SrcOp, Known2, Depth + 1); // BUILD_VECTOR can implicitly truncate sources, we must handle this. if (SrcOp.getValueSizeInBits() != BitWidth) { assert(SrcOp.getValueSizeInBits() > BitWidth && "Expected BUILD_VECTOR implicit truncation"); - KnownOne2 = KnownOne2.trunc(BitWidth); - KnownZero2 = KnownZero2.trunc(BitWidth); + Known2 = Known2.trunc(BitWidth); } // Known bits are the values that are shared by every demanded element. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } break; @@ -2051,7 +2033,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Collect the known bits that are shared by every vector element referenced // by the shuffle. APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); for (unsigned i = 0; i != NumElts; ++i) { @@ -2062,8 +2044,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (M < 0) { // For UNDEF elements, we don't know anything about the common state of // the shuffle result. - KnownOne.clearAllBits(); - KnownZero.clearAllBits(); + Known.resetAll(); DemandedLHS.clearAllBits(); DemandedRHS.clearAllBits(); break; @@ -2077,24 +2058,24 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Known bits are the values that are shared by every demanded element. if (!!DemandedLHS) { SDValue LHS = Op.getOperand(0); - computeKnownBits(LHS, KnownZero2, KnownOne2, DemandedLHS, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. 
- if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; if (!!DemandedRHS) { SDValue RHS = Op.getOperand(1); - computeKnownBits(RHS, KnownZero2, KnownOne2, DemandedRHS, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } break; } case ISD::CONCAT_VECTORS: { // Split DemandedElts and test each of the demanded subvectors. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); EVT SubVectorVT = Op.getOperand(0).getValueType(); unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); unsigned NumSubVectors = Op.getNumOperands(); @@ -2103,12 +2084,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, DemandedSub = DemandedSub.trunc(NumSubVectorElts); if (!!DemandedSub) { SDValue Sub = Op.getOperand(i); - computeKnownBits(Sub, KnownZero2, KnownOne2, DemandedSub, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(Sub, Known2, DemandedSub, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } break; @@ -2123,9 +2104,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Offset the demanded elts by the subvector index. uint64_t Idx = SubIdx->getZExtValue(); APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); - computeKnownBits(Src, KnownZero, KnownOne, DemandedSrc, Depth + 1); + computeKnownBits(Src, Known, DemandedSrc, Depth + 1); } else { - computeKnownBits(Src, KnownZero, KnownOne, Depth + 1); + computeKnownBits(Src, Known, Depth + 1); } break; } @@ -2139,7 +2120,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // Fast handling of 'identity' bitcasts. if (BitWidth == SubBitWidth) { - computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); + computeKnownBits(N0, Known, DemandedElts, Depth + 1); break; } @@ -2163,10 +2144,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, SubDemandedElts.setBit(i * SubScale); for (unsigned i = 0; i != SubScale; ++i) { - computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts.shl(i), + computeKnownBits(N0, Known2, SubDemandedElts.shl(i), Depth + 1); - KnownOne |= KnownOne2.zext(BitWidth).shl(SubBitWidth * i); - KnownZero |= KnownZero2.zext(BitWidth).shl(SubBitWidth * i); + Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i); + Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i); } } @@ -2183,16 +2164,16 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (DemandedElts[i]) SubDemandedElts.setBit(i / SubScale); - computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts, Depth + 1); + computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) if (DemandedElts[i]) { unsigned Offset = (i % SubScale) * BitWidth; - KnownOne &= KnownOne2.lshr(Offset).trunc(BitWidth); - KnownZero &= KnownZero2.lshr(Offset).trunc(BitWidth); + Known.One &= Known2.One.lshr(Offset).trunc(BitWidth); + Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth); // If we don't know any bits, early out. 
- if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; } } @@ -2200,107 +2181,90 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, } case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-1 bits are only known if set in both the LHS & RHS. - KnownOne &= KnownOne2; + Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. - KnownZero |= KnownZero2; + Known.Zero |= Known2.Zero; break; case ISD::OR: - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-0 bits are only known if clear in both the LHS & RHS. - KnownZero &= KnownZero2; + Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - KnownOne |= KnownOne2; + Known.One |= Known2.One; break; case ISD::XOR: { - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // Output known-0 bits are known if clear or set in both the LHS & RHS. - APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); + APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); - KnownZero = KnownZeroOut; + Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); + Known.Zero = KnownZeroOut; break; } case ISD::MUL: { - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. 
- KnownOne.clearAllBits(); - unsigned TrailZ = KnownZero.countTrailingOnes() + - KnownZero2.countTrailingOnes(); - unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + - KnownZero2.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; - TrailZ = std::min(TrailZ, BitWidth); - LeadZ = std::min(LeadZ, BitWidth); - KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | - APInt::getHighBitsSet(BitWidth, LeadZ); + Known.resetAll(); + Known.Zero.setLowBits(std::min(TrailZ, BitWidth)); + Known.Zero.setHighBits(std::min(LeadZ, BitWidth)); break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned LeadZ = KnownZero2.countLeadingOnes(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned LeadZ = Known2.countMinLeadingZeros(); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); - KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); + Known.Zero.setHighBits(LeadZ); break; } case ISD::SELECT: - computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(2), Known, Depth+1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(1), Known2, Depth+1); // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; case ISD::SELECT_CC: - computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(3), Known, Depth+1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(2), Known2, Depth+1); // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) @@ -2312,51 +2276,46 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (TLI->getBooleanContents(Op.getValueType().isVector(), false) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. 
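
// A standalone sketch of the MUL bound above (illustration only; the helper
// name is invented): the product of a value with at least TzA trailing zero
// bits and one with at least TzB has at least TzA + TzB trailing zero bits,
// capped at the bit width.
#include <algorithm>
#include <cassert>

unsigned mulMinTrailingZeros(unsigned TzA, unsigned TzB, unsigned BitWidth) {
  return std::min(TzA + TzB, BitWidth);
}

int main() {
  // 8 has three trailing zero bits and 12 has two; 8 * 12 == 96 is a
  // multiple of 2^5, as the bound predicts.
  assert(mulMinTrailingZeros(3, 2, 32) == 5);
  assert((8u * 12u) % (1u << 5) == 0);
  return 0;
}
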
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; case ISD::SHL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero << *ShAmt; - KnownOne = KnownOne << *ShAmt; + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero <<= *ShAmt; + Known.One <<= *ShAmt; // Low bits are known zero. - KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue()); + Known.Zero.setLowBits(ShAmt->getZExtValue()); } break; case ISD::SRL: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.lshr(*ShAmt); - KnownOne = KnownOne.lshr(*ShAmt); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero.lshrInPlace(*ShAmt); + Known.One.lshrInPlace(*ShAmt); // High bits are known zero. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); - KnownZero |= HighBits; + Known.Zero.setHighBits(ShAmt->getZExtValue()); } break; case ISD::SRA: if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.lshr(*ShAmt); - KnownOne = KnownOne.lshr(*ShAmt); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.Zero.lshrInPlace(*ShAmt); + Known.One.lshrInPlace(*ShAmt); // If we know the value of the sign bit, then we know it is copied across // the high bits by the shift amount. - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue()); - APInt SignBit = APInt::getSignBit(BitWidth); - SignBit = SignBit.lshr(*ShAmt); // Adjust to where it is now in the mask. - if (KnownZero.intersects(SignBit)) { - KnownZero |= HighBits; // New bits are known zero. - } else if (KnownOne.intersects(SignBit)) { - KnownOne |= HighBits; // New bits are known one. + APInt SignMask = APInt::getSignMask(BitWidth); + SignMask.lshrInPlace(*ShAmt); // Adjust to where it is now in the mask. + if (Known.Zero.intersects(SignMask)) { + Known.Zero.setHighBits(ShAmt->getZExtValue());// New bits are known zero. + } else if (Known.One.intersects(SignMask)) { + Known.One.setHighBits(ShAmt->getZExtValue()); // New bits are known one. } } break; @@ -2368,42 +2327,56 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // present in the input. APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); - APInt InSignBit = APInt::getSignBit(EBits); + APInt InSignMask = APInt::getSignMask(EBits); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. - InSignBit = InSignBit.zext(BitWidth); + InSignMask = InSignMask.zext(BitWidth); if (NewBits.getBoolValue()) - InputDemandedBits |= InSignBit; + InputDemandedBits |= InSignMask; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownOne &= InputDemandedBits; - KnownZero &= InputDemandedBits; + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known.One &= InputDemandedBits; + Known.Zero &= InputDemandedBits; // If the sign bit of the input is known set or clear, then we know the // top bits of the result. 
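
// A standalone sketch of the SRA rule above (illustration only): an
// arithmetic right shift replicates the sign bit, so once the sign bit is
// known, the shifted-in high bits are known too -- which is what the
// setHighBits() calls model. Assumes the usual arithmetic shift for signed
// types, as GCC and Clang provide.
#include <cassert>
#include <cstdint>

int main() {
  int8_t Neg = int8_t(0x80);         // sign bit known one
  assert(uint8_t(Neg >> 3) == 0xF0); // four new high bits are known one
  int8_t Pos = 0x40;                 // sign bit known zero
  assert(uint8_t(Pos >> 3) == 0x08); // new high bits are known zero
  return 0;
}
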
- if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear - KnownZero |= NewBits; - KnownOne &= ~NewBits; - } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set - KnownOne |= NewBits; - KnownZero &= ~NewBits; + if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear + Known.Zero |= NewBits; + Known.One &= ~NewBits; + } else if (Known.One.intersects(InSignMask)) { // Input sign bit known set + Known.One |= NewBits; + Known.Zero &= ~NewBits; } else { // Input sign bit unknown - KnownZero &= ~NewBits; - KnownOne &= ~NewBits; + Known.Zero &= ~NewBits; + Known.One &= ~NewBits; } break; } case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we have a known 1, its position is our upper bound. + unsigned PossibleTZ = Known2.countMaxTrailingZeros(); + unsigned LowBits = Log2_32(PossibleTZ) + 1; + Known.Zero.setBitsFrom(LowBits); + break; + } case ISD::CTLZ: - case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTLZ_ZERO_UNDEF: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we have a known 1, its position is our upper bound. + unsigned PossibleLZ = Known2.countMaxLeadingZeros(); + unsigned LowBits = Log2_32(PossibleLZ) + 1; + Known.Zero.setBitsFrom(LowBits); + break; + } case ISD::CTPOP: { - unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); - KnownOne.clearAllBits(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + // If we know some of the bits are zero, they can't be one. + unsigned PossibleOnes = Known2.countMaxPopulation(); + Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1); break; } case ISD::LOAD: { @@ -2412,76 +2385,87 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) { EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + Known.Zero.setBitsFrom(MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { if (LD->getExtensionType() == ISD::NON_EXTLOAD) - computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); + computeKnownBitsFromRangeMetadata(*Ranges, Known); } break; } - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND_VECTOR_INREG: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, + DemandedElts.zext(InVT.getVectorNumElements()), Depth + 1); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - KnownZero |= NewBits; + Known = Known.zext(BitWidth); + Known.Zero.setBitsFrom(InBits); break; } + case ISD::ZERO_EXTEND: { + EVT InVT = Op.getOperand(0).getValueType(); + unsigned InBits = InVT.getScalarSizeInBits(); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known = Known.zext(BitWidth); + Known.Zero.setBitsFrom(InBits); + break; + } + // TODO ISD::SIGN_EXTEND_VECTOR_INREG case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - 
computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); // If the sign bit is known to be zero or one, then sext will extend // it to the top bits, else it will just zext. - KnownZero = KnownZero.sext(BitWidth); - KnownOne = KnownOne.sext(BitWidth); + Known = Known.sext(BitWidth); break; } case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.trunc(InBits); - KnownOne = KnownOne.trunc(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + Known = Known.trunc(InBits); + computeKnownBits(Op.getOperand(0), Known, Depth+1); + Known = Known.zext(BitWidth); break; } case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); - KnownZero = KnownZero.zext(InBits); - KnownOne = KnownOne.zext(InBits); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.zext(InBits); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + Known = Known.trunc(BitWidth); break; } case ISD::AssertZext: { EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - KnownZero |= (~InMask); - KnownOne &= (~KnownZero); + computeKnownBits(Op.getOperand(0), Known, Depth+1); + Known.Zero |= (~InMask); + Known.One &= (~Known.Zero); break; } case ISD::FGETSIGN: // All bits are zero except the low bit. - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; - - case ISD::SUB: { + case ISD::USUBO: + case ISD::SSUBO: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; + case ISD::SUB: + case ISD::SUBC: { if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is // positive @@ -2490,22 +2474,47 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. - if ((KnownZero2 & MaskV) == MaskV) { + if ((Known2.Zero & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. - KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); + Known.Zero.setHighBits(NLZ2); } } } - LLVM_FALLTHROUGH; + + // If low bits are known to be zero in both operands, then we know they are + // going to be 0 in the result. Both addition and complement operations + // preserve the low zero bits.
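
// A standalone sketch of the comment above (illustration only): when both
// operands are known multiples of 2^k (low k bits zero), addition and
// subtraction preserve those low zero bits.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 40, B = 24; // both multiples of 8: three low bits known zero
  assert((A + B) % 8 == 0);
  assert((A - B) % 8 == 0);
  return 0;
}
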
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); + if (KnownZeroLow == 0) + break; + + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + Known.Zero.setLowBits(KnownZeroLow); + break; } + case ISD::UADDO: + case ISD::SADDO: + case ISD::ADDCARRY: + if (Op.getResNo() == 1) { + // If we know the result of a setcc has the top bits zero, use this info. + if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); + break; + } + LLVM_FALLTHROUGH; case ISD::ADD: + case ISD::ADDC: case ISD::ADDE: { // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the @@ -2514,31 +2523,28 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, // known to be clear. For example, if one input has the top 10 bits clear // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - unsigned KnownZeroHigh = KnownZero2.countLeadingOnes(); - unsigned KnownZeroLow = KnownZero2.countTrailingOnes(); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroHigh = std::min(KnownZeroHigh, - KnownZero2.countLeadingOnes()); - KnownZeroLow = std::min(KnownZeroLow, - KnownZero2.countTrailingOnes()); - - if (Opcode == ISD::ADD) { - KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow); - if (KnownZeroHigh > 1) - KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1); + KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); + + if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) { + // With ADDE and ADDCARRY, a carry bit may be added in, so we can only + // use this information if we know (at least) that the low two bits are + // clear. We then return to the caller that the low bit is unknown but + // that other bits are known zero. + if (KnownZeroLow >= 2) + Known.Zero.setBits(1, KnownZeroLow); break; } - // With ADDE, a carry bit may be added in, so we can only use this - // information if we know (at least) that the low two bits are clear. We - // then return to the caller that the low bit is unknown but that other bits - // are known zero. - if (KnownZeroLow >= 2) // ADDE - KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow); + Known.Zero.setLowBits(KnownZeroLow); + if (KnownZeroHigh > 1) + Known.Zero.setHighBits(KnownZeroHigh - 1); break; } case ISD::SREM: @@ -2546,23 +2552,22 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The low bits of the first operand are unchanged by the srem. 
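
// A standalone sketch of the ADDE/ADDCARRY rule above (illustration only):
// an unknown carry-in can only disturb bit 0, so if both addends have at
// least the low two bits clear, the bits in [1, KnownZeroLow) stay known
// zero regardless of the carry.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) {
    uint32_t Sum = 20 + 8 + Carry; // both addends are multiples of 4
    assert(((Sum >> 1) & 1) == 0); // bit 1 remains known zero
  }
  return 0;
}
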
- KnownZero = KnownZero2 & LowBits; - KnownOne = KnownOne2 & LowBits; + Known.Zero = Known2.Zero & LowBits; + Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. - if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) - KnownZero |= ~LowBits; + if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits)) + Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. - if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) - KnownOne |= ~LowBits; - assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); + if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0)) + Known.One |= ~LowBits; + assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } } break; @@ -2571,41 +2576,37 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // The upper bits are all zero, the lower ones are unchanged. - KnownZero = KnownZero2 | ~LowBits; - KnownOne = KnownOne2 & LowBits; + Known.Zero = Known2.Zero | ~LowBits; + Known.One = Known2.One & LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, - Depth + 1); - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - KnownOne.clearAllBits(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); + uint32_t Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); + Known.resetAll(); + Known.Zero.setHighBits(Leaders); break; } case ISD::EXTRACT_ELEMENT: { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), Known, Depth+1); const unsigned Index = Op.getConstantOperandVal(1); const unsigned BitWidth = Op.getValueSizeInBits(); // Remove low part of known bits mask - KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth); - KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth); + Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth); + Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth); // Remove high part of known bit mask - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.trunc(BitWidth); break; } case ISD::EXTRACT_VECTOR_ELT: { @@ -2617,24 +2618,20 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const unsigned NumSrcElts = VecVT.getVectorNumElements(); // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know // anything about the extended bits.
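
// A standalone sketch of the UREM power-of-two rule above (illustration
// only): x % 2^k is exactly the low k bits of x, so the upper bits of the
// result are known zero and the low bits keep what was known about the
// first operand.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEF;
  assert((X % 8) == (X & 7)); // the remainder is just the low three bits
  return 0;
}
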
- if (BitWidth > EltBitWidth) { - KnownZero = KnownZero.trunc(EltBitWidth); - KnownOne = KnownOne.trunc(EltBitWidth); - } + if (BitWidth > EltBitWidth) + Known = Known.trunc(EltBitWidth); ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) { // If we know the element index, just demand that vector element. unsigned Idx = ConstEltNo->getZExtValue(); APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); - computeKnownBits(InVec, KnownZero, KnownOne, DemandedElt, Depth + 1); + computeKnownBits(InVec, Known, DemandedElt, Depth + 1); } else { // Unknown element index, so ignore DemandedElts and demand them all. - computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1); - } - if (BitWidth > EltBitWidth) { - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + computeKnownBits(InVec, Known, Depth + 1); } + if (BitWidth > EltBitWidth) + Known = Known.zext(BitWidth); break; } case ISD::INSERT_VECTOR_ELT: { @@ -2646,60 +2643,110 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { // If we know the element index, split the demand between the // source vector and the inserted element. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth); unsigned EltIdx = CEltNo->getZExtValue(); // If we demand the inserted element then add its common known bits. if (DemandedElts[EltIdx]) { - computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1); - KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth()); - KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());; + computeKnownBits(InVal, Known2, Depth + 1); + Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); + Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } // If we demand the source vector then add its common known bits, ensuring // that we don't demand the inserted element. APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx)); if (!!VectorElts) { - computeKnownBits(InVec, KnownZero2, KnownOne2, VectorElts, Depth + 1); - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + computeKnownBits(InVec, Known2, VectorElts, Depth + 1); + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } } else { // Unknown element index, so ignore DemandedElts and demand them all. - computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1); - computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1); - KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth()); - KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());; + computeKnownBits(InVec, Known, Depth + 1); + computeKnownBits(InVal, Known2, Depth + 1); + Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth()); + Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth()); } break; } + case ISD::BITREVERSE: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + Known.Zero = Known2.Zero.reverseBits(); + Known.One = Known2.One.reverseBits(); + break; + } case ISD::BSWAP: { - computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts, + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + Known.Zero = Known2.Zero.byteSwap(); + Known.One = Known2.One.byteSwap(); + break; + } + case ISD::ABS: { + computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); + + // If the source's MSB is zero then we know the rest of the bits already.
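
// A standalone sketch of the BSWAP rule above (illustration only): the
// known-bit masks are byte-swapped along with the value, so a known-zero
// low byte becomes a known-zero high byte. __builtin_bswap32 is the
// GCC/Clang builtin.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t KnownZeroMask = 0x000000FFu; // low byte known zero
  assert(__builtin_bswap32(KnownZeroMask) == 0xFF000000u);
  return 0;
}
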
+ if (Known2.isNonNegative()) { + Known.Zero = Known2.Zero; + Known.One = Known2.One; + break; + } + + // We only know that the absolute value's MSB will be zero iff there is + // a set bit that isn't the sign bit (otherwise it could be INT_MIN). + Known2.One.clearSignBit(); + if (Known2.One.getBoolValue()) { + Known.Zero = APInt::getSignMask(BitWidth); + break; + } + break; + } + case ISD::UMIN: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + + // UMIN - we know that the result will have the maximum of the + // known zero leading bits of the inputs. + unsigned LeadZero = Known.countMinLeadingZeros(); + LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros()); + + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + Known.Zero.setHighBits(LeadZero); + break; + } + case ISD::UMAX: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); - KnownZero = KnownZero2.byteSwap(); - KnownOne = KnownOne2.byteSwap(); + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + + // UMAX - we know that the result will have the maximum of the + // known one leading bits of the inputs. + unsigned LeadOne = Known.countMinLeadingOnes(); + LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes()); + + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + Known.One.setHighBits(LeadOne); break; } case ISD::SMIN: - case ISD::SMAX: - case ISD::UMIN: - case ISD::UMAX: { - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts, + case ISD::SMAX: { + computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); // If we don't know any bits, early out. - if (!KnownOne && !KnownZero) + if (!Known.One && !Known.Zero) break; - computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts, - Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; + computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. - KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); + Known.Zero.setLowBits(Log2_32(Align)); break; } break; @@ -2712,11 +2759,45 @@ case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes.
- TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); + TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth); break; } - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); +} + +SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, + SDValue N1) const { + // X + 0 never overflows + if (isNullConstant(N1)) + return OFK_Never; + + KnownBits N1Known; + computeKnownBits(N1, N1Known); + if (N1Known.Zero.getBoolValue()) { + KnownBits N0Known; + computeKnownBits(N0, N0Known); + + bool overflow; + (void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow); + if (!overflow) + return OFK_Never; + } + + // mulhi + 1 never overflows + if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && + (~N1Known.Zero & 0x01) == ~N1Known.Zero) + return OFK_Never; + + if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { + KnownBits N0Known; + computeKnownBits(N0, N0Known); + + if ((~N0Known.Zero & 0x01) == ~N0Known.Zero) + return OFK_Never; + } + + return OFK_Sometime; } bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { @@ -2730,7 +2811,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // A left-shift of a constant one will have exactly one bit set because // shifting the bit off the end is undefined. if (Val.getOpcode() == ISD::SHL) { - auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0)); + auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue() == 1) return true; } @@ -2738,14 +2819,14 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Similarly, a logical right-shift of a constant sign-bit will have exactly // one bit set. if (Val.getOpcode() == ISD::SRL) { - auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0)); - if (C && C->getAPIntValue().isSignBit()) + auto *C = isConstOrConstSplat(Val.getOperand(0)); + if (C && C->getAPIntValue().isSignMask()) return true; } // Are all operands of a build vector constant powers of two? if (Val.getOpcode() == ISD::BUILD_VECTOR) - if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) { + if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E)) return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2(); return false; @@ -2756,22 +2837,34 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // to handle some common cases. // Fall back to computeKnownBits to catch other known cases. - APInt KnownZero, KnownOne; - computeKnownBits(Val, KnownZero, KnownOne); - return (KnownZero.countPopulation() == BitWidth - 1) && - (KnownOne.countPopulation() == 1); + KnownBits Known; + computeKnownBits(Val, Known); + return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { EVT VT = Op.getValueType(); + APInt DemandedElts = VT.isVector() + ? APInt::getAllOnesValue(VT.getVectorNumElements()) + : APInt(1, 1); + return ComputeNumSignBits(Op, DemandedElts, Depth); +} + +unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, + unsigned Depth) const { + EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; if (Depth == 6) return 1; // Limit search depth.
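
// A standalone sketch of the uadd_ov check in computeOverflowKind above
// (illustration only): the largest values consistent with the known-zero
// masks are ~Known.Zero, and if even those sum without wrapping, the
// addition can never overflow. __builtin_add_overflow is the GCC/Clang
// builtin.
#include <cassert>
#include <cstdint>

int main() {
  // Top byte known zero in both operands, so each value is <= 0x00FFFFFF.
  uint32_t MaxN0 = ~0xFF000000u, MaxN1 = ~0xFF000000u;
  uint32_t Sum;
  bool Overflow = __builtin_add_overflow(MaxN0, MaxN1, &Sum);
  assert(!Overflow); // hence OFK_Never for any such pair
  return 0;
}
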
+ if (!DemandedElts) + return 1; // No demanded elts, better to assume we don't know anything. + switch (Op.getOpcode()) { default: break; case ISD::AssertSext: @@ -2786,7 +2879,61 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return Val.getNumSignBits(); } + case ISD::BUILD_VECTOR: + Tmp = VTBits; + for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) { + if (!DemandedElts[i]) + continue; + + SDValue SrcOp = Op.getOperand(i); + Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1); + + // BUILD_VECTOR can implicitly truncate sources, we must handle this. + if (SrcOp.getValueSizeInBits() != VTBits) { + assert(SrcOp.getValueSizeInBits() > VTBits && + "Expected BUILD_VECTOR implicit truncation"); + unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; + Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); + } + Tmp = std::min(Tmp, Tmp2); + } + return Tmp; + + case ISD::VECTOR_SHUFFLE: { + // Collect the minimum number of sign bits that are shared by every vector + // element referenced by the shuffle. + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); + assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); + for (unsigned i = 0; i != NumElts; ++i) { + int M = SVN->getMaskElt(i); + if (!DemandedElts[i]) + continue; + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + if (M < 0) + return 1; + if ((unsigned)M < NumElts) + DemandedLHS.setBit((unsigned)M % NumElts); + else + DemandedRHS.setBit((unsigned)M % NumElts); + } + Tmp = UINT_MAX; + if (!!DemandedLHS) + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) { + Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + // If we don't know anything, early out and try computeKnownBits fall-back. + if (Tmp == 1) + break; + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; @@ -2799,7 +2946,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { return std::max(Tmp, Tmp2); case ISD::SRA: - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1); // SRA X, C -> adds C sign bits. if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) { APInt ShiftVal = C->getAPIntValue(); @@ -2887,6 +3034,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { } break; case ISD::ADD: + case ISD::ADDC: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1); @@ -2895,17 +3043,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // Special case decrementing a value (ADD X, -1): if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { - APInt KnownZero, KnownOne; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + KnownBits Known; + computeKnownBits(Op.getOperand(0), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. 
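
// A standalone sketch of the counting used above (illustration only; the
// helper is invented): the number of sign bits of a scalar is the length of
// the run of copies of the sign bit at the top, and for BUILD_VECTOR or a
// shuffle the result is the minimum over the demanded elements.
#include <algorithm>
#include <cassert>
#include <cstdint>

unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == ((V >> 31) & 1))
    ++N;
  return N;
}

int main() {
  assert(numSignBits(-1) == 32); // every bit copies the sign bit
  assert(numSignBits(1) == 31);  // 0b000...0001
  // A two-element vector {-1, 1} has min(32, 31) == 31 sign bits.
  assert(std::min(numSignBits(-1), numSignBits(1)) == 31);
  return 0;
}
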
- if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry // out of the result. - if (KnownZero.isNegative()) + if (Known.isNonNegative()) return Tmp; } @@ -2920,16 +3068,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // Handle NEG. if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) if (CLHS->isNullValue()) { - APInt KnownZero, KnownOne; - computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + KnownBits Known; + computeKnownBits(Op.getOperand(1), Known, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. - if (KnownZero.isNegative()) + if (Known.isNonNegative()) return Tmp2; // Otherwise, we treat this like a SUB. @@ -2961,28 +3109,98 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + + ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) { + // If we know the element index, split the demand between the + // source vector and the inserted element. + unsigned EltIdx = CEltNo->getZExtValue(); + + // If we demand the inserted element then get its sign bits. + Tmp = UINT_MAX; + if (DemandedElts[EltIdx]) { + // TODO - handle implicit truncation of inserted elements. + if (InVal.getScalarValueSizeInBits() != VTBits) + break; + Tmp = ComputeNumSignBits(InVal, Depth + 1); + } + + // If we demand the source vector then get its sign bits, and determine + // the minimum. + APInt VectorElts = DemandedElts; + VectorElts.clearBit(EltIdx); + if (!!VectorElts) { + Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + } else { + // Unknown element index, so ignore DemandedElts and demand them all. + Tmp = ComputeNumSignBits(InVec, Depth + 1); + Tmp2 = ComputeNumSignBits(InVal, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } case ISD::EXTRACT_VECTOR_ELT: { - // At the moment we keep this simple and skip tracking the specific - // element. This way we get the lowest common denominator for all elements - // of the vector. - // TODO: get information for given vector element + SDValue InVec = Op.getOperand(0); + SDValue EltNo = Op.getOperand(1); + EVT VecVT = InVec.getValueType(); const unsigned BitWidth = Op.getValueSizeInBits(); const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); + const unsigned NumSrcElts = VecVT.getVectorNumElements(); + // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know // anything about sign bits. But if the sizes match we can derive knowledge // about sign bits from the vector operand. 
- if (BitWidth == EltBitWidth) - return ComputeNumSignBits(Op.getOperand(0), Depth+1); - break; + if (BitWidth != EltBitWidth) + break; + + // If we know the element index, just demand that vector element, else for + // an unknown element index, ignore DemandedElts and demand them all. + APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts); + ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); + if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) + DemandedSrcElts = + APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue()); + + return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); + } + case ISD::EXTRACT_SUBVECTOR: { + // If we know the element index, just demand that subvector elements, + // otherwise demand them all. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); + return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); + } + return ComputeNumSignBits(Src, Depth + 1); } - case ISD::EXTRACT_SUBVECTOR: - return ComputeNumSignBits(Op.getOperand(0), Depth + 1); case ISD::CONCAT_VECTORS: - // Determine the minimum number of sign bits across all input vectors. - // Early out if the result is already 1. - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); - for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) - Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1)); + // Determine the minimum number of sign bits across all demanded + // elts of the input vectors. Early out if the result is already 1. + Tmp = UINT_MAX; + EVT SubVectorVT = Op.getOperand(0).getValueType(); + unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); + unsigned NumSubVectors = Op.getNumOperands(); + for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) { + APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts); + DemandedSub = DemandedSub.trunc(NumSubVectorElts); + if (!DemandedSub) + continue; + Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } @@ -3008,20 +3226,22 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || Op.getOpcode() == ISD::INTRINSIC_VOID) { - unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth); - if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits); + unsigned NumBits = + TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth); + if (NumBits > 1) + FirstAnswer = std::max(FirstAnswer, NumBits); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. - APInt KnownZero, KnownOne; - computeKnownBits(Op, KnownZero, KnownOne, Depth); + KnownBits Known; + computeKnownBits(Op, Known, DemandedElts, Depth); APInt Mask; - if (KnownZero.isNegative()) { // sign bit is 0 - Mask = KnownZero; - } else if (KnownOne.isNegative()) { // sign bit is 1; - Mask = KnownOne; + if (Known.isNonNegative()) { // sign bit is 0 + Mask = Known.Zero; + } else if (Known.isNegative()) { // sign bit is 1; + Mask = Known.One; } else { // Nothing known. 
return FirstAnswer; @@ -3054,6 +3274,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const { if (getTarget().Options.NoNaNsFPMath) return true; + if (Op->getFlags().hasNoNaNs()) + return true; + // If the value is a constant, we can obviously see if it is a NaN or not. if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) return !C->getValueAPF().isNaN(); @@ -3096,11 +3319,10 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); - APInt AZero, AOne; - APInt BZero, BOne; - computeKnownBits(A, AZero, AOne); - computeKnownBits(B, BZero, BOne); - return (AZero | BZero).isAllOnesValue(); + KnownBits AKnown, BKnown; + computeKnownBits(A, AKnown); + computeKnownBits(B, BKnown); + return (AKnown.Zero | BKnown.Zero).isAllOnesValue(); } static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT, @@ -3169,7 +3391,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue Operand) { + SDValue Operand, const SDNodeFlags Flags) { // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the @@ -3206,6 +3428,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT); break; + case ISD::ABS: + return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); + case ISD::BITREVERSE: + return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(), + C->isOpaque()); case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); @@ -3220,6 +3448,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::CTTZ_ZERO_UNDEF: return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), C->isOpaque()); + case ISD::FP16_TO_FP: { + bool Ignored; + APFloat FPV(APFloat::IEEEhalf(), + (Val.getBitWidth() == 16) ? Val : Val.trunc(16)); + + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. + (void)FPV.convert(EVTToAPFloatSemantics(VT), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstantFP(FPV, DL, VT); + } } } @@ -3261,17 +3500,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { - integerPart x[2]; bool ignored; - static_assert(integerPartWidth >= 64, "APFloat parts too small!"); + APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT); // FIXME need to be more flexible about rounding mode. 
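
// A standalone sketch of the unary constant folding above (illustration
// only): when the operand is a constant, the node folds to the operation
// evaluated directly on the bits, e.g. for ISD::BSWAP and ISD::CTTZ.
// __builtin_bswap32 / __builtin_ctz are the GCC/Clang builtins.
#include <cassert>
#include <cstdint>

int main() {
  assert(__builtin_bswap32(0x11223344u) == 0x44332211u); // BSWAP fold
  assert(__builtin_ctz(0x80u) == 7);                     // CTTZ fold
  return 0;
}
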
- APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), - Opcode==ISD::FP_TO_SINT, - APFloat::rmTowardZero, &ignored); - if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual + APFloat::opStatus s = + V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored); + if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual break; - APInt api(VT.getSizeInBits(), x); - return getConstant(api, DL, VT); + return getConstant(IntVal, DL, VT); } case ISD::BITCAST: if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) @@ -3281,6 +3517,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT); break; + case ISD::FP_TO_FP16: { + bool Ignored; + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. + (void)V.convert(APFloat::IEEEhalf(), + APFloat::rmNearestTiesToEven, &Ignored); + return getConstant(V.bitcastToAPInt(), DL, VT); + } } } @@ -3303,6 +3547,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::TRUNCATE: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: + case ISD::ABS: + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: @@ -3348,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); @@ -3364,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) - return getNode(ISD::ZERO_EXTEND, DL, VT, - Operand.getNode()->getOperand(0)); + return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -3384,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunc x)) -> x if (OpOpcode == ISD::TRUNCATE) { - SDValue OpOp = Operand.getNode()->getOperand(0); + SDValue OpOp = Operand.getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } @@ -3406,20 +3651,26 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend.
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType() + if (Operand.getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); - if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); - return Operand.getNode()->getOperand(0); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + if (Operand.getOperand(0).getValueType().bitsGT(VT)) + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); + return Operand.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; + case ISD::ABS: + assert(VT.isInteger() && VT == Operand.getValueType() && + "Invalid ABS!"); + if (OpOpcode == ISD::UNDEF) + return getUNDEF(VT); + break; case ISD::BSWAP: assert(VT.isInteger() && VT == Operand.getValueType() && "Invalid BSWAP!"); @@ -3464,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? - return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0), - &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags); + return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), + Operand.getOperand(0), Operand.getNode()->getFlags()); if (OpOpcode == ISD::FNEG) // --X -> X - return Operand.getNode()->getOperand(0); + return Operand.getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) - return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); break; } @@ -3483,10 +3733,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) + if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { + E->intersectFlagsWith(Flags); return SDValue(E, 0); + } N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { @@ -3569,6 +3822,30 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, GA->getOffset() + uint64_t(Offset)); } +bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { + switch (Opcode) { + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: { + // If a divisor is zero/undef or any element of a divisor vector is + // zero/undef, the whole op is undef. + assert(Ops.size() == 2 && "Div/rem should have 2 operands"); + SDValue Divisor = Ops[1]; + if (Divisor.isUndef() || isNullConstant(Divisor)) + return true; + + return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) && + any_of(Divisor->op_values(), + [](SDValue V) { return V.isUndef() || isNullConstant(V); }); + // TODO: Handle signed overflow. + } + // TODO: Handle oversized shifts. + default: + return false; + } +} + SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, SDNode *Cst1, SDNode *Cst2) { @@ -3578,6 +3855,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)})) + return getUNDEF(VT); + // Handle the case of two scalars. 
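
// A standalone sketch of the isUndef() rule above (illustration only, with
// a plain vector standing in for a BUILD_VECTOR of constants): division or
// remainder folds to UNDEF when the divisor, or any constant element of a
// divisor vector, is zero or undef.
#include <cassert>
#include <vector>

bool divisorMakesUndef(const std::vector<int> &DivisorElts) {
  for (int E : DivisorElts)
    if (E == 0) // zero (or undef) element => whole op is undef
      return true;
  return false;
}

int main() {
  assert(divisorMakesUndef({4, 0, 2}));  // would fold to getUNDEF(VT)
  assert(!divisorMakesUndef({4, 1, 2})); // safe to evaluate
  return 0;
}
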
if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) { if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) { @@ -3638,13 +3918,16 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops, - const SDNodeFlags *Flags) { + const SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. if (Opcode >= ISD::BUILTIN_OP_END) return SDValue(); + if (isUndef(Opcode, Ops)) + return getUNDEF(VT); + // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? if (!VT.isVector()) return SDValue(); @@ -3676,7 +3959,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, // Find legal integer scalar type for constant promotion and // ensure that its scalar size is at least as large as source. EVT LegalSVT = VT.getScalarType(); - if (LegalSVT.isInteger()) { + if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) { LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); if (LegalSVT.bitsLT(VT.getScalarType())) return SDValue(); @@ -3727,8 +4010,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue N1, SDValue N2, - const SDNodeFlags *Flags) { + SDValue N1, SDValue N2, const SDNodeFlags Flags) { ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); @@ -3910,35 +4192,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending - auto SignExtendInReg = [&](APInt Val) { + auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) { unsigned FromBits = EVT.getScalarSizeInBits(); Val <<= Val.getBitWidth() - FromBits; - Val = Val.ashr(Val.getBitWidth() - FromBits); - return getConstant(Val, DL, VT.getScalarType()); + Val.ashrInPlace(Val.getBitWidth() - FromBits); + return getConstant(Val, DL, ConstantVT); }; if (N1C) { const APInt &Val = N1C->getAPIntValue(); - return SignExtendInReg(Val); + return SignExtendInReg(Val, VT); } if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { SmallVector<SDValue, 8> Ops; + llvm::EVT OpVT = N1.getOperand(0).getValueType(); for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); if (Op.isUndef()) { - Ops.push_back(getUNDEF(VT.getScalarType())); + Ops.push_back(getUNDEF(OpVT)); continue; } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - APInt Val = C->getAPIntValue(); - Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); - Ops.push_back(SignExtendInReg(Val)); - continue; - } - break; + ConstantSDNode *C = cast<ConstantSDNode>(Op); + APInt Val = C->getAPIntValue(); + Ops.push_back(SignExtendInReg(Val, OpVT)); } - if (Ops.size() == VT.getVectorNumElements()) - return getBuildVector(VT, DL, Ops); + return getBuildVector(VT, DL, Ops); } break; } @@ -4040,6 +4318,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT.getSimpleVT() == N1.getSimpleValueType()) return N1; + // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF. 
+ if (N1.isUndef()) + return getUNDEF(VT); + + // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of + // the concat have the same type as the extract. + if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && + N1.getNumOperands() > 0 && + VT == N1.getOperand(0).getValueType()) { + unsigned Factor = VT.getVectorNumElements(); + return N1.getOperand(N2C->getZExtValue() / Factor); + } + // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created // during shuffle legalization. if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) && @@ -4186,21 +4477,23 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // Memoize this node if possible. SDNode *N; SDVTList VTs = getVTList(VT); + SDValue Ops[] = {N1, N2}; if (VT != MVT::Glue) { - SDValue Ops[] = {N1, N2}; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { - if (Flags) - E->intersectFlagsWith(Flags); + E->intersectFlagsWith(Flags); return SDValue(E, 0); } - N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); + N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + N->setFlags(Flags); + createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { - N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags); + N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); + createOperands(N, Ops); } InsertNode(N); @@ -4392,9 +4685,10 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, /// used when a memcpy is turned into a memset when the source is a constant /// string ptr. static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, - const TargetLowering &TLI, StringRef Str) { + const TargetLowering &TLI, + const ConstantDataArraySlice &Slice) { // Handle vector with all elements zero. - if (Str.empty()) { + if (Slice.Array == nullptr) { if (VT.isInteger()) return DAG.getConstant(0, dl, VT); else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) @@ -4413,15 +4707,15 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, assert(!VT.isVector() && "Can't handle vector type here!"); unsigned NumVTBits = VT.getSizeInBits(); unsigned NumVTBytes = NumVTBits / 8; - unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); + unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length)); APInt Val(NumVTBits, 0); if (DAG.getDataLayout().isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) - Val |= (uint64_t)(unsigned char)Str[i] << i*8; + Val |= (uint64_t)(unsigned char)Slice[i] << i*8; } else { for (unsigned i = 0; i != NumBytes; ++i) - Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; + Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8; } // If the "cost" of materializing the integer immediate is less than the cost @@ -4438,9 +4732,8 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset, return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT)); } -/// isMemSrcFromString - Returns true if memcpy source is a string constant. -/// -static bool isMemSrcFromString(SDValue Src, StringRef &Str) { +/// Returns true if memcpy source is constant data. 
+static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) { uint64_t SrcDelta = 0; GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) @@ -4454,8 +4747,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) { if (!G) return false; - return getConstantStringInfo(G->getGlobal(), Str, - SrcDelta + G->getOffset(), false); + return getConstantDataArrayInfo(G->getGlobal(), Slice, 8, + SrcDelta + G->getOffset()); } /// Determines the optimal series of memory ops to replace the memset / memcpy. @@ -4486,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, DAG.getMachineFunction()); if (VT == MVT::Other) { - if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) || - TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) { - VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS); - } else { - switch (DstAlign & 7) { - case 0: VT = MVT::i64; break; - case 4: VT = MVT::i32; break; - case 2: VT = MVT::i16; break; - default: VT = MVT::i8; break; - } - } - + // Use the largest integer type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + VT = MVT::i64; + while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && + !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) + VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + assert(VT.isInteger()); + + // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); + // If the type we've chosen is larger than the largest legal integer type + // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } @@ -4598,15 +4891,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; - StringRef Str; - bool CopyFromStr = isMemSrcFromString(Src, Str); - bool isZeroStr = CopyFromStr && Str.empty(); + ConstantDataArraySlice Slice; + bool CopyFromConstant = isMemSrcFromConstant(Src, Slice); + bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr; unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), - (isZeroStr ? 0 : SrcAlign), - false, false, CopyFromStr, true, + (isZeroConstant ? 0 : SrcAlign), + false, false, CopyFromConstant, true, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), DAG, TLI)) @@ -4650,18 +4943,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, DstOff -= VTSize - Size; } - if (CopyFromStr && - (isZeroStr || (VT.isInteger() && !VT.isVector()))) { + if (CopyFromConstant && + (isZeroConstant || (VT.isInteger() && !VT.isVector()))) { // It's unlikely a store of a vector immediate can be done in a single // instruction. It would require a load from a constantpool first. // We only handle zero vectors here. // FIXME: Handle other cases where store of vector immediate is done in // a single instruction. - Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff)); + ConstantDataArraySlice SubSlice; + if (SrcOff < Slice.Length) { + SubSlice = Slice; + SubSlice.move(SrcOff); + } else { + // This is an out-of-bounds access and hence UB. Pretend we read zero. 
+ SubSlice.Array = nullptr; + SubSlice.Offset = 0; + SubSlice.Length = VTSize; + } + Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice); if (Value.getNode()) Store = DAG.getStore(Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), - DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags); + DstPtrInfo.getWithOffset(DstOff), Align, + MMOFlags); } if (!Store.getNode()) { @@ -4943,11 +5247,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5004,11 +5308,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5066,11 +5370,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, TargetLowering::CallLoweringInfo CLI(*this); CLI.setDebugLoc(dl) .setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), - Dst.getValueType().getTypeForEVT(*getContext()), - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), - TLI->getPointerTy(getDataLayout())), - std::move(Args)) + .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), + Dst.getValueType().getTypeForEVT(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), + TLI->getPointerTy(getDataLayout())), + std::move(Args)) .setDiscardResult() .setTailCall(isTailCall); @@ -5722,11 +6026,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) { + ArrayRef<SDValue> Ops, const SDNodeFlags Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -6238,6 +6542,63 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, return N; } +SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { + unsigned OrigOpc = Node->getOpcode(); + unsigned NewOpc; + bool IsUnary = false; + switch (OrigOpc) { + default: + llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); + case ISD::STRICT_FADD: NewOpc = ISD::FADD; break; + case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break; + case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; + case ISD::STRICT_FDIV: NewOpc = 
ISD::FDIV; break; + case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; + case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; + case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break; + case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break; + case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break; + case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break; + case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break; + case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break; + case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break; + case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break; + case ISD::STRICT_FNEARBYINT: + NewOpc = ISD::FNEARBYINT; + IsUnary = true; + break; + } + + // We're taking this node out of the chain, so we need to re-link things. + SDValue InputChain = Node->getOperand(0); + SDValue OutputChain = SDValue(Node, 1); + ReplaceAllUsesOfValueWith(OutputChain, InputChain); + + SDVTList VTs = getVTList(Node->getOperand(1).getValueType()); + SDNode *Res = nullptr; + if (IsUnary) + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); + else + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), + Node->getOperand(2) }); + + // MorphNodeTo can operate in two ways: if an existing node with the + // specified operands exists, it can just return it. Otherwise, it + // updates the node in place to have the requested operands. + if (Res == Node) { + // If we updated the node in place, reset the node ID. To the isel, + // this should be just like a newly allocated machine node. + Res->setNodeId(-1); + } else { + ReplaceAllUsesWith(Node, Res); + RemoveDeadNode(Node); + } + + return Res; +} + /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. @@ -6384,14 +6745,13 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops, - const SDNodeFlags *Flags) { + const SDNodeFlags Flags) { if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) { - if (Flags) - E->intersectFlagsWith(Flags); + E->intersectFlagsWith(Flags); return E; } } @@ -7049,6 +7409,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const { return Seen; } +/// Return true if the only users of N are contained in Nodes. +bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) { + bool Seen = false; + for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { + SDNode *User = *I; + if (llvm::any_of(Nodes, + [&User](const SDNode *Node) { return User == Node; })) + Seen = true; + else + return false; + } + + return Seen; +} + /// isOperand - Return true if this node is an operand of N. /// bool SDValue::isOperandOf(const SDNode *N) const { @@ -7070,21 +7445,39 @@ bool SDNode::isOperandOf(const SDNode *N) const { /// side-effecting instructions on any chain path. In practice, this looks /// through token factors and non-volatile loads. In order to remain efficient, /// this only looks a couple of nodes in, it does not do an exhaustive search. 
+/// +/// Note that we only need to examine chains when we're searching for +/// side-effects; SelectionDAG requires that all side-effects are represented +/// by chains, even if another operand would force a specific ordering. This +/// constraint is necessary to allow transformations like splitting loads. bool SDValue::reachesChainWithoutSideEffects(SDValue Dest, - unsigned Depth) const { + unsigned Depth) const { if (*this == Dest) return true; // Don't search too deeply, we just want to be able to see through // TokenFactor's etc. if (Depth == 0) return false; - // If this is a token factor, all inputs to the TF happen in parallel. If any - // of the operands of the TF does not reach dest, then we cannot do the xform. + // If this is a token factor, all inputs to the TF happen in parallel. if (getOpcode() == ISD::TokenFactor) { - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1)) - return false; - return true; + // First, try a shallow search. + if (is_contained((*this)->ops(), Dest)) { + // We found the chain we want as an operand of this TokenFactor. + // Essentially, we reach the chain without side-effects if we could + // serialize the TokenFactor into a simple chain of operations with + // Dest as the last operation. This is automatically true if the + // chain has one use: there are no other ordering constraints. + // If the chain has more than one use, we give up: some other + // use of Dest might force a side-effect between Dest and the current + // node. + if (Dest.hasOneUse()) + return true; + } + // Next, try a deep search: check whether every operand of the TokenFactor + // reaches Dest. + return all_of((*this)->ops(), [=](SDValue Op) { + return Op.reachesChainWithoutSideEffects(Dest, Depth - 1); + }); } // Loads don't have side effects, look through them. @@ -7102,20 +7495,8 @@ bool SDNode::hasPredecessor(const SDNode *N) const { return hasPredecessorHelper(N, Visited, Worklist); } -uint64_t SDNode::getConstantOperandVal(unsigned Num) const { - assert(Num < NumOperands && "Invalid child # of SDNode!"); - return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); -} - -const SDNodeFlags *SDNode::getFlags() const { - if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) - return &FlagsNode->Flags; - return nullptr; -} - -void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) { - if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) - FlagsNode->Flags.intersectWith(Flags); +void SDNode::intersectFlagsWith(const SDNodeFlags Flags) { + this->Flags.intersectWith(Flags); } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { @@ -7255,10 +7636,9 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); - APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); - llvm::computeKnownBits(const_cast<GlobalValue *>(GV), KnownZero, KnownOne, - getDataLayout()); - unsigned AlignBits = KnownZero.countTrailingOnes(); + KnownBits Known(PtrWidth); + llvm::computeKnownBits(GV, Known, getDataLayout()); + unsigned AlignBits = Known.countMinTrailingZeros(); unsigned Align = AlignBits ? 
1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); @@ -7292,14 +7672,11 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const { // Currently all types are split in half. EVT LoVT, HiVT; - if (!VT.isVector()) { + if (!VT.isVector()) LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT); - } else { - unsigned NumElements = VT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(), - NumElements/2); - } + else + LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext()); + return std::make_pair(LoVT, HiVT); } @@ -7348,52 +7725,52 @@ Type *ConstantPoolSDNode::getType() const { return Val.ConstVal->getType(); } -bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, - APInt &SplatUndef, +bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, - bool isBigEndian) const { + bool IsBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); - unsigned sz = VT.getSizeInBits(); - if (MinSplatBits > sz) + unsigned VecWidth = VT.getSizeInBits(); + if (MinSplatBits > VecWidth) return false; - SplatValue = APInt(sz, 0); - SplatUndef = APInt(sz, 0); + // FIXME: The widths are based on this node's type, but build vectors can + // truncate their operands. + SplatValue = APInt(VecWidth, 0); + SplatUndef = APInt(VecWidth, 0); - // Get the bits. Bits with undefined values (when the corresponding element + // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared - // in SplatValue. If any of the values are not constant, give up and return + // in SplatValue. If any of the values are not constant, give up and return // false. - unsigned int nOps = getNumOperands(); - assert(nOps > 0 && "isConstantSplat has 0-size build vector"); - unsigned EltBitSize = VT.getScalarSizeInBits(); + unsigned int NumOps = getNumOperands(); + assert(NumOps > 0 && "isConstantSplat has 0-size build vector"); + unsigned EltWidth = VT.getScalarSizeInBits(); - for (unsigned j = 0; j < nOps; ++j) { - unsigned i = isBigEndian ? nOps-1-j : j; + for (unsigned j = 0; j < NumOps; ++j) { + unsigned i = IsBigEndian ? NumOps - 1 - j : j; SDValue OpVal = getOperand(i); - unsigned BitPos = j * EltBitSize; + unsigned BitPos = j * EltWidth; if (OpVal.isUndef()) - SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize); - else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) - SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize). - zextOrTrunc(sz) << BitPos; - else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) - SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos; - else + SplatUndef.setBits(BitPos, BitPos + EltWidth); + else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal)) + SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos); + else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal)) + SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); + else return false; } - // The build_vector is all constants or undefs. Find the smallest element + // The build_vector is all constants or undefs. Find the smallest element // size that splats the vector. 
- HasAnyUndefs = (SplatUndef != 0); - while (sz > 8) { - unsigned HalfSize = sz / 2; + // FIXME: This does not work for vectors with elements less than 8 bits. + while (VecWidth > 8) { + unsigned HalfSize = VecWidth / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); @@ -7407,10 +7784,10 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; - sz = HalfSize; + VecWidth = HalfSize; } - SplatBitSize = sz; + SplatBitSize = VecWidth; return true; } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 996c95bd5f07..b895da21a7ff 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -83,24 +83,6 @@ LimitFPPrecision("limit-float-precision", "for some float libcalls"), cl::location(LimitFloatPrecision), cl::init(0)); - -static cl::opt<bool> -EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden, - cl::desc("Enable fast-math-flags for DAG nodes")); - -/// Minimum jump table density for normal functions. -static cl::opt<unsigned> -JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden, - cl::desc("Minimum density for building a jump table in " - "a normal function")); - -/// Minimum jump table density for -Os or -Oz functions. -static cl::opt<unsigned> -OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden, - cl::desc("Minimum density for building a jump table in " - "an optsize function")); - - // Limit the width of DAG chains. This is important in general to prevent // DAG-based analysis from blowing up. For example, alias analysis and // load clustering may not complete in reasonable time. It is difficult to @@ -366,11 +348,12 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, return DAG.getUNDEF(ValueVT); } - if (ValueVT.getVectorNumElements() == 1 && - ValueVT.getVectorElementType() != PartEVT) - Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType()); + EVT ValueSVT = ValueVT.getVectorElementType(); + if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) + Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT) + : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT); - return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); + return DAG.getBuildVector(ValueVT, DL, Val); } static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl, @@ -541,7 +524,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); - Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops); + Val = DAG.getBuildVector(PartVT, DL, Ops); // FIXME: Use CONCAT for 2x -> 4x. 
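Stepping back to BuildVectorSDNode::isConstantSplat a few hunks up: the loop keeps halving the concatenated bit image of the build vector and merging the two halves until they disagree on a bit that is defined in both. A minimal standalone sketch of that search on plain 64-bit masks instead of APInt (function and variable names here are illustrative, not part of the patch); like the real code, it assumes undef positions are already cleared in Value:

#include <cassert>
#include <cstdint>

// Smallest element size (down to 8 bits) at which 'Value' is a splat.
// 'Undef' marks bits that came from undef elements; they match anything.
unsigned findSplatBitSize(uint64_t Value, uint64_t Undef, unsigned Width) {
  while (Width > 8) {
    unsigned Half = Width / 2;
    uint64_t Mask = (1ULL << Half) - 1;
    uint64_t HiVal = (Value >> Half) & Mask, LoVal = Value & Mask;
    uint64_t HiUnd = (Undef >> Half) & Mask, LoUnd = Undef & Mask;
    // The halves must agree wherever both are defined; an undef bit in
    // one half simply defers to the other half's bit.
    if ((HiVal & ~LoUnd) != (LoVal & ~HiUnd))
      break;
    Value = HiVal | LoVal; // keep the defined bits from both halves
    Undef = HiUnd & LoUnd; // a bit stays undef only if undef in both
    Width = Half;
  }
  return Width;
}

int main() {
  assert(findSplatBitSize(0x4242424242424242ULL, 0, 64) == 8);
  assert(findSplatBitSize(0x0000000100000001ULL, 0, 64) == 32);
}

This is also why the FIXME added in this hunk applies: with the smallest tracked size pinned at 8 bits, vectors with sub-byte elements cannot report their true splat size.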
@@ -561,10 +544,9 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, Val = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); - - Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } + assert(Val.getValueType() == PartVT && "Unexpected vector part value type"); Parts[0] = Val; return; } @@ -634,10 +616,6 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, } } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, @@ -683,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; - unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes(); + unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); if (NumZeroBits == RegSize) { // The current value is a zero. @@ -739,10 +717,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V, @@ -796,9 +770,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list. This adds the code marker and includes the number of -/// values added into it. void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, @@ -840,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, } } -void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, +void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { - AA = &aa; + AA = aa; GFI = gfi; LibInfo = li; DL = &DAG.getDataLayout(); @@ -850,12 +821,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, LPadToCallSiteMap.clear(); } -/// clear - Clear out the current SelectionDAG and the associated -/// state and prepare this SelectionDAGBuilder object to be used -/// for a new block. This doesn't clear out information about -/// additional blocks that are needed to complete switch lowering -/// or PHI node updating; that information is cleared out as it is -/// consumed. void SelectionDAGBuilder::clear() { NodeMap.clear(); UnusedArgNodeMap.clear(); @@ -867,21 +832,10 @@ void SelectionDAGBuilder::clear() { StatepointLowering.clear(); } -/// clearDanglingDebugInfo - Clear the dangling debug information -/// map. This function is separated from the clear so that debug -/// information that is dangling in a basic block can be properly -/// resolved in a different basic block. This allows the -/// SelectionDAG to resolve dangling debug information attached -/// to PHI nodes. 
void SelectionDAGBuilder::clearDanglingDebugInfo() { DanglingDebugInfoMap.clear(); } -/// getRoot - Return the current virtual root of the Selection DAG, -/// flushing any PendingLoad items. This must be done before emitting -/// a store or any other node that may need to be ordered after any -/// prior load instructions. -/// SDValue SelectionDAGBuilder::getRoot() { if (PendingLoads.empty()) return DAG.getRoot(); @@ -901,10 +855,6 @@ SDValue SelectionDAGBuilder::getRoot() { return Root; } -/// getControlRoot - Similar to getRoot, but instead of flushing all the -/// PendingLoad items, flush all the PendingExports items. It is necessary -/// to do this before emitting a terminator instruction. -/// SDValue SelectionDAGBuilder::getControlRoot() { SDValue Root = DAG.getRoot(); @@ -937,7 +887,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) { HandlePHINodesInSuccessorBlocks(I.getParent()); } - ++SDNodeOrder; + // Increase the SDNodeOrder if dealing with a non-debug instruction. + if (!isa<DbgInfoIntrinsic>(I)) + ++SDNodeOrder; CurInst = &I; @@ -1122,8 +1074,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa<ArrayType>(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { @@ -1175,7 +1126,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } // Create a BUILD_VECTOR node. - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } // If this is a static alloca, generate it as the frameindex instead of @@ -1185,7 +1136,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) return DAG.getFrameIndex(SI->second, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFrameIndexTy(DAG.getDataLayout())); } // If this is an instruction which fast-isel has deferred, select it now. @@ -1384,7 +1335,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { RetPtr.getValueType(), RetPtr, DAG.getIntPtrConstant(Offsets[i], getCurSDLoc()), - &Flags); + Flags); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), // FIXME: better loc info would be nice. 
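A note on the KnownBits change in getCopyFromRegs above: the old code kept separate KnownZero/KnownOne APInts and counted leading ones in the KnownZero mask; KnownBits bundles the two masks and names the query (countMinLeadingZeros) after what it proves. A toy model of just that query (an illustration, not LLVM's actual llvm::KnownBits):

#include <cassert>
#include <cstdint>

struct ToyKnownBits {
  uint64_t Zero = 0, One = 0; // bits proven 0 / proven 1
  // Leading ones of Zero are leading zeros of the value.
  unsigned countMinLeadingZeros() const {
    unsigned N = 0;
    for (int i = 63; i >= 0 && ((Zero >> i) & 1); --i)
      ++N;
    return N;
  }
};

int main() {
  ToyKnownBits K;
  K.Zero = ~0ULL << 8; // top 56 bits proven zero, e.g. a zext from i8
  assert(K.countMinLeadingZeros() == 56);
  K.Zero = ~0ULL;      // all 64 bits proven zero: the value is literally 0
  assert(K.countMinLeadingZeros() == 64);
}

When the count equals the register size, getCopyFromRegs can treat the live-in as the constant zero instead of attaching an assertion node.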
@@ -1403,16 +1354,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const Function *F = I.getParent()->getParent(); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); - bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::InReg); + bool RetInReg = F->getAttributes().hasAttribute( + AttributeList::ReturnIndex, Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; @@ -1582,7 +1533,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, BranchProbability TProb, - BranchProbability FProb) { + BranchProbability FProb, + bool InvertCond) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into @@ -1596,10 +1548,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, isExportableFromCurrentBlock(BOp->getOperand(1), BB))) { ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { - Condition = getICmpCondCode(IC->getPredicate()); + ICmpInst::Predicate Pred = + InvertCond ? IC->getInversePredicate() : IC->getPredicate(); + Condition = getICmpCondCode(Pred); } else { const FCmpInst *FC = cast<FCmpInst>(Cond); - Condition = getFCmpCondCode(FC->getPredicate()); + FCmpInst::Predicate Pred = + InvertCond ? FC->getInversePredicate() : FC->getPredicate(); + Condition = getFCmpCondCode(Pred); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } @@ -1612,7 +1568,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, } // Create a CaseBlock record representing this branch. - CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), + ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ; + CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()), nullptr, TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); } @@ -1625,16 +1582,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TProb, - BranchProbability FProb) { - // If this node is not part of the or/and tree, emit it as a branch. + BranchProbability FProb, + bool InvertCond) { + // Skip over not part of the tree and remember to invert op and operands at + // next level. + if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) { + const Value *CondOp = BinaryOperator::getNotArgument(Cond); + if (InBlock(CondOp, CurBB->getBasicBlock())) { + FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb, + !InvertCond); + return; + } + } + const Instruction *BOp = dyn_cast<Instruction>(Cond); + // Compute the effective opcode for Cond, taking into account whether it needs + // to be inverted, e.g. 
+ // and (not (or A, B)), C + // gets lowered as + // and (and (not A, not B), C) + unsigned BOpc = 0; + if (BOp) { + BOpc = BOp->getOpcode(); + if (InvertCond) { + if (BOpc == Instruction::And) + BOpc = Instruction::Or; + else if (BOpc == Instruction::Or) + BOpc = Instruction::And; + } + } + + // If this node is not part of the or/and tree, emit it as a branch. if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || - (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() || + BOpc != Opc || !BOp->hasOneUse() || BOp->getParent() != CurBB->getBasicBlock() || !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, - TProb, FProb); + TProb, FProb, InvertCond); return; } @@ -1669,14 +1654,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: @@ -1702,14 +1687,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, - NewTrueProb, NewFalseProb); + NewTrueProb, NewFalseProb, InvertCond); // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2}; BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - Probs[0], Probs[1]); + Probs[0], Probs[1], InvertCond); } } @@ -1793,7 +1778,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode, getEdgeProbability(BrMBB, Succ0MBB), - getEdgeProbability(BrMBB, Succ1MBB)); + getEdgeProbability(BrMBB, Succ1MBB), + /*InvertCond=*/false); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. 
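The InvertCond threading through FindMergedConditions above is De Morgan's law applied lazily: a one-use 'not' over an and/or tree flips the opcode at each interior node and is finally absorbed by inverting the predicate at each comparison leaf, so no explicit NOT is ever emitted. A compile-and-run sketch of the identity (all names hypothetical):

#include <cassert>

enum Opc { And, Or };
Opc invertOpc(Opc O) { return O == And ? Or : And; } // De Morgan flip

// not (a == x || b == y) lowers as (a != x && b != y): the Or becomes an
// And, and each leaf predicate is inverted instead of emitting a NOT.
bool loweredNotOfOr(int a, int b, int x, int y) {
  Opc O = invertOpc(Or);
  bool L = (a != x), R = (b != y); // leaves with inverted predicates
  return O == And ? (L && R) : (L || R);
}

int main() {
  for (int a = 0; a < 2; ++a)
    for (int b = 0; b < 2; ++b)
      assert(loweredNotOfOr(a, b, 1, 1) == !((a == 1) || (b == 1)));
}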
@@ -2027,7 +2013,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, Entry.Node = StackSlot; Entry.Ty = FnTy->getParamType(0); if (Fn->hasAttribute(1, Attribute::AttrKind::InReg)) - Entry.isInReg = true; + Entry.IsInReg = true; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); @@ -2581,15 +2567,15 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) { Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); Flags.setVectorReduction(vec_redux); - if (EnableFMFInDAG) { - Flags.setAllowReciprocal(FMF.allowReciprocal()); - Flags.setNoInfs(FMF.noInfs()); - Flags.setNoNaNs(FMF.noNaNs()); - Flags.setNoSignedZeros(FMF.noSignedZeros()); - Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); - } + Flags.setAllowReciprocal(FMF.allowReciprocal()); + Flags.setAllowContract(FMF.allowContract()); + Flags.setNoInfs(FMF.noInfs()); + Flags.setNoNaNs(FMF.noNaNs()); + Flags.setNoSignedZeros(FMF.noSignedZeros()); + Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); + SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), - Op1, Op2, &Flags); + Op1, Op2, Flags); setValue(&I, BinNodeValue); } @@ -2642,7 +2628,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, - &Flags); + Flags); setValue(&I, Res); } @@ -2654,7 +2640,7 @@ void SelectionDAGBuilder::visitSDiv(const User &I) { Flags.setExact(isa<PossiblyExactOperator>(&I) && cast<PossiblyExactOperator>(&I)->isExact()); setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, - Op2, &Flags)); + Op2, Flags)); } void SelectionDAGBuilder::visitICmp(const User &I) { @@ -2914,7 +2900,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) { DestVT, N)); // convert types. // Check if the original LLVM IR Operand was a ConstantInt, because getValue() // might fold any kind of constant expression to an integer constant and that - // is not what we are looking for. Only regcognize a bitcast of a genuine + // is not what we are looking for. Only recognize a bitcast of a genuine // constant integer as an opaque constant. else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0))) setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false, @@ -3067,14 +3053,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { if (SrcNumElts > MaskNumElts) { // Analyze the access pattern of the vector to see if we can extract - // two subvectors and do the shuffle. The analysis is done by calculating - // the range of elements the mask access on both vectors. - int MinRange[2] = { static_cast<int>(SrcNumElts), - static_cast<int>(SrcNumElts)}; - int MaxRange[2] = {-1, -1}; - - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + // two subvectors and do the shuffle. + int StartIdx[2] = { -1, -1 }; // StartIdx to extract from + bool CanExtract = true; + for (int Idx : Mask) { unsigned Input = 0; if (Idx < 0) continue; @@ -3083,41 +3065,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Input = 1; Idx -= SrcNumElts; } - if (Idx > MaxRange[Input]) - MaxRange[Input] = Idx; - if (Idx < MinRange[Input]) - MinRange[Input] = Idx; - } - - // Check if the access is smaller than the vector size and can we find - // a reasonable extract index. - int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not - // Extract. 
- int StartIdx[2]; // StartIdx to extract from - for (unsigned Input = 0; Input < 2; ++Input) { - if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { - RangeUse[Input] = 0; // Unused - StartIdx[Input] = 0; - continue; - } - // Find a good start index that is a multiple of the mask length. Then - // see if the rest of the elements are in range. - StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; - if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && - StartIdx[Input] + MaskNumElts <= SrcNumElts) - RangeUse[Input] = 1; // Extract from a multiple of the mask length. + // If all the indices come from the same MaskNumElts sized portion of + // the sources we can use extract. Also make sure the extract wouldn't + // extract past the end of the source. + int NewStartIdx = alignDown(Idx, MaskNumElts); + if (NewStartIdx + MaskNumElts > SrcNumElts || + (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx)) + CanExtract = false; + // Make sure we always update StartIdx as we use it to track if all + // elements are undef. + StartIdx[Input] = NewStartIdx; } - if (RangeUse[0] == 0 && RangeUse[1] == 0) { + if (StartIdx[0] < 0 && StartIdx[1] < 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } - if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { + if (CanExtract) { // Extract appropriate subvector and generate a vector shuffle for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? Src1 : Src2; - if (RangeUse[Input] == 0) + if (StartIdx[Input] < 0) Src = DAG.getUNDEF(VT); else { Src = DAG.getNode( @@ -3128,16 +3097,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { } // Calculate new mask. - SmallVector<int, 8> MappedOps; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; - if (Idx >= 0) { - if (Idx < (int)SrcNumElts) - Idx -= StartIdx[0]; - else - Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; - } - MappedOps.push_back(Idx); + SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end()); + for (int &Idx : MappedOps) { + if (Idx >= (int)SrcNumElts) + Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; + else if (Idx >= 0) + Idx -= StartIdx[0]; } setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps)); @@ -3151,8 +3116,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { EVT EltVT = VT.getVectorElementType(); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector<SDValue,8> Ops; - for (unsigned i = 0; i != MaskNumElts; ++i) { - int Idx = Mask[i]; + for (int Idx : Mask) { SDValue Res; if (Idx < 0) { @@ -3168,7 +3132,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Ops.push_back(Res); } - setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops)); + setValue(&I, DAG.getBuildVector(VT, DL, Ops)); } void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { @@ -3281,14 +3245,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); - // In an inbouds GEP with an offset that is nonnegative even when + // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. 
SDNodeFlags Flags; if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) Flags.setNoUnsignedWrap(true); N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, - DAG.getConstant(Offset, dl, N.getValueType()), &Flags); + DAG.getConstant(Offset, dl, N.getValueType()), Flags); } } else { MVT PtrTy = @@ -3318,7 +3282,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds()) Flags.setNoUnsignedWrap(true); - N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags); + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags); continue; } @@ -3396,7 +3360,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags); + DAG.getIntPtrConstant(StackAlign - 1, dl), Flags); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, @@ -3459,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory(MemoryLocation( + else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); @@ -3500,7 +3464,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { SDValue A = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, DAG.getConstant(Offsets[i], dl, PtrVT), - &Flags); + Flags); auto MMOFlags = MachineMemOperand::MONone; if (isVolatile) MMOFlags |= MachineMemOperand::MOVolatile; @@ -3571,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { Type *Ty = I.getType(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - assert(!AA->pointsToConstantMemory(MemoryLocation( - SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) && + assert((!AA || !AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) && "load_from_swift_error should not be constant memory"); SmallVector<EVT, 4> ValueVTs; @@ -3655,7 +3619,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT), &Flags); + DAG.getConstant(Offsets[i], dl, PtrVT), Flags); SDValue St = DAG.getStore( Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo); @@ -3853,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation( + bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation( PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo)); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); @@ -3897,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory(MemoryLocation( + AA && AA->pointsToConstantMemory(MemoryLocation( BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. @@ -3990,9 +3954,9 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDValue Ops[3]; Ops[0] = getRoot(); Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFenceOperandTy(DAG.getDataLayout())); Ops[2] = DAG.getConstant(I.getSynchScope(), dl, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFenceOperandTy(DAG.getDataLayout())); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } @@ -4695,7 +4659,7 @@ static unsigned getUnderlyingArgReg(const SDValue &N) { /// At the end of instruction selection, they will be inserted to the entry BB. bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( const Value *V, DILocalVariable *Variable, DIExpression *Expr, - DILocation *DL, int64_t Offset, bool IsIndirect, const SDValue &N) { + DILocation *DL, int64_t Offset, bool IsDbgDeclare, const SDValue &N) { const Argument *Arg = dyn_cast<Argument>(V); if (!Arg) return false; @@ -4709,9 +4673,11 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction())) return false; + bool IsIndirect = false; Optional<MachineOperand> Op; // Some arguments' frame index is recorded during argument lowering. - if (int FI = FuncInfo.getArgumentFrameIndex(Arg)) + int FI = FuncInfo.getArgumentFrameIndex(Arg); + if (FI != INT_MAX) Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { @@ -4722,15 +4688,19 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (PR) Reg = PR; } - if (Reg) + if (Reg) { Op = MachineOperand::CreateReg(Reg, false); + IsIndirect = IsDbgDeclare; + } } if (!Op) { // Check if ValueMap has reg number. DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V); - if (VMI != FuncInfo.ValueMap.end()) + if (VMI != FuncInfo.ValueMap.end()) { Op = MachineOperand::CreateReg(VMI->second, false); + IsIndirect = IsDbgDeclare; + } } if (!Op && N.getNode()) @@ -4752,7 +4722,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( else FuncInfo.ArgDbgValues.push_back( BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) + .add(*Op) .addImm(Offset) .addMetadata(Variable) .addMetadata(Expr)); @@ -4764,26 +4734,17 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, DILocalVariable *Variable, DIExpression *Expr, int64_t Offset, - DebugLoc dl, + const DebugLoc &dl, unsigned DbgSDNodeOrder) { - SDDbgValue *SDV; - auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode()); - if (FISDN && Expr->startsWithDeref()) { + if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) { // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe // stack slot locations as such instead of as indirectly addressed // locations. 
- ArrayRef<uint64_t> TrailingElements(Expr->elements_begin() + 1, - Expr->elements_end()); - DIExpression *DerefedDIExpr = - DIExpression::get(*DAG.getContext(), TrailingElements); - int FI = FISDN->getIndex(); - SDV = DAG.getFrameIndexDbgValue(Variable, DerefedDIExpr, FI, 0, dl, - DbgSDNodeOrder); - } else { - SDV = DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, - Offset, dl, DbgSDNodeOrder); + return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), 0, dl, + DbgSDNodeOrder); } - return SDV; + return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, + Offset, dl, DbgSDNodeOrder); } // VisualStudio defines setjmp as _setjmp @@ -4794,9 +4755,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, # define setjmp_undefined_for_msvc #endif -/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If -/// we want to emit this as a call to a named external function, return the name -/// otherwise lower it and return null. +/// Lower the call to the specified intrinsic function. If we want to emit this +/// as a call to a named external function, return the name. Otherwise, lower it +/// and return null. const char * SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -4912,11 +4873,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Entry.Node = Src; Args.push_back(Entry); - + Entry.Ty = I.getArgOperand(2)->getType(); Entry.Node = NumElements; Args.push_back(Entry); - + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); Entry.Node = ElementSize; Args.push_back(Entry); @@ -4929,14 +4890,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { report_fatal_error("Unsupported element size"); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl) - .setChain(getRoot()) - .setCallee(TLI.getLibcallCallingConv(LibraryCall), - Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol( - TLI.getLibcallName(LibraryCall), - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args)); + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( + TLI.getLibcallCallingConv(LibraryCall), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall), + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); DAG.setRoot(CallResult.second); @@ -4960,6 +4919,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast<Argument>(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return nullptr; + SDValue &N = NodeMap[Address]; if (!N.getNode() && isa<Argument>(Address)) // Check unused arguments map. @@ -4978,8 +4944,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else if (isa<Argument>(Address)) { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. 
- EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); + EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N); return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), @@ -4989,22 +4954,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N)) { - // If variable is pinned by a alloca in dominating bb then - // use StaticAllocaMap. - if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { - if (AI->getParent() != DI.getParent()) { - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, - 0, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, nullptr, false); - return nullptr; - } - } - } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } @@ -5026,45 +4977,33 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); - } else { - // Do not use getValue() in here; we don't want to generate code at - // this point if it hasn't been done yet. - SDValue N = NodeMap[V]; - if (!N.getNode() && isa<Argument>(V)) - // Check unused arguments map. - N = UnusedArgNodeMap[V]; - if (N.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, - false, N)) { - SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, N.getNode(), false); - } - } else if (!V->use_empty() ) { - // Do not call getValue(V) yet, as we don't want to generate code. - // Remember it for later. - DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); - DanglingDebugInfoMap[V] = DDI; - } else { - // We may expand this to cover more cases. One case where we have no - // data available is an unreferenced parameter. - DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - } + return nullptr; } - // Build a debug info table entry. - if (const BitCastInst *BCI = dyn_cast<BitCastInst>(V)) - V = BCI->getOperand(0); - const AllocaInst *AI = dyn_cast<AllocaInst>(V); - // Don't handle byval struct arguments or VLAs, for example. - if (!AI) { - DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); - DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); + // Do not use getValue() in here; we don't want to generate code at + // this point if it hasn't been done yet. + SDValue N = NodeMap[V]; + if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map. + N = UnusedArgNodeMap[V]; + if (N.getNode()) { + if (EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, false, + N)) + return nullptr; + SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, N.getNode(), false); return nullptr; } - DenseMap<const AllocaInst*, int>::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI == FuncInfo.StaticAllocaMap.end()) - return nullptr; // VLAs. + + if (!V->use_empty() ) { + // Do not call getValue(V) yet, as we don't want to generate code. + // Remember it for later. 
+ DanglingDebugInfo DDI(&DI, dl, SDNodeOrder); + DanglingDebugInfoMap[V] = DDI; + return nullptr; + } + + DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); + DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); return nullptr; } @@ -5202,7 +5141,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, sdl, MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); + ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, @@ -5301,6 +5240,25 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return nullptr; + case Intrinsic::experimental_constrained_fadd: + case Intrinsic::experimental_constrained_fsub: + case Intrinsic::experimental_constrained_fmul: + case Intrinsic::experimental_constrained_fdiv: + case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I)); + return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -5537,7 +5495,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::trap: { StringRef TrapFuncName = I.getAttributes() - .getAttribute(AttributeSet::FunctionIndex, "trap-func-name") + .getAttribute(AttributeList::FunctionIndex, "trap-func-name") .getValueAsString(); if (TrapFuncName.empty()) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? @@ -5548,7 +5506,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { TargetLowering::ArgListTy Args; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee( + CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee( CallingConv::C, I.getType(), DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy(DAG.getDataLayout())), @@ -5629,7 +5587,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = - DAG.getFrameIndex(FI, TLI.getPointerTy(DAG.getDataLayout()), true); + DAG.getFrameIndex(FI, TLI.getFrameIndexTy(DAG.getDataLayout()), true); unsigned Opcode = (IsStart ? 
ISD::LIFETIME_START : ISD::LIFETIME_END); Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); @@ -5690,7 +5648,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { int FI = FuncInfo.StaticAllocaMap[Slot]; MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(MF.getName()), Idx); + GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::LOCAL_ESCAPE)) .addSym(FrameAllocSym) @@ -5711,7 +5669,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX)); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(Fn->getName()), IdxVal); + GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal); // Create a MCSymbol for the label to avoid any target lowering // that would make this PC relative. @@ -5742,13 +5700,142 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, N); return nullptr; } + case Intrinsic::xray_customevent: { + // Here we want to make sure that the intrinsic behaves as if it has a + // specific calling convention, and only for x86_64. + // FIXME: Support other platforms later. + const auto &Triple = DAG.getTarget().getTargetTriple(); + if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux()) + return nullptr; + + SDLoc DL = getCurSDLoc(); + SmallVector<SDValue, 8> Ops; + // We want to say that we always want the arguments in registers. + SDValue LogEntryVal = getValue(I.getArgOperand(0)); + SDValue StrSizeVal = getValue(I.getArgOperand(1)); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Chain = getRoot(); + Ops.push_back(LogEntryVal); + Ops.push_back(StrSizeVal); + Ops.push_back(Chain); + + // We need to enforce the calling convention for the callsite, so that + // argument ordering is enforced correctly, and that register allocation can + // see that some registers may be assumed clobbered and have to preserve + // them across calls to the intrinsic. + MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL, + DL, NodeTys, Ops); + SDValue patchableNode = SDValue(MN, 0); + DAG.setRoot(patchableNode); + setValue(&I, patchableNode); + return nullptr; + } case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; + + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: { + visitVectorReduce(I, Intrinsic); + return nullptr; + } + } } +void SelectionDAGBuilder::visitConstrainedFPIntrinsic( + const ConstrainedFPIntrinsic &FPI) { + SDLoc sdl = getCurSDLoc(); + unsigned Opcode; + switch (FPI.getIntrinsicID()) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::experimental_constrained_fadd: + Opcode = ISD::STRICT_FADD; + break; + case Intrinsic::experimental_constrained_fsub: + Opcode = ISD::STRICT_FSUB; + break; + case Intrinsic::experimental_constrained_fmul: + Opcode = ISD::STRICT_FMUL; + break; + case Intrinsic::experimental_constrained_fdiv: + Opcode = ISD::STRICT_FDIV; + break; + case Intrinsic::experimental_constrained_frem: + Opcode = ISD::STRICT_FREM; + break; + case Intrinsic::experimental_constrained_sqrt: + Opcode = ISD::STRICT_FSQRT; + break; + case Intrinsic::experimental_constrained_pow: + Opcode = ISD::STRICT_FPOW; + break; + case Intrinsic::experimental_constrained_powi: + Opcode = ISD::STRICT_FPOWI; + break; + case Intrinsic::experimental_constrained_sin: + Opcode = ISD::STRICT_FSIN; + break; + case Intrinsic::experimental_constrained_cos: + Opcode = ISD::STRICT_FCOS; + break; + case Intrinsic::experimental_constrained_exp: + Opcode = ISD::STRICT_FEXP; + break; + case Intrinsic::experimental_constrained_exp2: + Opcode = ISD::STRICT_FEXP2; + break; + case Intrinsic::experimental_constrained_log: + Opcode = ISD::STRICT_FLOG; + break; + case Intrinsic::experimental_constrained_log10: + Opcode = ISD::STRICT_FLOG10; + break; + case Intrinsic::experimental_constrained_log2: + Opcode = ISD::STRICT_FLOG2; + break; + case Intrinsic::experimental_constrained_rint: + Opcode = ISD::STRICT_FRINT; + break; + case Intrinsic::experimental_constrained_nearbyint: + Opcode = ISD::STRICT_FNEARBYINT; + break; + } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Chain = getRoot(); + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs); + ValueVTs.push_back(MVT::Other); // Out chain + + SDVTList VTs = DAG.getVTList(ValueVTs); + SDValue Result; + if (FPI.isUnaryOp()) + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)) }); + else + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)), + getValue(FPI.getArgOperand(1)) }); + + assert(Result.getNode()->getNumValues() == 2); + SDValue OutChain = Result.getValue(1); + DAG.setRoot(OutChain); + SDValue FPResult = Result.getValue(0); + setValue(&FPI, FPResult); +} + std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, const BasicBlock *EHPadBB) { @@ -5827,7 +5914,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Type *RetTy = CS.getType(); TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; Args.reserve(CS.arg_size()); const Value *SwiftErrorVal = nullptr; @@ -5843,6 +5929,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { + TargetLowering::ArgListEntry Entry; const Value *V = *i; // Skip empty types @@ -5852,11 +5939,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue ArgNode = getValue(V); Entry.Node = ArgNode; Entry.Ty = V->getType(); - // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Entry.setAttributes(&CS, i - CS.arg_begin()); // Use swifterror virtual register as input to the call. - if (Entry.isSwiftError && TLI.supportSwiftError()) { + if (Entry.IsSwiftError && TLI.supportSwiftError()) { SwiftErrorVal = V; // We find the virtual register for the actual swifterror argument. // Instead of using the Value, we use the virtual register instead. 
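
Aside: the visitConstrainedFPIntrinsic hunk above maps each experimental_constrained_* intrinsic onto a STRICT_* node that takes the incoming chain as an extra operand and produces {result, out-chain}. A minimal standalone sketch of that chain-threading idea (illustrative C++ only; Chain, StrictResult and the strict* helpers are invented names, not LLVM API):

#include <cmath>
#include <cstdio>

// Illustrative model: every strict FP op consumes the current chain token and
// produces a new one, so a later op depends on the earlier op even with no
// data dependence, and the scheduler cannot reorder them across points where
// the rounding mode or FP status flags may be read or written.
struct Chain { unsigned Tick = 0; };             // stand-in for an SDValue chain
struct StrictResult { double Value; Chain Out; };

StrictResult strictSqrt(Chain In, double X) {
  return {std::sqrt(X), Chain{In.Tick + 1}};     // result plus new chain
}

StrictResult strictFAdd(Chain In, double A, double B) {
  return {A + B, Chain{In.Tick + 1}};
}

int main() {
  Chain Root;                                          // like DAG.getRoot()
  StrictResult R1 = strictSqrt(Root, 2.0);             // ordered after Root
  StrictResult R2 = strictFAdd(R1.Out, R1.Value, 1.0); // ordered after R1
  std::printf("%f tick=%u\n", R2.Value, R2.Out.Tick);  // 2.414214 tick=2
  return 0;                       // R2.Out would become the new root chain
}
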
@@ -5869,7 +5955,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // If we have an explicit sret argument that is an Instruction, (i.e., it // might point to function-local memory), we can't meaningfully tail-call. - if (Entry.isSRet && isa<Instruction>(V)) + if (Entry.IsSRet && isa<Instruction>(V)) isTailCall = false; } @@ -5912,8 +5998,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, } } -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. +/// Return true if it only matters that the value is equal or not-equal to zero. static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { for (const User *U : V->users()) { if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -5928,13 +6013,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) { } static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, - Type *LoadTy, SelectionDAGBuilder &Builder) { // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) { // Cast pointer to the type we really want to load. + Type *LoadTy = + Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits()); + if (LoadVT.isVector()) + LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements()); + LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput), PointerType::getUnqual(LoadTy)); @@ -5949,7 +6038,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. - if (Builder.AA->pointsToConstantMemory(PtrVal)) { + if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { @@ -5967,8 +6056,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, return LoadVal; } -/// processIntegerCallValue - Record the value for an instruction that -/// produces an integer result, converting the type where necessary. +/// Record the value for an instruction that produces an integer result, +/// converting the type where necessary. void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { @@ -5981,20 +6070,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, setValue(&I, Value); } -/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a memcmp call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. 
int memcmp(void*,void*,size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); - if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || - !I.getArgOperand(2)->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; - const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast<ConstantInt>(Size); if (CSize && CSize->getZExtValue() == 0) { @@ -6005,11 +6087,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { } const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); - std::pair<SDValue, SDValue> Res = - TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(), - getValue(LHS), getValue(RHS), getValue(Size), - MachinePointerInfo(LHS), - MachinePointerInfo(RHS)); + std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp( + DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS), + getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS)); if (Res.first.getNode()) { processIntegerCallValue(I, Res.first, true); PendingLoads.push_back(Res.second); @@ -6018,88 +6098,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 - if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) { - bool ActuallyDoIt = true; - MVT LoadVT; - Type *LoadTy; - switch (CSize->getZExtValue()) { - default: - LoadVT = MVT::Other; - LoadTy = nullptr; - ActuallyDoIt = false; - break; - case 2: - LoadVT = MVT::i16; - LoadTy = Type::getInt16Ty(CSize->getContext()); - break; - case 4: - LoadVT = MVT::i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - break; - case 8: - LoadVT = MVT::i64; - LoadTy = Type::getInt64Ty(CSize->getContext()); - break; - /* - case 16: - LoadVT = MVT::v4i32; - LoadTy = Type::getInt32Ty(CSize->getContext()); - LoadTy = VectorType::get(LoadTy, 4); - break; - */ - } - - // This turns into unaligned loads. We only do this if the target natively - // supports the MVT we'll be loading or if it is small enough (<= 4) that - // we'll only produce a small number of byte loads. + if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I)) + return false; - // Require that we can find a legal MVT, and only do this if the target - // supports unaligned loads of that type. Expanding into byte loads would - // bloat the code. + // If the target has a fast compare for the given size, it will return a + // preferred load type for that size. Require that the load VT is legal and + // that the target supports unaligned loads of that type. Otherwise, return + // INVALID. + auto hasFastLoadsAndCompare = [&](unsigned NumBits) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (ActuallyDoIt && CSize->getZExtValue() > 4) { - unsigned DstAS = LHS->getType()->getPointerAddressSpace(); - unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + MVT LVT = TLI.hasFastEqualityCompare(NumBits); + if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) { // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. // TODO: Check alignment of src and dest ptrs. 
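
For illustration, a standalone sketch of the size dispatch this rewritten visitMemCmpCall builds up to, with LoadKind and a boolean standing in for MVT and the TLI.hasFastEqualityCompare hook (invented names, not the LLVM API):

#include <cassert>
#include <cstdint>

enum class LoadKind { None, I16, I32, TargetPreferred };

// 2- and 4-byte memcmps always become wide loads plus one SETNE; 8-, 16- and
// 32-byte memcmps are expanded only if the target reports a fast equality
// compare of that width (which must also be a legal, misalign-tolerant type).
LoadKind pickMemcmpLoad(uint64_t NumBytes, bool TargetHasFastCompare) {
  switch (NumBytes * 8) {
  default:
    return LoadKind::None;          // other sizes stay a real memcmp call
  case 16:
    return LoadKind::I16;
  case 32:
    return LoadKind::I32;
  case 64:
  case 128:
  case 256:                         // vector loads get bitcast for the setcc
    return TargetHasFastCompare ? LoadKind::TargetPreferred : LoadKind::None;
  }
}

int main() {
  assert(pickMemcmpLoad(4, false) == LoadKind::I32);
  assert(pickMemcmpLoad(16, true) == LoadKind::TargetPreferred);
  assert(pickMemcmpLoad(5, true) == LoadKind::None);  // odd size: libcall
  return 0;
}
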
- if (!TLI.isTypeLegal(LoadVT) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || - !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) - ActuallyDoIt = false; + unsigned DstAS = LHS->getType()->getPointerAddressSpace(); + unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); + if (!TLI.isTypeLegal(LVT) || + !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) || + !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS)) + LVT = MVT::INVALID_SIMPLE_VALUE_TYPE; } - if (ActuallyDoIt) { - SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); - SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); + return LVT; + }; - SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, - ISD::SETNE); - processIntegerCallValue(I, Res, false); - return true; - } + // This turns into unaligned loads. We only do this if the target natively + // supports the MVT we'll be loading or if it is small enough (<= 4) that + // we'll only produce a small number of byte loads. + MVT LoadVT; + unsigned NumBitsToCompare = CSize->getZExtValue() * 8; + switch (NumBitsToCompare) { + default: + return false; + case 16: + LoadVT = MVT::i16; + break; + case 32: + LoadVT = MVT::i32; + break; + case 64: + case 128: + case 256: + LoadVT = hasFastLoadsAndCompare(NumBitsToCompare); + break; } + if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE) + return false; - return false; + SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this); + SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this); + + // Bitcast to a wide integer type if the loads are vectors. + if (LoadVT.isVector()) { + EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits()); + LoadL = DAG.getBitcast(CmpVT, LoadL); + LoadR = DAG.getBitcast(CmpVT, LoadR); + } + + SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE); + processIntegerCallValue(I, Cmp, false); + return true; } -/// visitMemChrCall -- See if we can lower a memchr call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a memchr call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { - // Verify that the prototype makes sense. void *memchr(void *, int, size_t) - if (I.getNumArgOperands() != 3) - return false; - const Value *Src = I.getArgOperand(0); const Value *Char = I.getArgOperand(1); const Value *Length = I.getArgOperand(2); - if (!Src->getType()->isPointerTy() || - !Char->getType()->isIntegerTy() || - !Length->getType()->isIntegerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6115,15 +6186,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) { return false; } -/// -/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to -/// to adjust the dst pointer by the size of the copied memory. +/// See if we can lower a mempcpy call into an optimized form. If so, return +/// true and lower it. Otherwise return false, and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. 
bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { - - // Verify argument count: void *mempcpy(void *, const void *, size_t) - if (I.getNumArgOperands() != 3) - return false; - SDValue Dst = getValue(I.getArgOperand(0)); SDValue Src = getValue(I.getArgOperand(1)); SDValue Size = getValue(I.getArgOperand(2)); @@ -6158,19 +6226,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { return true; } -/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an -/// optimized form. If so, return true and lower it, otherwise return false -/// and it will be lowered like a normal call. +/// See if we can lower a strcpy call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { - // Verify that the prototype makes sense. char *strcpy(char *, char *) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isPointerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6187,19 +6249,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) { return false; } -/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form. -/// If so, return true and lower it, otherwise return false and it will be -/// lowered like a normal call. +/// See if we can lower a strcmp call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { - // Verify that the prototype makes sense. int strcmp(void*,void*) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isPointerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6216,17 +6272,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) { return false; } -/// visitStrLenCall -- See if we can lower a strlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { - // Verify that the prototype makes sense. 
size_t strlen(char *) - if (I.getNumArgOperands() != 1) - return false; - const Value *Arg0 = I.getArgOperand(0); - if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6241,19 +6293,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) { return false; } -/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized -/// form. If so, return true and lower it, otherwise return false and it -/// will be lowered like a normal call. +/// See if we can lower a strnlen call into an optimized form. If so, return +/// true and lower it, otherwise return false and it will be lowered like a +/// normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { - // Verify that the prototype makes sense. size_t strnlen(char *, size_t) - if (I.getNumArgOperands() != 2) - return false; - const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1); - if (!Arg0->getType()->isPointerTy() || - !Arg1->getType()->isIntegerTy() || - !I.getType()->isIntegerTy()) - return false; const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); std::pair<SDValue, SDValue> Res = @@ -6269,16 +6315,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { return false; } -/// visitUnaryFloatCall - If a call instruction is a unary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a unary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it, otherwise return +/// false and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a unary floating-point call. - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. + if (!I.onlyReadsMemory()) return false; SDValue Tmp = getValue(I.getArgOperand(0)); @@ -6286,17 +6331,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, return true; } -/// visitBinaryFloatCall - If a call instruction is a binary floating-point -/// operation (as expected), translate it to an SDNode with the specified opcode -/// and return true. +/// See if we can lower a binary floating-point operation into an SDNode with +/// the specified Opcode. If so, return true and lower it. Otherwise return +/// false, and it will be lowered like a normal call. +/// The caller already checked that \p I calls the appropriate LibFunc with a +/// correct prototype. bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { - // Sanity check that it really is a binary floating-point call. - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) + // We already checked this call's prototype; verify it doesn't modify errno. 
+ if (!I.onlyReadsMemory()) return false; SDValue Tmp0 = getValue(I.getArgOperand(0)); @@ -6336,20 +6379,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for // some reason. - LibFunc::Func Func; + LibFunc Func; if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && - LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - if (I.getNumArgOperands() == 2 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.getType() == I.getArgOperand(1)->getType() && - I.onlyReadsMemory()) { + case LibFunc_copysign: + case LibFunc_copysignf: + case LibFunc_copysignl: + // We already checked this call's prototype; verify it doesn't modify + // errno. + if (I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(), @@ -6357,122 +6398,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { return; } break; - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: + case LibFunc_fabs: + case LibFunc_fabsf: + case LibFunc_fabsl: if (visitUnaryFloatCall(I, ISD::FABS)) return; break; - case LibFunc::fmin: - case LibFunc::fminf: - case LibFunc::fminl: + case LibFunc_fmin: + case LibFunc_fminf: + case LibFunc_fminl: if (visitBinaryFloatCall(I, ISD::FMINNUM)) return; break; - case LibFunc::fmax: - case LibFunc::fmaxf: - case LibFunc::fmaxl: + case LibFunc_fmax: + case LibFunc_fmaxf: + case LibFunc_fmaxl: if (visitBinaryFloatCall(I, ISD::FMAXNUM)) return; break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: + case LibFunc_sin: + case LibFunc_sinf: + case LibFunc_sinl: if (visitUnaryFloatCall(I, ISD::FSIN)) return; break; - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: + case LibFunc_cos: + case LibFunc_cosf: + case LibFunc_cosl: if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; - case LibFunc::sqrt: - case LibFunc::sqrtf: - case LibFunc::sqrtl: - case LibFunc::sqrt_finite: - case LibFunc::sqrtf_finite: - case LibFunc::sqrtl_finite: + case LibFunc_sqrt: + case LibFunc_sqrtf: + case LibFunc_sqrtl: + case LibFunc_sqrt_finite: + case LibFunc_sqrtf_finite: + case LibFunc_sqrtl_finite: if (visitUnaryFloatCall(I, ISD::FSQRT)) return; break; - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: + case LibFunc_floor: + case LibFunc_floorf: + case LibFunc_floorl: if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; break; - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: + case LibFunc_nearbyint: + case LibFunc_nearbyintf: + case LibFunc_nearbyintl: if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; break; - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: + case LibFunc_ceil: + case LibFunc_ceilf: + case LibFunc_ceill: if (visitUnaryFloatCall(I, ISD::FCEIL)) return; break; - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: + case LibFunc_rint: + case LibFunc_rintf: + case LibFunc_rintl: if (visitUnaryFloatCall(I, ISD::FRINT)) return; break; - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: + case LibFunc_round: + case LibFunc_roundf: + case 
LibFunc_roundl:
      if (visitUnaryFloatCall(I, ISD::FROUND))
        return;
      break;
-    case LibFunc::trunc:
-    case LibFunc::truncf:
-    case LibFunc::truncl:
+    case LibFunc_trunc:
+    case LibFunc_truncf:
+    case LibFunc_truncl:
      if (visitUnaryFloatCall(I, ISD::FTRUNC))
        return;
      break;
-    case LibFunc::log2:
-    case LibFunc::log2f:
-    case LibFunc::log2l:
+    case LibFunc_log2:
+    case LibFunc_log2f:
+    case LibFunc_log2l:
      if (visitUnaryFloatCall(I, ISD::FLOG2))
        return;
      break;
-    case LibFunc::exp2:
-    case LibFunc::exp2f:
-    case LibFunc::exp2l:
+    case LibFunc_exp2:
+    case LibFunc_exp2f:
+    case LibFunc_exp2l:
      if (visitUnaryFloatCall(I, ISD::FEXP2))
        return;
      break;
-    case LibFunc::memcmp:
+    case LibFunc_memcmp:
      if (visitMemCmpCall(I))
        return;
      break;
-    case LibFunc::mempcpy:
+    case LibFunc_mempcpy:
      if (visitMemPCpyCall(I))
        return;
      break;
-    case LibFunc::memchr:
+    case LibFunc_memchr:
      if (visitMemChrCall(I))
        return;
      break;
-    case LibFunc::strcpy:
+    case LibFunc_strcpy:
      if (visitStrCpyCall(I, false))
        return;
      break;
-    case LibFunc::stpcpy:
+    case LibFunc_stpcpy:
      if (visitStrCpyCall(I, true))
        return;
      break;
-    case LibFunc::strcmp:
+    case LibFunc_strcmp:
      if (visitStrCmpCall(I))
        return;
      break;
-    case LibFunc::strlen:
+    case LibFunc_strlen:
      if (visitStrLenCall(I))
        return;
      break;
-    case LibFunc::strnlen:
+    case LibFunc_strnlen:
      if (visitStrNLenCall(I))
        return;
      break;
@@ -6648,7 +6689,7 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
     unsigned Align = DL.getPrefTypeAlignment(Ty);
     MachineFunction &MF = DAG.getMachineFunction();
     int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
-    SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy(DL));
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
     Chain = DAG.getStore(Chain, Location, OpInfo.CallOperand, StackSlot,
                          MachinePointerInfo::getFixedStack(MF, SSFI));
     OpInfo.CallOperand = StackSlot;
@@ -6671,12 +6712,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
   MachineFunction &MF = DAG.getMachineFunction();
   SmallVector<unsigned, 4> Regs;
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   // If this is a constraint for a single physreg, or a constraint for a
   // register class, find it.
   std::pair<unsigned, const TargetRegisterClass *> PhysReg =
-      TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(),
-                                       OpInfo.ConstraintCode,
+      TLI.getRegForInlineAsmConstraint(&TRI, OpInfo.ConstraintCode,
                                       OpInfo.ConstraintVT);
   unsigned NumRegs = 1;
@@ -6684,12 +6725,12 @@
   // If this is a FP input in an integer register (or vice versa) insert a bit
   // cast of the input value. More generally, handle any case where the input
   // value disagrees with the register class we plan to stick this in.
-  if (OpInfo.Type == InlineAsm::isInput &&
-      PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+  if (OpInfo.Type == InlineAsm::isInput && PhysReg.second &&
+      !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) {
    // Try to convert to the first EVT that the reg class contains. If the
    // types are identical size, use a bitcast to convert (e.g. two differing
    // vector types).
-    MVT RegVT = *PhysReg.second->vt_begin();
+    MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second);
     if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) {
       OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT,
                                        OpInfo.CallOperand);
@@ -6717,12 +6758,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
   if (unsigned AssignedReg = PhysReg.first) {
     const TargetRegisterClass *RC = PhysReg.second;
     if (OpInfo.ConstraintVT == MVT::Other)
-      ValueVT = *RC->vt_begin();
+      ValueVT = *TRI.legalclasstypes_begin(*RC);
     // Get the actual register value type. This is important, because the user
     // may have asked for (e.g.) the AX register in i32 type. We need to
     // remember that AX is actually i16 to get the right extension.
-    RegVT = *RC->vt_begin();
+    RegVT = *TRI.legalclasstypes_begin(*RC);
     // This is an explicit reference to a physical register.
     Regs.push_back(AssignedReg);
@@ -6748,7 +6789,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
   // Otherwise, if this was a reference to an LLVM register class, create vregs
   // for this reference.
   if (const TargetRegisterClass *RC = PhysReg.second) {
-    RegVT = *RC->vt_begin();
+    RegVT = *TRI.legalclasstypes_begin(*RC);
     if (OpInfo.ConstraintVT == MVT::Other)
       ValueVT = RegVT;
@@ -7361,7 +7402,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
   // Populate the argument list.
   // Attributes for args start at offset 1, after the return attribute.
-  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;
        ArgI != ArgE; ++ArgI) {
     const Value *V = CS->getOperand(ArgI);
@@ -7370,7 +7411,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
     TargetLowering::ArgListEntry Entry;
     Entry.Node = getValue(V);
     Entry.Ty = V->getType();
-    Entry.setAttributes(&CS, AttrI);
+    Entry.setAttributes(&CS, ArgIdx);
     Args.push_back(Entry);
   }
@@ -7411,7 +7452,7 @@ static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
     } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) {
       const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
       Ops.push_back(Builder.DAG.getTargetFrameIndex(
-          FI->getIndex(), TLI.getPointerTy(Builder.DAG.getDataLayout())));
+          FI->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout())));
     } else
       Ops.push_back(OpVal);
   }
@@ -7437,11 +7478,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
   // have to worry about calling conventions and target specific lowering code.
   // Instead we perform the call lowering right here.
   //
-  // chain, flag = CALLSEQ_START(chain, 0)
+  // chain, flag = CALLSEQ_START(chain, 0, 0)
   // chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
   // chain, flag = CALLSEQ_END(chain, 0, 0, flag)
   //
-  Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+  Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
   InFlag = Chain.getValue(1);
   // Add the <id> and <numBytes> constants.
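
Aside: the populateCallLoweringInfo change above is one instance of the AttributeSet-to-AttributeList migration running through this diff; the dropped AttrI counter existed only because the old interface addressed parameter attributes at 1-based slots. A toy restatement of the 0-based addressing (ToyAttrList is invented for illustration, not the LLVM class):

#include <cassert>
#include <string>
#include <vector>

// Toy model: slot 0 is no longer "return value" for parameter lookups;
// accessors take the 0-based argument number directly, and the return and
// function slots get named constants instead of magic offsets.
struct ToyAttrList {
  static const unsigned ReturnIndex = 0U;      // mirrors AttributeList's names
  static const unsigned FunctionIndex = ~0U;

  std::string RetAttrs, FnAttrs;
  std::vector<std::string> ArgAttrs;

  const std::string &getParamAttrs(unsigned ArgNo) const {
    return ArgAttrs.at(ArgNo);                 // 0-based, no "+ 1" fixup
  }
};

int main() {
  ToyAttrList L{"zeroext", "nounwind", {"sret", "byval"}};
  assert(L.getParamAttrs(0) == "sret");        // first argument is index 0
  return 0;
}
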
@@ -7631,9 +7672,79 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } -/// Returns an AttributeSet representing the attributes applied to the return +void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, + unsigned Intrinsic) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2; + if (I.getNumArgOperands() > 1) + Op2 = getValue(I.getArgOperand(1)); + SDLoc dl = getCurSDLoc(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + SDValue Res; + FastMathFlags FMF; + if (isa<FPMathOperator>(I)) + FMF = I.getFastMathFlags(); + SDNodeFlags SDFlags; + SDFlags.setNoNaNs(FMF.noNaNs()); + + switch (Intrinsic) { + case Intrinsic::experimental_vector_reduce_fadd: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_fmul: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_add: + Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_mul: + Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_and: + Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_or: + Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_xor: + Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smax: + Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smin: + Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umax: + Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umin: + Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_fmax: { + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + break; + } + case Intrinsic::experimental_vector_reduce_fmin: { + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + break; + } + default: + llvm_unreachable("Unhandled vector reduce intrinsic"); + } + setValue(&I, Res); +} + +/// Returns an AttributeList representing the attributes applied to the return /// value of the given call. 
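
Aside: the visitVectorReduce code above keeps two FP flavors apart: VECREDUCE_STRICT_FADD/FMUL fold a scalar accumulator (Op1) through the vector (Op2) in lane order, while the plain VECREDUCE forms are emitted only under unsafe-algebra and may reassociate. A standalone sketch of why the distinction is observable (illustrative C++; the pairwise tree is just one permitted reassociation):

#include <cstddef>
#include <cstdio>
#include <vector>

double strictFAddReduce(double Acc, const std::vector<double> &V) {
  for (double Lane : V)          // ordered: ((Acc + v0) + v1) + ...
    Acc += Lane;
  return Acc;
}

double fastFAddReduce(std::vector<double> V) {
  while (V.size() > 1) {         // reassociated: pairwise reduction tree
    std::vector<double> Next;
    for (std::size_t I = 0; I + 1 < V.size(); I += 2)
      Next.push_back(V[I] + V[I + 1]);
    if (V.size() % 2)
      Next.push_back(V.back());
    V = Next;
  }
  return V.empty() ? 0.0 : V[0];
}

int main() {
  std::vector<double> V{1e16, 1.0, -1e16, 1.0};
  // Rounding differs between the two orders: 1 (ordered) vs 0 (pairwise),
  // which is exactly why the strict nodes exist.
  std::printf("strict=%g fast=%g\n", strictFAddReduce(0.0, V),
              fastFAddReduce(V));
  return 0;
}
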
-static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { +static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { SmallVector<Attribute::AttrKind, 2> Attrs; if (CLI.RetSExt) Attrs.push_back(Attribute::SExt); @@ -7642,8 +7753,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { if (CLI.IsInReg) Attrs.push_back(Attribute::InReg); - return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, - Attrs); + return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex, + Attrs); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -7679,19 +7790,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false); Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy); - DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy(DL)); + DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL)); ArgListEntry Entry; Entry.Node = DemoteStackSlot; Entry.Ty = StackSlotPtrType; - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isInReg = false; - Entry.isSRet = true; - Entry.isNest = false; - Entry.isByVal = false; - Entry.isReturned = false; - Entry.isSwiftSelf = false; - Entry.isSwiftError = false; + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsInReg = false; + Entry.IsSRet = true; + Entry.IsNest = false; + Entry.IsByVal = false; + Entry.IsReturned = false; + Entry.IsSwiftSelf = false; + Entry.IsSwiftError = false; Entry.Alignment = Align; CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); @@ -7724,7 +7835,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { for (unsigned i = 0, e = Args.size(); i != e; ++i) { - if (Args[i].isSwiftError) { + if (Args[i].IsSwiftError) { ISD::InputArg MyFlags; MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); @@ -7741,7 +7852,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs); Type *FinalType = Args[i].Ty; - if (Args[i].isByVal) + if (Args[i].IsByVal) FinalType = cast<PointerType>(Args[i].Ty)->getElementType(); bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg); @@ -7754,11 +7865,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ISD::ArgFlagsTy Flags; unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - if (Args[i].isZExt) + if (Args[i].IsZExt) Flags.setZExt(); - if (Args[i].isSExt) + if (Args[i].IsSExt) Flags.setSExt(); - if (Args[i].isInReg) { + if (Args[i].IsInReg) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (CLI.CallConv == CallingConv::X86_VectorCall && @@ -7771,15 +7882,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Set InReg Flag Flags.setInReg(); } - if (Args[i].isSRet) + if (Args[i].IsSRet) Flags.setSRet(); - if (Args[i].isSwiftSelf) + if (Args[i].IsSwiftSelf) Flags.setSwiftSelf(); - if (Args[i].isSwiftError) + if (Args[i].IsSwiftError) Flags.setSwiftError(); - if (Args[i].isByVal) + if (Args[i].IsByVal) Flags.setByVal(); - if (Args[i].isInAlloca) { + if (Args[i].IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that 
don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -7788,7 +7899,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // in the various CC lowering callbacks. Flags.setByVal(); } - if (Args[i].isByVal || Args[i].isInAlloca) { + if (Args[i].IsByVal || Args[i].IsInAlloca) { PointerType *Ty = cast<PointerType>(Args[i].Ty); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); @@ -7801,7 +7912,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { FrameAlign = getByValTypeAlignment(ElementTy, DL); Flags.setByValAlign(FrameAlign); } - if (Args[i].isNest) + if (Args[i].IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); @@ -7812,13 +7923,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector<SDValue, 4> Parts(NumParts); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (Args[i].isSExt) + if (Args[i].IsSExt) ExtendKind = ISD::SIGN_EXTEND; - else if (Args[i].isZExt) + else if (Args[i].IsZExt) ExtendKind = ISD::ZERO_EXTEND; // Conservatively only handle 'returned' on non-vectors for now - if (Args[i].isReturned && !Op.getValueType().isVector()) { + if (Args[i].IsReturned && !Op.getValueType().isVector()) { assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues && "unexpected use of 'returned'"); // Before passing 'returned' to the target lowering code, ensure that @@ -7832,9 +7943,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // parameter extension method is not compatible with the return // extension method if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) || - (ExtendKind != ISD::ANY_EXTEND && - CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt)) - Flags.setReturned(); + (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt && + CLI.RetZExt == Args[i].IsZExt)) + Flags.setReturned(); } getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, @@ -7916,7 +8027,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, - PtrVT), &Flags); + PtrVT), Flags); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), @@ -8010,6 +8121,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { return true; } +typedef DenseMap<const Argument *, + std::pair<const AllocaInst *, const StoreInst *>> + ArgCopyElisionMapTy; + +/// Scan the entry block of the function in FuncInfo for arguments that look +/// like copies into a local alloca. Record any copied arguments in +/// ArgCopyElisionCandidates. +static void +findArgumentCopyElisionCandidates(const DataLayout &DL, + FunctionLoweringInfo *FuncInfo, + ArgCopyElisionMapTy &ArgCopyElisionCandidates) { + // Record the state of every static alloca used in the entry block. Argument + // allocas are all used in the entry block, so we need approximately as many + // entries as we have arguments. 
+ enum StaticAllocaInfo { Unknown, Clobbered, Elidable }; + SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas; + unsigned NumArgs = FuncInfo->Fn->arg_size(); + StaticAllocas.reserve(NumArgs * 2); + + auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * { + if (!V) + return nullptr; + V = V->stripPointerCasts(); + const auto *AI = dyn_cast<AllocaInst>(V); + if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI)) + return nullptr; + auto Iter = StaticAllocas.insert({AI, Unknown}); + return &Iter.first->second; + }; + + // Look for stores of arguments to static allocas. Look through bitcasts and + // GEPs to handle type coercions, as long as the alloca is fully initialized + // by the store. Any non-store use of an alloca escapes it and any subsequent + // unanalyzed store might write it. + // FIXME: Handle structs initialized with multiple stores. + for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) { + // Look for stores, and handle non-store uses conservatively. + const auto *SI = dyn_cast<StoreInst>(&I); + if (!SI) { + // We will look through cast uses, so ignore them completely. + if (I.isCast()) + continue; + // Ignore debug info intrinsics, they don't escape or store to allocas. + if (isa<DbgInfoIntrinsic>(I)) + continue; + // This is an unknown instruction. Assume it escapes or writes to all + // static alloca operands. + for (const Use &U : I.operands()) { + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U)) + *Info = StaticAllocaInfo::Clobbered; + } + continue; + } + + // If the stored value is a static alloca, mark it as escaped. + if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand())) + *Info = StaticAllocaInfo::Clobbered; + + // Check if the destination is a static alloca. + const Value *Dst = SI->getPointerOperand()->stripPointerCasts(); + StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst); + if (!Info) + continue; + const AllocaInst *AI = cast<AllocaInst>(Dst); + + // Skip allocas that have been initialized or clobbered. + if (*Info != StaticAllocaInfo::Unknown) + continue; + + // Check if the stored value is an argument, and that this store fully + // initializes the alloca. Don't elide copies from the same argument twice. + const Value *Val = SI->getValueOperand()->stripPointerCasts(); + const auto *Arg = dyn_cast<Argument>(Val); + if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || + Arg->getType()->isEmptyTy() || + DL.getTypeStoreSize(Arg->getType()) != + DL.getTypeAllocSize(AI->getAllocatedType()) || + ArgCopyElisionCandidates.count(Arg)) { + *Info = StaticAllocaInfo::Clobbered; + continue; + } + + DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n'); + + // Mark this alloca and store for argument copy elision. + *Info = StaticAllocaInfo::Elidable; + ArgCopyElisionCandidates.insert({Arg, {AI, SI}}); + + // Stop scanning if we've seen all arguments. This will happen early in -O0 + // builds, which is useful, because -O0 builds have large entry blocks and + // many allocas. + if (ArgCopyElisionCandidates.size() == NumArgs) + break; + } +} + +/// Try to elide argument copies from memory into a local alloca. Succeeds if +/// ArgVal is a load from a suitable fixed stack object. 
+static void tryToElideArgumentCopy(
+    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
+    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+    SDValue ArgVal, bool &ArgHasUses) {
+  // Check if this is a load from a fixed stack object.
+  auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
+  if (!LNode)
+    return;
+  auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+  if (!FINode)
+    return;
+
+  // Check that the fixed stack object is the right size and alignment.
+  // Look at the alignment that the user wrote on the alloca instead of looking
+  // at the stack object.
+  auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+  assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+  const AllocaInst *AI = ArgCopyIter->second.first;
+  int FixedIndex = FINode->getIndex();
+  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
+  int OldIndex = AllocaIndex;
+  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
+  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+    DEBUG(dbgs() << "  argument copy elision failed due to bad fixed stack "
+                    "object size\n");
+    return;
+  }
+  unsigned RequiredAlignment = AI->getAlignment();
+  if (!RequiredAlignment) {
+    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
+        AI->getAllocatedType());
+  }
+  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+    DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca "
+                    "greater than stack argument alignment ("
+                 << RequiredAlignment << " vs "
+                 << MFI.getObjectAlignment(FixedIndex) << ")\n");
+    return;
+  }
+
+  // Perform the elision. Delete the old stack object and replace its only use
+  // in the variable info map. Mark the stack object as mutable.
+  DEBUG({
+    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+           << "  Replacing frame index " << OldIndex << " with " << FixedIndex
+           << '\n';
+  });
+  MFI.RemoveStackObject(OldIndex);
+  MFI.setIsImmutableObjectIndex(FixedIndex, false);
+  AllocaIndex = FixedIndex;
+  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+  Chains.push_back(ArgVal.getValue(1));
+
+  // Avoid emitting code for the store implementing the copy.
+  const StoreInst *SI = ArgCopyIter->second.second;
+  ElidedArgCopyInstrs.insert(SI);
+
+  // Check for uses of the argument again so that we can avoid exporting ArgVal
+  // if it isn't used by anything other than the store.
+  for (const Value *U : Arg.users()) {
+    if (U != SI) {
+      ArgHasUses = true;
+      break;
+    }
+  }
+}
+
 void SelectionDAGISel::LowerArguments(const Function &F) {
   SelectionDAG &DAG = SDB->DAG;
   SDLoc dl = SDB->getCurSDLoc();
@@ -8032,16 +8310,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     Ins.push_back(RetArg);
   }
+  // Look for stores of arguments to static allocas. Mark such arguments with a
+  // flag to ask the target to give us the memory location of that argument if
+  // available.
+  ArgCopyElisionMapTy ArgCopyElisionCandidates;
+  findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
+
   // Set up the incoming argument description vector.
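
A compact model of the findArgumentCopyElisionCandidates scan defined above, showing the per-alloca three-state lattice; the instruction shape and the clobber rules are deliberately simplified, and every name here is invented for illustration:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

enum class State { Unknown, Clobbered, Elidable };

struct Inst { bool IsStore; std::string Dst; std::string StoredArg; };

// Each alloca starts Unknown. A first store of a not-yet-seen argument
// promotes it to Elidable; non-store uses and stores of non-arguments or of
// already-seen arguments demote it to Clobbered in this simplified model.
std::map<std::string, State> scanEntryBlock(const std::vector<Inst> &Block) {
  std::map<std::string, State> Allocas;
  std::map<std::string, bool> ArgSeen;
  for (const Inst &I : Block) {
    State &S = Allocas.insert({I.Dst, State::Unknown}).first->second;
    if (!I.IsStore || I.StoredArg.empty() || ArgSeen[I.StoredArg]) {
      S = State::Clobbered;          // escape or unanalyzable write
      continue;
    }
    if (S == State::Unknown) {       // candidate: reuse the incoming stack
      S = State::Elidable;           // slot and drop the copying store
      ArgSeen[I.StoredArg] = true;
    }
  }
  return Allocas;
}

int main() {
  auto R = scanEntryBlock({{true, "%a.addr", "%a"},    // store %a: elidable
                           {false, "%b.addr", ""}});   // escapes: clobbered
  std::printf("%d %d\n", (int)R["%a.addr"], (int)R["%b.addr"]);  // 2 1
  return 0;
}
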
- unsigned Idx = 1; - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I, ++Idx) { + for (const Argument &Arg : F.args()) { + unsigned ArgNo = Arg.getArgNo(); SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs); - bool isArgValueUsed = !I->use_empty(); + ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs); + bool isArgValueUsed = !Arg.use_empty(); unsigned PartBase = 0; - Type *FinalType = I->getType(); - if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) + Type *FinalType = Arg.getType(); + if (Arg.hasAttribute(Attribute::ByVal)) FinalType = cast<PointerType>(FinalType)->getElementType(); bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( FinalType, F.getCallingConv(), F.isVarArg()); @@ -8052,15 +8335,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ISD::ArgFlagsTy Flags; unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) + if (Arg.hasAttribute(Attribute::ZExt)) Flags.setZExt(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) + if (Arg.hasAttribute(Attribute::SExt)) Flags.setSExt(); - if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) { + if (Arg.hasAttribute(Attribute::InReg)) { // If we are using vectorcall calling convention, a structure that is // passed InReg - is surely an HVA if (F.getCallingConv() == CallingConv::X86_VectorCall && - isa<StructType>(I->getType())) { + isa<StructType>(Arg.getType())) { // The first value of a structure is marked if (0 == Value) Flags.setHvaStart(); @@ -8069,15 +8352,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Set InReg Flag Flags.setInReg(); } - if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet)) + if (Arg.hasAttribute(Attribute::StructRet)) Flags.setSRet(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf)) + if (Arg.hasAttribute(Attribute::SwiftSelf)) Flags.setSwiftSelf(); - if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftError)) + if (Arg.hasAttribute(Attribute::SwiftError)) Flags.setSwiftError(); - if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) + if (Arg.hasAttribute(Attribute::ByVal)) Flags.setByVal(); - if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) { + if (Arg.hasAttribute(Attribute::InAlloca)) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -8088,33 +8371,35 @@ void SelectionDAGISel::LowerArguments(const Function &F) { } if (F.getCallingConv() == CallingConv::X86_INTR) { // IA Interrupt passes frame (1st parameter) by value in the stack. - if (Idx == 1) + if (ArgNo == 0) Flags.setByVal(); } if (Flags.isByVal() || Flags.isInAlloca()) { - PointerType *Ty = cast<PointerType>(I->getType()); + PointerType *Ty = cast<PointerType>(Arg.getType()); Type *ElementTy = Ty->getElementType(); Flags.setByValSize(DL.getTypeAllocSize(ElementTy)); // For ByVal, alignment should be passed from FE. BE will guess if // this info is not there but there are cases it cannot get right. 
unsigned FrameAlign;
-      if (F.getParamAlignment(Idx))
-        FrameAlign = F.getParamAlignment(Idx);
+      if (Arg.getParamAlignment())
+        FrameAlign = Arg.getParamAlignment();
       else
         FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
       Flags.setByValAlign(FrameAlign);
     }
-    if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
+    if (Arg.hasAttribute(Attribute::Nest))
       Flags.setNest();
     if (NeedsRegBlock)
       Flags.setInConsecutiveRegs();
     Flags.setOrigAlign(OriginalAlignment);
+    if (ArgCopyElisionCandidates.count(&Arg))
+      Flags.setCopyElisionCandidate();
     MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
     unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
     for (unsigned i = 0; i != NumRegs; ++i) {
       ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
-                            Idx-1, PartBase+i*RegisterVT.getStoreSize());
+                            ArgNo, PartBase+i*RegisterVT.getStoreSize());
       if (NumRegs > 1 && i == 0)
         MyFlags.Flags.setSplit();
       // if it isn't first piece, alignment must be 1
@@ -8155,7 +8440,6 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
   // Set up the argument values.
   unsigned i = 0;
-  Idx = 1;
   if (!FuncInfo->CanLowerReturn) {
     // Create a virtual register for the sret pointer, and put in a copy
     // from the sret argument into it.
@@ -8177,29 +8461,41 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     DAG.setRoot(NewRoot);
     // i indexes lowered arguments. Bump it past the hidden sret argument.
-    // Idx indexes LLVM arguments. Don't touch it.
     ++i;
   }
-  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
-       ++I, ++Idx) {
+  SmallVector<SDValue, 4> Chains;
+  DenseMap<int, int> ArgCopyElisionFrameIndexMap;
+  for (const Argument &Arg : F.args()) {
     SmallVector<SDValue, 4> ArgValues;
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
+    if (NumValues == 0)
+      continue;
+
+    bool ArgHasUses = !Arg.use_empty();
+
+    // Elide the copying store if the target loaded this argument from a
+    // suitable fixed stack object.
+    if (Ins[i].Flags.isCopyElisionCandidate()) {
+      tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+                             ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+                             InVals[i], ArgHasUses);
+    }
     // If this argument is unused then remember its value. It is used to generate
     // debugging information.
     bool isSwiftErrorArg = TLI->supportSwiftError() &&
-        F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
-    if (I->use_empty() && NumValues && !isSwiftErrorArg) {
-      SDB->setUnusedArgValue(&*I, InVals[i]);
+        Arg.hasAttribute(Attribute::SwiftError);
+    if (!ArgHasUses && !isSwiftErrorArg) {
+      SDB->setUnusedArgValue(&Arg, InVals[i]);
       // Also remember any frame index for use in FastISel.
       if (FrameIndexSDNode *FI =
           dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
-        FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+        FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
     }
     for (unsigned Val = 0; Val != NumValues; ++Val) {
@@ -8210,16 +8506,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       // Even an apparent 'unused' swifterror argument needs to be returned. So
      // we do generate a copy for it that can be used on return from the
      // function.
- if (!I->use_empty() || isSwiftErrorArg) { + if (ArgHasUses || isSwiftErrorArg) { Optional<ISD::NodeType> AssertOp; - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) + if (Arg.hasAttribute(Attribute::SExt)) AssertOp = ISD::AssertSext; - else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) + else if (Arg.hasAttribute(Attribute::ZExt)) AssertOp = ISD::AssertZext; - ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], - NumParts, PartVT, VT, - nullptr, AssertOp)); + ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, + PartVT, VT, nullptr, AssertOp)); } i += NumParts; @@ -8232,18 +8527,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); - SDB->setValue(&*I, Res); + SDB->setValue(&Arg, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) - FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } // Update the SwiftErrorVRegDefMap. @@ -8263,18 +8558,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // uses with vregs. unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - FuncInfo->ValueMap[&*I] = Reg; + FuncInfo->ValueMap[&Arg] = Reg; continue; } } - if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { - FuncInfo->InitializeRegForValue(&*I); - SDB->CopyToExportRegsIfNeeded(&*I); + if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) { + FuncInfo->InitializeRegForValue(&Arg); + SDB->CopyToExportRegsIfNeeded(&Arg); } } + if (!Chains.empty()) { + Chains.push_back(NewRoot); + NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + } + + DAG.setRoot(NewRoot); + assert(i == InVals.size() && "Argument register count mismatch!"); + // If any argument copy elisions occurred and we have debug info, update the + // stale frame indices used in the dbg.declare variable info table. + MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); + if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { + for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { + auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); + if (I != ArgCopyElisionFrameIndexMap.end()) + VI.Slot = I->second; + } + } + // Finally, if the target has anything special to do, allow it to do so. 
EmitFunctionEntryCode(); } @@ -8402,13 +8715,10 @@ void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) { HasTailCall = true; } -bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters, - const SmallVectorImpl<unsigned> &TotalCases, - unsigned First, unsigned Last, - unsigned Density) const { +uint64_t +SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters, + unsigned First, unsigned Last) const { assert(Last >= First); - assert(TotalCases[Last] >= TotalCases[First]); - const APInt &LowCase = Clusters[First].Low->getValue(); const APInt &HighCase = Clusters[Last].High->getValue(); assert(LowCase.getBitWidth() == HighCase.getBitWidth()); @@ -8417,26 +8727,17 @@ bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters, // comparison to lower. We should discriminate against such consecutive ranges // in jump tables. - uint64_t Diff = (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100); - uint64_t Range = Diff + 1; + return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1; +} +uint64_t SelectionDAGBuilder::getJumpTableNumCases( + const SmallVectorImpl<unsigned> &TotalCases, unsigned First, + unsigned Last) const { + assert(Last >= First); + assert(TotalCases[Last] >= TotalCases[First]); uint64_t NumCases = TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]); - - assert(NumCases < UINT64_MAX / 100); - assert(Range >= NumCases); - - return NumCases * 100 >= Range * Density; -} - -static inline bool areJTsAllowed(const TargetLowering &TLI, - const SwitchInst *SI) { - const Function *Fn = SI->getParent()->getParent(); - if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true") - return false; - - return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || - TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other); + return NumCases; } bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, @@ -8475,10 +8776,11 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumDests = JTProbs.size(); - if (isSuitableForBitTests(NumDests, NumCmps, - Clusters[First].Low->getValue(), - Clusters[Last].High->getValue())) { + if (TLI.isSuitableForBitTests( + NumDests, NumCmps, Clusters[First].Low->getValue(), + Clusters[Last].High->getValue(), DAG.getDataLayout())) { // Clusters[First..Last] should be lowered as bit tests instead. return false; } @@ -8499,7 +8801,6 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters, } JumpTableMBB->normalizeSuccProbs(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding()) ->createJumpTableIndex(Table); @@ -8528,17 +8829,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters, #endif const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!areJTsAllowed(TLI, SI)) + if (!TLI.areJTsAllowed(SI->getParent()->getParent())) return; - const bool OptForSize = DefaultMBB->getParent()->getFunction()->optForSize(); - const int64_t N = Clusters.size(); const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries(); const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2; - const unsigned MaxJumpTableSize = - OptForSize || TLI.getMaximumJumpTableSize() == 0 - ? 
UINT_MAX : TLI.getMaximumJumpTableSize();
 
   if (N < 2 || N < MinJumpTableEntries)
     return;
 
@@ -8553,15 +8849,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
       TotalCases[i] += TotalCases[i - 1];
   }
 
-  const unsigned MinDensity =
-    OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
-
   // Cheap case: the whole range may be suitable for jump table.
-  unsigned JumpTableSize = (Clusters[N - 1].High->getValue() -
-                            Clusters[0].Low->getValue())
-                           .getLimitedValue(UINT_MAX - 1) + 1;
-  if (JumpTableSize <= MaxJumpTableSize &&
-      isDense(Clusters, TotalCases, 0, N - 1, MinDensity)) {
+  uint64_t Range = getJumpTableRange(Clusters, 0, N - 1);
+  uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1);
+  assert(NumCases < UINT64_MAX / 100);
+  assert(Range >= NumCases);
+  if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
     CaseCluster JTCluster;
     if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
       Clusters[0] = JTCluster;
@@ -8614,11 +8907,11 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
     // Search for a solution that results in fewer partitions.
     for (int64_t j = N - 1; j > i; j--) {
       // Try building a partition from Clusters[i..j].
-      JumpTableSize = (Clusters[j].High->getValue() -
-                       Clusters[i].Low->getValue())
-                      .getLimitedValue(UINT_MAX - 1) + 1;
-      if (JumpTableSize <= MaxJumpTableSize &&
-          isDense(Clusters, TotalCases, i, j, MinDensity)) {
+      uint64_t Range = getJumpTableRange(Clusters, i, j);
+      uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j);
+      assert(NumCases < UINT64_MAX / 100);
+      assert(Range >= NumCases);
+      if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
         unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
         unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
         int64_t NumEntries = j - i + 1;
@@ -8662,36 +8955,6 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
   Clusters.resize(DstIndex);
 }
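The density heuristic that isDense() applied inline survives behind the new TLI.isSuitableForJumpTable() hook, together with the opt-for-size and maximum-table-size limits that used to be computed here. A minimal standalone sketch of just the density test that the asserts above guard; the function name and the threshold parameter are illustrative, not the actual TargetLowering implementation:

#include <cassert>
#include <cstdint>

// True when the covered cases fill at least MinDensityPercent of the value
// range the jump table would have to span.
static bool denseEnoughForJumpTable(uint64_t NumCases, uint64_t Range,
                                    unsigned MinDensityPercent) {
  assert(NumCases < UINT64_MAX / 100 && "scaled compare would overflow");
  assert(Range >= NumCases && "range must cover all cases");
  // Equivalent to (NumCases / Range) * 100 >= MinDensityPercent, written
  // without division so sparse 64-bit ranges cannot round down to zero.
  return NumCases * 100 >= Range * MinDensityPercent;
}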
-bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) {
-  // FIXME: Using the pointer type doesn't seem ideal.
-  uint64_t BW = DAG.getDataLayout().getPointerSizeInBits();
-  uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
-  return Range <= BW;
-}
-
-bool SelectionDAGBuilder::isSuitableForBitTests(unsigned NumDests,
-                                                unsigned NumCmps,
-                                                const APInt &Low,
-                                                const APInt &High) {
-  // FIXME: I don't think NumCmps is the correct metric: a single case and a
-  // range of cases both require only one branch to lower. Just looking at the
-  // number of clusters and destinations should be enough to decide whether to
-  // build bit tests.
-
-  // To lower a range with bit tests, the range must fit the bitwidth of a
-  // machine word.
-  if (!rangeFitsInWord(Low, High))
-    return false;
-
-  // Decide whether it's profitable to lower this range with bit tests. Each
-  // destination requires a bit test and branch, and there is an overall range
-  // check branch. For a small number of clusters, separate comparisons might be
-  // cheaper, and for many destinations, splitting the range might be better.
-  return (NumDests == 1 && NumCmps >= 3) ||
-         (NumDests == 2 && NumCmps >= 5) ||
-         (NumDests == 3 && NumCmps >= 6);
-}
-
 bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
                                         unsigned First, unsigned Last,
                                         const SwitchInst *SI,
@@ -8713,16 +8976,17 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
   APInt High = Clusters[Last].High->getValue();
   assert(Low.slt(High));
 
-  if (!isSuitableForBitTests(NumDests, NumCmps, Low, High))
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const DataLayout &DL = DAG.getDataLayout();
+  if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL))
     return false;
 
   APInt LowBound;
   APInt CmpRange;
 
-  const int BitWidth = DAG.getTargetLoweringInfo()
-                           .getPointerTy(DAG.getDataLayout())
-                           .getSizeInBits();
-  assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!");
+  const int BitWidth = TLI.getPointerTy(DL).getSizeInBits();
+  assert(TLI.rangeFitsInWord(Low, High, DL) &&
+         "Case range must fit in bit mask!");
 
   // Check if the clusters cover a contiguous range such that no value in the
   // range will jump to the default statement.
@@ -8812,7 +9076,9 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
 
   // If target does not have legal shift left, do not emit bit tests at all.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
+  const DataLayout &DL = DAG.getDataLayout();
+
+  EVT PTy = TLI.getPointerTy(DL);
   if (!TLI.isOperationLegal(ISD::SHL, PTy))
     return;
 
@@ -8843,8 +9109,8 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
       // Try building a partition from Clusters[i..j].
 
       // Check the range.
-      if (!rangeFitsInWord(Clusters[i].Low->getValue(),
-                           Clusters[j].High->getValue()))
+      if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(),
+                               Clusters[j].High->getValue(), DL))
         continue;
 
       // Check nbr of destinations and cluster types.
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index abde8a89befc..77e131fa551c 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -304,10 +304,13 @@ private:
     BranchProbability DefaultProb;
   };
 
-  /// Check whether a range of clusters is dense enough for a jump table.
-  bool isDense(const CaseClusterVector &Clusters,
-               const SmallVectorImpl<unsigned> &TotalCases,
-               unsigned First, unsigned Last, unsigned MinDensity) const;
+  /// Return the range of values in [First..Last].
+  uint64_t getJumpTableRange(const CaseClusterVector &Clusters, unsigned First,
+                             unsigned Last) const;
+
+  /// Return the number of cases in [First..Last].
+  uint64_t getJumpTableNumCases(const SmallVectorImpl<unsigned> &TotalCases,
+                                unsigned First, unsigned Last) const;
 
   /// Build a jump table cluster from Clusters[First..Last]. Returns false if it
   /// decides it's not a good idea.
@@ -319,14 +322,6 @@ private:
   void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
                       MachineBasicBlock *DefaultMBB);
 
-  /// Check whether the range [Low,High] fits in a machine word.
-  bool rangeFitsInWord(const APInt &Low, const APInt &High);
-
-  /// Check whether these clusters are suitable for lowering with bit tests based
-  /// on the number of destinations, comparison metric, and range.
- bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, - const APInt &Low, const APInt &High); - /// Build a bit test cluster from Clusters[First..Last]. Returns false if it /// decides it's not a good idea. bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last, @@ -609,40 +604,34 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), + DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo), HasTailCall(false) { } - void init(GCFunctionInfo *gfi, AliasAnalysis &aa, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, const TargetLibraryInfo *li); - /// clear - Clear out the current SelectionDAG and the associated - /// state and prepare this SelectionDAGBuilder object to be used - /// for a new block. This doesn't clear out information about - /// additional blocks that are needed to complete switch lowering - /// or PHI node updating; that information is cleared out as it is - /// consumed. + /// Clear out the current SelectionDAG and the associated state and prepare + /// this SelectionDAGBuilder object to be used for a new block. This doesn't + /// clear out information about additional blocks that are needed to complete + /// switch lowering or PHI node updating; that information is cleared out as + /// it is consumed. void clear(); - /// clearDanglingDebugInfo - Clear the dangling debug information - /// map. This function is separated from the clear so that debug - /// information that is dangling in a basic block can be properly - /// resolved in a different basic block. This allows the - /// SelectionDAG to resolve dangling debug information attached - /// to PHI nodes. + /// Clear the dangling debug information map. This function is separated from + /// the clear so that debug information that is dangling in a basic block can + /// be properly resolved in a different basic block. This allows the + /// SelectionDAG to resolve dangling debug information attached to PHI nodes. void clearDanglingDebugInfo(); - /// getRoot - Return the current virtual root of the Selection DAG, - /// flushing any PendingLoad items. This must be done before emitting - /// a store or any other node that may need to be ordered after any - /// prior load instructions. - /// + /// Return the current virtual root of the Selection DAG, flushing any + /// PendingLoad items. This must be done before emitting a store or any other + /// node that may need to be ordered after any prior load instructions. SDValue getRoot(); - /// getControlRoot - Similar to getRoot, but instead of flushing all the - /// PendingLoad items, flush all the PendingExports items. It is necessary - /// to do this before emitting a terminator instruction. - /// + /// Similar to getRoot, but instead of flushing all the PendingLoad items, + /// flush all the PendingExports items. It is necessary to do this before + /// emitting a terminator instruction. 
SDValue getControlRoot(); SDLoc getCurSDLoc() const { @@ -688,12 +677,13 @@ public: MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, Instruction::BinaryOps Opc, BranchProbability TW, - BranchProbability FW); + BranchProbability FW, bool InvertCond); void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - BranchProbability TW, BranchProbability FW); + BranchProbability TW, BranchProbability FW, + bool InvertCond); bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases); bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB); void CopyToExportRegsIfNeeded(const Value *V); @@ -782,6 +772,11 @@ public: bool VarArgDisallowed, bool ForceVoidReturnTy); + /// Returns the type of FrameIndex and TargetFrameIndex nodes. + MVT getFrameIndexTy() { + return DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()); + } + private: // Terminator instructions. void visitRet(const ReturnInst &I); @@ -900,6 +895,7 @@ private: void visitInlineAsm(ImmutableCallSite CS); const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); + void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVAStart(const CallInst &I); void visitVAArg(const VAArgInst &I); @@ -913,6 +909,8 @@ private: void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const GCResultInst &I); + void visitVectorReduce(const CallInst &I, unsigned Intrinsic); + void visitUserOp1(const Instruction &I) { llvm_unreachable("UserOp1 should not exist at instruction selection time!"); } @@ -932,7 +930,7 @@ private: /// instruction selection, they will be inserted to the entry BB. bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable, DIExpression *Expr, DILocation *DL, - int64_t Offset, bool IsIndirect, + int64_t Offset, bool IsDbgDeclare, const SDValue &N); /// Return the next block after MBB, or nullptr if there is none. @@ -944,8 +942,8 @@ private: /// Return the appropriate SDDbgValue based on N. SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable, - DIExpression *Expr, int64_t Offset, DebugLoc dl, - unsigned DbgSDNodeOrder); + DIExpression *Expr, int64_t Offset, + const DebugLoc &dl, unsigned DbgSDNodeOrder); }; /// RegsForValue - This struct represents the registers (physical or virtual) @@ -958,26 +956,23 @@ private: /// type. /// struct RegsForValue { - /// ValueVTs - The value types of the values, which may not be legal, and + /// The value types of the values, which may not be legal, and /// may need be promoted or synthesized from one or more registers. - /// SmallVector<EVT, 4> ValueVTs; - /// RegVTs - The value types of the registers. This is the same size as - /// ValueVTs and it records, for each value, what the type of the assigned - /// register or registers are. (Individual values are never synthesized - /// from more than one type of register.) + /// The value types of the registers. This is the same size as ValueVTs and it + /// records, for each value, what the type of the assigned register or + /// registers are. (Individual values are never synthesized from more than one + /// type of register.) /// /// With virtual registers, the contents of RegVTs is redundant with TLI's /// getRegisterType member function, however when with physical registers /// it is necessary to have a separate record of the types. 
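As an illustration of how the three parallel vectors documented here line up, consider a value of IR type {i64, i32} on a 32-bit target where only i32 is legal; the register numbers below are made up:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

SmallVector<EVT, 4>      ValueVTs = {EVT(MVT::i64), EVT(MVT::i32)}; // may be illegal
SmallVector<MVT, 4>      RegVTs   = {MVT::i32, MVT::i32};           // one per value
SmallVector<unsigned, 4> Regs     = {100, 101, 102}; // i64 expands to two registers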
- /// SmallVector<MVT, 4> RegVTs; - /// Regs - This list holds the registers assigned to the values. + /// This list holds the registers assigned to the values. /// Each legal or promoted value requires one register, and each /// expanded value requires multiple registers. - /// SmallVector<unsigned, 4> Regs; RegsForValue(); @@ -987,33 +982,33 @@ struct RegsForValue { RegsForValue(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL, unsigned Reg, Type *Ty); - /// append - Add the specified values to this one. + /// Add the specified values to this one. void append(const RegsForValue &RHS) { ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); Regs.append(RHS.Regs.begin(), RHS.Regs.end()); } - /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from - /// this value and returns the result as a ValueVTs value. This uses - /// Chain/Flag as the input and updates them for the output Chain/Flag. - /// If the Flag pointer is NULL, no flag is used. + /// Emit a series of CopyFromReg nodes that copies from this value and returns + /// the result as a ValueVTs value. This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no + /// flag is used. SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr) const; - /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified - /// value into the registers specified by this object. This uses Chain/Flag - /// as the input and updates them for the output Chain/Flag. If the Flag - /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used - /// in printing better diagnostic messages on error. + /// Emit a series of CopyToReg nodes that copies the specified value into the + /// registers specified by this object. This uses Chain/Flag as the input and + /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no + /// flag is used. If V is not nullptr, then it is used in printing better + /// diagnostic messages on error. void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, SDValue *Flag, const Value *V = nullptr, ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; - /// AddInlineAsmOperands - Add this value to the specified inlineasm node - /// operand list. This adds the code marker, matching input operand index - /// (if applicable), and includes the number of values added into it. + /// Add this value to the specified inlineasm node operand list. This adds the + /// code marker, matching input operand index (if applicable), and includes + /// the number of values added into it. 
void AddInlineAsmOperands(unsigned Kind, bool HasMatching, unsigned MatchingIdx, const SDLoc &dl, SelectionDAG &DAG, std::vector<SDValue> &Ops) const; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0faaad8a21b7..c37d7080f2c5 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -227,6 +227,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CARRY_FALSE: return "carry_false"; case ISD::ADDC: return "addc"; case ISD::ADDE: return "adde"; + case ISD::ADDCARRY: return "addcarry"; case ISD::SADDO: return "saddo"; case ISD::UADDO: return "uaddo"; case ISD::SSUBO: return "ssubo"; @@ -235,6 +236,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UMULO: return "umulo"; case ISD::SUBC: return "subc"; case ISD::SUBE: return "sube"; + case ISD::SUBCARRY: return "subcarry"; case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; @@ -300,6 +302,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation + case ISD::ABS: return "abs"; case ISD::BITREVERSE: return "bitreverse"; case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; @@ -343,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + case ISD::VECREDUCE_FADD: return "vecreduce_fadd"; + case ISD::VECREDUCE_FMUL: return "vecreduce_fmul"; + case ISD::VECREDUCE_ADD: return "vecreduce_add"; + case ISD::VECREDUCE_MUL: return "vecreduce_mul"; + case ISD::VECREDUCE_AND: return "vecreduce_and"; + case ISD::VECREDUCE_OR: return "vecreduce_or"; + case ISD::VECREDUCE_XOR: return "vecreduce_xor"; + case ISD::VECREDUCE_SMAX: return "vecreduce_smax"; + case ISD::VECREDUCE_SMIN: return "vecreduce_smin"; + case ISD::VECREDUCE_UMAX: return "vecreduce_umax"; + case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; + case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; + case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; } } @@ -366,11 +382,13 @@ static Printable PrintNodeId(const SDNode &Node) { }); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); } -void SDNode::dump(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); dbgs() << '\n'; } +#endif void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { for (unsigned i = 0, e = getNumValues(); i != e; ++i) { @@ -416,7 +434,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { OS << '<' << CSDN->getValueAPF().convertToDouble() << '>'; else { OS << "<APFloat("; - CSDN->getValueAPF().bitcastToAPInt().dump(); + CSDN->getValueAPF().bitcastToAPInt().print(OS, false); OS << ")>"; } } else if (const GlobalAddressSDNode *GADN = @@ -566,6 +584,7 @@ static bool shouldPrintInline(const SDNode &Node) { return Node.getNumOperands() == 0; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { for (const SDValue &Op : N->op_values()) { if (shouldPrintInline(*Op.getNode())) @@ -592,6 +611,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const { if 
(getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); dbgs() << "\n\n"; } +#endif void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { OS << PrintNodeId(*this) << ": "; @@ -618,6 +638,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G, } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -646,15 +667,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, DumpNodesr(OS, Op.getNode(), indent+2, G, once); } -void SDNode::dumpr() const { +LLVM_DUMP_METHOD void SDNode::dumpr() const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, nullptr, once); } -void SDNode::dumpr(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const { VisitedSDNodeSet once; DumpNodesr(dbgs(), this, 0, G, once); } +#endif static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N, const SelectionDAG *G, unsigned depth, @@ -688,14 +710,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const { printrWithDepth(OS, G, 10); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const { printrWithDepth(dbgs(), G, depth); } -void SDNode::dumprFull(const SelectionDAG *G) const { +LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const { // Don't print impossibly deep things. dumprWithDepth(G, 10); } +#endif void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { printr(OS, G); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 64e6c221229b..687b882c5e4d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -11,43 +11,70 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/SelectionDAG.h" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" -#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/MachineValueType.h" #include 
"llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/StackProtector.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -59,6 +86,13 @@ #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <memory> +#include <string> +#include <utility> +#include <vector> using namespace llvm; @@ -73,104 +107,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered"); STATISTIC(NumFastIselFailLowerArguments, "Number of entry blocks where fast isel failed to lower arguments"); -#ifndef NDEBUG -static cl::opt<bool> -EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden, - cl::desc("Enable extra verbose messages in the \"fast\" " - "instruction selector")); - - // Terminators -STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret"); -STATISTIC(NumFastIselFailBr,"Fast isel fails on Br"); -STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch"); -STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr"); -STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke"); -STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume"); -STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable"); - - // Standard binary operators... -STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add"); -STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd"); -STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub"); -STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub"); -STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul"); -STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul"); -STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv"); -STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv"); -STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv"); -STATISTIC(NumFastIselFailURem,"Fast isel fails on URem"); -STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem"); -STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem"); - - // Logical operators... -STATISTIC(NumFastIselFailAnd,"Fast isel fails on And"); -STATISTIC(NumFastIselFailOr,"Fast isel fails on Or"); -STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor"); - - // Memory instructions... 
-STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca"); -STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load"); -STATISTIC(NumFastIselFailStore,"Fast isel fails on Store"); -STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg"); -STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM"); -STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence"); -STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr"); - - // Convert instructions... -STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc"); -STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt"); -STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt"); -STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc"); -STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt"); -STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI"); -STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI"); -STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP"); -STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP"); -STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr"); -STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt"); -STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast"); - - // Other instructions... -STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp"); -STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp"); -STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI"); -STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select"); -STATISTIC(NumFastIselFailCall,"Fast isel fails on Call"); -STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl"); -STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr"); -STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr"); -STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg"); -STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement"); -STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement"); -STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector"); -STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue"); -STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue"); -STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad"); - -// Intrinsic instructions... 
-STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call"); -STATISTIC(NumFastIselFailSAddWithOverflow, - "Fast isel fails on sadd.with.overflow"); -STATISTIC(NumFastIselFailUAddWithOverflow, - "Fast isel fails on uadd.with.overflow"); -STATISTIC(NumFastIselFailSSubWithOverflow, - "Fast isel fails on ssub.with.overflow"); -STATISTIC(NumFastIselFailUSubWithOverflow, - "Fast isel fails on usub.with.overflow"); -STATISTIC(NumFastIselFailSMulWithOverflow, - "Fast isel fails on smul.with.overflow"); -STATISTIC(NumFastIselFailUMulWithOverflow, - "Fast isel fails on umul.with.overflow"); -STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress"); -STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call"); -STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call"); -STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call"); -#endif - -static cl::opt<bool> -EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, - cl::desc("Enable verbose messages in the \"fast\" " - "instruction selector")); static cl::opt<int> EnableFastISelAbort( "fast-isel-abort", cl::Hidden, cl::desc("Enable abort calls when \"fast\" instruction selection " @@ -179,6 +115,11 @@ static cl::opt<int> EnableFastISelAbort( "abort for argument lowering, and 3 will never fallback " "to SelectionDAG.")); +static cl::opt<bool> EnableFastISelFallbackReport( + "fast-isel-report-on-fallback", cl::Hidden, + cl::desc("Emit a diagnostic when \"fast\" instruction selection " + "falls back to SelectionDAG.")); + static cl::opt<bool> UseMBPI("use-mbpi", cl::desc("use Machine Branch Probability Info"), @@ -238,7 +179,7 @@ MachinePassRegistry RegisterScheduler::Registry; /// //===---------------------------------------------------------------------===// static cl::opt<RegisterScheduler::FunctionPassCtor, false, - RegisterPassParser<RegisterScheduler> > + RegisterPassParser<RegisterScheduler>> ISHeuristic("pre-RA-sched", cl::init(&createDefaultScheduler), cl::Hidden, cl::desc("Instruction schedulers available (before register" @@ -249,6 +190,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target", createDefaultScheduler); namespace llvm { + //===--------------------------------------------------------------------===// /// \brief This class is used by SelectionDAGISel to temporarily override /// the optimization level on a per-function basis. @@ -318,6 +260,7 @@ namespace llvm { "Unknown sched type!"); return createILPListDAGScheduler(IS, OptLevel); } + } // end namespace llvm // EmitInstrWithCustomInserter - This method should be implemented by targets @@ -357,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, FuncInfo(new FunctionLoweringInfo()), CurDAG(new SelectionDAG(tm, OL)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), - GFI(), + AA(), GFI(), OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); @@ -375,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AAResultsWrapperPass>(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); AU.addRequired<StackProtector>(); AU.addPreserved<StackProtector>(); @@ -431,8 +375,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MachineFunctionProperties::Property::Selected)) return false; // Do some sanity-checking on the command-line options. 
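The new -fast-isel-report-on-fallback option only raises a DiagnosticInfoISelFallback through the LLVMContext (see the diagnose() call added to runOnMachineFunction below), so a client observes it with an ordinary diagnostic handler. A minimal sketch against the context API of this vintage; the handler name is an assumption:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

static void onISelDiagnostic(const llvm::DiagnosticInfo &DI, void *) {
  if (DI.getKind() == llvm::DK_ISelFallback)
    llvm::errs() << "a function fell back from FastISel to SelectionDAG\n";
}

// Registration, e.g. right after creating the context:
//   Ctx.setDiagnosticHandler(onISelDiagnostic);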
-  assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
-         "-fast-isel-verbose requires -fast-isel");
   assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
          "-fast-isel-abort > 0 requires -fast-isel");
 
@@ -454,23 +396,33 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   TII = MF->getSubtarget().getInstrInfo();
   TLI = MF->getSubtarget().getTargetLowering();
   RegInfo = &MF->getRegInfo();
-  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+  ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
 
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn));
 
-  CurDAG->init(*MF);
+  CurDAG->init(*MF, *ORE);
   FuncInfo->set(Fn, *MF, CurDAG);
 
+  // Now get the optional analyses if we want to.
+  // This is based on the possibly changed OptLevel (after optnone is taken
+  // into account). That's unfortunate but OK because it just means we won't
+  // ask for passes that have been required anyway.
+
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
   else
     FuncInfo->BPI = nullptr;
 
-  SDB->init(GFI, *AA, LibInfo);
+  if (OptLevel != CodeGenOpt::None)
+    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  else
+    AA = nullptr;
+
+  SDB->init(GFI, AA, LibInfo);
 
   MF->setHasInlineAsm(false);
 
@@ -502,6 +454,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     TLI->initializeSplitCSR(EntryMBB);
 
   SelectAllBasicBlocks(Fn);
+  if (FastISelFailed && EnableFastISelFallbackReport) {
+    DiagnosticInfoISelFallback DiagFallback(Fn);
+    Fn.getContext().diagnose(DiagFallback);
+  }
 
   // If the first basic block in the function has live ins that need to be
   // copied into vregs, emit the copies into the top of the block before
@@ -628,7 +584,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       unsigned To = I->second;
       // If To is also scheduled to be replaced, find what its ultimate
       // replacement is.
-      for (;;) {
+      while (true) {
        DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
         if (J == E) break;
         To = J->second;
@@ -648,13 +604,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     MRI.replaceRegWith(From, To);
   }
 
-  if (TLI->hasCopyImplyingStackAdjustment(MF))
-    MFI.setHasCopyImplyingStackAdjustment(true);
-
-  // Freeze the set of reserved registers now that MachineFrameInfo has been
-  // set up. All the information required by getReservedRegs() should be
-  // available now.
-  MRI.freezeReservedRegs(*MF);
+  TLI->finalizeLowering(*MF);
 
   // Release function-specific state. SDB and CurDAG are already cleared
   // at this point.
@@ -666,13 +616,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   return true;
 }
 
+static void reportFastISelFailure(MachineFunction &MF,
+                                  OptimizationRemarkEmitter &ORE,
+                                  OptimizationRemarkMissed &R,
+                                  bool ShouldAbort) {
+  // Print the function name explicitly if we don't have a debug location (which
+  // makes the diagnostic less useful) or if we're going to emit a raw error.
+  if (!R.getLocation().isValid() || ShouldAbort)
+    R << (" (in function: " + MF.getName() + ")").str();
+
+  if (ShouldAbort)
+    report_fatal_error(R.getMsg());
+
+  ORE.emit(R);
+}
+
 void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
                                         BasicBlock::const_iterator End,
                                         bool &HadTailCall) {
   // Lower the instructions. If a call is emitted as a tail call, cease emitting
  // nodes for this block.
-  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
-    SDB->visit(*I);
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
+    if (!ElidedArgCopyInstrs.count(&*I))
+      SDB->visit(*I);
+  }
 
   // Make sure the root of the DAG is up-to-date.
   CurDAG->setRoot(SDB->getControlRoot());
@@ -689,8 +656,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
 
   Worklist.push_back(CurDAG->getRoot().getNode());
 
-  APInt KnownZero;
-  APInt KnownOne;
+  KnownBits Known;
 
   do {
     SDNode *N = Worklist.pop_back_val();
@@ -719,8 +685,8 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
       continue;
 
     unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
-    CurDAG->computeKnownBits(Src, KnownZero, KnownOne);
-    FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne);
+    CurDAG->computeKnownBits(Src, Known);
+    FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, Known);
   } while (!Worklist.empty());
 }
 
@@ -731,6 +697,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   int BlockNumber = -1;
   (void)BlockNumber;
   bool MatchFilterBB = false; (void)MatchFilterBB;
+
+  // Pre-type legalization allows creation of any node types.
+  CurDAG->NewNodesMustHaveLegalTypes = false;
+
 #ifndef NDEBUG
   MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
                    FilterDAGBasicBlockName ==
@@ -756,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   {
     NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
                        GroupDescription, TimePassesIsEnabled);
-    CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+    CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
   }
 
   DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
@@ -777,6 +747,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
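The KnownBits conversion in ComputeLiveOutVRegInfo above (and in CheckOrMask further down) replaces the parallel KnownZero/KnownOne APInts with one struct, and the subset test replaces the old mask-and-compare idiom. A small self-contained illustration with made-up bit values:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using llvm::APInt;
using llvm::KnownBits;

void knownBitsIdiom() {
  KnownBits Known(8);          // 8-bit value, nothing known yet
  Known.One  = APInt(8, 0x01); // bit 0 proven one
  Known.Zero = APInt(8, 0xF0); // top nibble proven zero
  APInt NeededMask(8, 0x01);
  // Old idiom: (NeededMask & KnownOne) == NeededMask
  // New idiom:
  bool AllNeededKnownOne = NeededMask.isSubsetOf(Known.One); // true here
  (void)AllNeededKnownOne;
}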
+  // Only allow creation of legal node types.
   CurDAG->NewNodesMustHaveLegalTypes = true;
 
   if (Changed) {
@@ -787,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
     {
       NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
                          GroupName, GroupDescription, TimePassesIsEnabled);
-      CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+      CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
     }
 
     DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
@@ -802,12 +773,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   }
 
   if (Changed) {
+    DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber
+          << " '" << BlockName << "'\n"; CurDAG->dump());
+
     {
       NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
                          GroupDescription, TimePassesIsEnabled);
       CurDAG->LegalizeTypes();
     }
 
+    DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber
+          << " '" << BlockName << "'\n"; CurDAG->dump());
+
     if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
@@ -815,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
     {
       NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
                          GroupName, GroupDescription, TimePassesIsEnabled);
-      CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+      CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
     }
 
     DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
@@ -841,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   {
     NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
                        GroupDescription, TimePassesIsEnabled);
-    CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+    CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
   }
 
   DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
@@ -907,10 +884,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 }
 
 namespace {
+
 /// ISelUpdater - helper class to handle updates of the instruction selection
 /// graph.
 class ISelUpdater : public SelectionDAG::DAGUpdateListener {
   SelectionDAG::allnodes_iterator &ISelPosition;
+
 public:
   ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
     : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
@@ -923,6 +902,7 @@ public:
     ++ISelPosition;
   }
 };
+
 } // end anonymous namespace
 
 void SelectionDAGISel::DoInstructionSelection() {
@@ -960,6 +940,19 @@ void SelectionDAGISel::DoInstructionSelection() {
       if (Node->use_empty())
         continue;
 
+      // When we are using non-default rounding modes or FP exception behavior,
+      // FP operations are represented by StrictFP pseudo-operations. They
+      // need to be simplified here so that the target-specific instruction
+      // selectors know how to handle them.
+      //
+      // If the current node is a strict FP pseudo-op (detected by
+      // isStrictFPOpcode()), mutateStrictFPToFP() rewrites it to the
+      // corresponding normal FP opcode before selection.
+      //
+      // FIXME: The backends need a way to handle FP constraints.
+      if (Node->isStrictFPOpcode())
+        Node = CurDAG->mutateStrictFPToFP(Node);
+
       Select(Node);
     }
 
@@ -1046,116 +1039,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
          !FuncInfo->isExportedInst(I); // Exported instrs must be computed.
 }
 
-#ifndef NDEBUG
-// Collect per Instruction statistics for fast-isel misses. Only those
-// instructions that cause the bail are accounted for. It does not account for
-// instructions higher in the block. Thus, summing the per instructions stats
-// will not add up to what is reported by NumFastIselFailures.
-static void collectFailStats(const Instruction *I) { - switch (I->getOpcode()) { - default: assert (0 && "<Invalid operator> "); - - // Terminators - case Instruction::Ret: NumFastIselFailRet++; return; - case Instruction::Br: NumFastIselFailBr++; return; - case Instruction::Switch: NumFastIselFailSwitch++; return; - case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return; - case Instruction::Invoke: NumFastIselFailInvoke++; return; - case Instruction::Resume: NumFastIselFailResume++; return; - case Instruction::Unreachable: NumFastIselFailUnreachable++; return; - - // Standard binary operators... - case Instruction::Add: NumFastIselFailAdd++; return; - case Instruction::FAdd: NumFastIselFailFAdd++; return; - case Instruction::Sub: NumFastIselFailSub++; return; - case Instruction::FSub: NumFastIselFailFSub++; return; - case Instruction::Mul: NumFastIselFailMul++; return; - case Instruction::FMul: NumFastIselFailFMul++; return; - case Instruction::UDiv: NumFastIselFailUDiv++; return; - case Instruction::SDiv: NumFastIselFailSDiv++; return; - case Instruction::FDiv: NumFastIselFailFDiv++; return; - case Instruction::URem: NumFastIselFailURem++; return; - case Instruction::SRem: NumFastIselFailSRem++; return; - case Instruction::FRem: NumFastIselFailFRem++; return; - - // Logical operators... - case Instruction::And: NumFastIselFailAnd++; return; - case Instruction::Or: NumFastIselFailOr++; return; - case Instruction::Xor: NumFastIselFailXor++; return; - - // Memory instructions... - case Instruction::Alloca: NumFastIselFailAlloca++; return; - case Instruction::Load: NumFastIselFailLoad++; return; - case Instruction::Store: NumFastIselFailStore++; return; - case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return; - case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return; - case Instruction::Fence: NumFastIselFailFence++; return; - case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return; - - // Convert instructions... - case Instruction::Trunc: NumFastIselFailTrunc++; return; - case Instruction::ZExt: NumFastIselFailZExt++; return; - case Instruction::SExt: NumFastIselFailSExt++; return; - case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return; - case Instruction::FPExt: NumFastIselFailFPExt++; return; - case Instruction::FPToUI: NumFastIselFailFPToUI++; return; - case Instruction::FPToSI: NumFastIselFailFPToSI++; return; - case Instruction::UIToFP: NumFastIselFailUIToFP++; return; - case Instruction::SIToFP: NumFastIselFailSIToFP++; return; - case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; - case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return; - case Instruction::BitCast: NumFastIselFailBitCast++; return; - - // Other instructions... 
- case Instruction::ICmp: NumFastIselFailICmp++; return; - case Instruction::FCmp: NumFastIselFailFCmp++; return; - case Instruction::PHI: NumFastIselFailPHI++; return; - case Instruction::Select: NumFastIselFailSelect++; return; - case Instruction::Call: { - if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) { - switch (Intrinsic->getIntrinsicID()) { - default: - NumFastIselFailIntrinsicCall++; return; - case Intrinsic::sadd_with_overflow: - NumFastIselFailSAddWithOverflow++; return; - case Intrinsic::uadd_with_overflow: - NumFastIselFailUAddWithOverflow++; return; - case Intrinsic::ssub_with_overflow: - NumFastIselFailSSubWithOverflow++; return; - case Intrinsic::usub_with_overflow: - NumFastIselFailUSubWithOverflow++; return; - case Intrinsic::smul_with_overflow: - NumFastIselFailSMulWithOverflow++; return; - case Intrinsic::umul_with_overflow: - NumFastIselFailUMulWithOverflow++; return; - case Intrinsic::frameaddress: - NumFastIselFailFrameaddress++; return; - case Intrinsic::sqrt: - NumFastIselFailSqrt++; return; - case Intrinsic::experimental_stackmap: - NumFastIselFailStackMap++; return; - case Intrinsic::experimental_patchpoint_void: // fall-through - case Intrinsic::experimental_patchpoint_i64: - NumFastIselFailPatchPoint++; return; - } - } - NumFastIselFailCall++; - return; - } - case Instruction::Shl: NumFastIselFailShl++; return; - case Instruction::LShr: NumFastIselFailLShr++; return; - case Instruction::AShr: NumFastIselFailAShr++; return; - case Instruction::VAArg: NumFastIselFailVAArg++; return; - case Instruction::ExtractElement: NumFastIselFailExtractElement++; return; - case Instruction::InsertElement: NumFastIselFailInsertElement++; return; - case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return; - case Instruction::ExtractValue: NumFastIselFailExtractValue++; return; - case Instruction::InsertValue: NumFastIselFailInsertValue++; return; - case Instruction::LandingPad: NumFastIselFailLandingPad++; return; - } -} -#endif // NDEBUG - /// Set up SwiftErrorVals by going through the function. If the function has /// swifterror argument, it will be the first entry. static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, @@ -1190,9 +1073,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, } static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, + FastISel *FastIS, const TargetLowering *TLI, const TargetInstrInfo *TII, - const BasicBlock *LLVMBB, SelectionDAGBuilder *SDB) { if (!TLI->supportSwiftError()) return; @@ -1202,21 +1085,71 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, if (FuncInfo->SwiftErrorVals.empty()) return; - if (pred_begin(LLVMBB) == pred_end(LLVMBB)) { - auto &DL = FuncInfo->MF->getDataLayout(); - auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); - for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { - // We will always generate a copy from the argument. It is always used at - // least by the 'return' of the swifterror. - if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) + assert(FuncInfo->MBB == &*FuncInfo->MF->begin() && + "expected to insert into entry block"); + auto &DL = FuncInfo->MF->getDataLayout(); + auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); + for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { + // We will always generate a copy from the argument. It is always used at + // least by the 'return' of the swifterror. 
+ if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) + continue; + unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); + // Assign Undef to Vreg. We construct MI directly to make sure it works + // with FastISel. + BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), + SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), + VReg); + + // Keep FastIS informed about the value we just inserted. + if (FastIS) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + + FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); + } +} + +/// Collect llvm.dbg.declare information. This is done after argument lowering +/// in case the declarations refer to arguments. +static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) { + MachineFunction *MF = FuncInfo->MF; + const DataLayout &DL = MF->getDataLayout(); + for (const BasicBlock &BB : *FuncInfo->Fn) { + for (const Instruction &I : BB) { + const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I); + if (!DI) continue; - unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); - // Assign Undef to Vreg. We construct MI directly to make sure it works - // with FastISel. - BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), - SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), - VReg); - FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); + + assert(DI->getVariable() && "Missing variable"); + assert(DI->getDebugLoc() && "Missing location"); + const Value *Address = DI->getAddress(); + if (!Address) + continue; + + // Look through casts and constant offset GEPs. These mostly come from + // inalloca. + APInt Offset(DL.getPointerSizeInBits(0), 0); + Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // Check if the variable is a static alloca or a byval or inalloca + // argument passed in memory. If it is not, then we will ignore this + // intrinsic and handle this during isel like dbg.value. + int FI = INT_MAX; + if (const auto *AI = dyn_cast<AllocaInst>(Address)) { + auto SI = FuncInfo->StaticAllocaMap.find(AI); + if (SI != FuncInfo->StaticAllocaMap.end()) + FI = SI->second; + } else if (const auto *Arg = dyn_cast<Argument>(Address)) + FI = FuncInfo->getArgumentFrameIndex(Arg); + + if (FI == INT_MAX) + continue; + + DIExpression *Expr = DI->getExpression(); + if (Offset.getBoolValue()) + Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, + Offset.getZExtValue()); + MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc()); } } } @@ -1340,6 +1273,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { } void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { + FastISelFailed = false; // Initialize the Fast-ISel state, if needed. FastISel *FastIS = nullptr; if (TM.Options.EnableFastISel) @@ -1347,12 +1281,55 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { setupSwiftErrorVals(Fn, TLI, FuncInfo); - // Iterate over all basic blocks in the function. ReversePostOrderTraversal<const Function*> RPOT(&Fn); - for (ReversePostOrderTraversal<const Function*>::rpo_iterator - I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { - const BasicBlock *LLVMBB = *I; + // Lower arguments up front. An RPO iteration always visits the entry block + // first. + assert(*RPOT.begin() == &Fn.getEntryBlock()); + ++NumEntryBlocks; + + // Set up FuncInfo for ISel. Entry blocks never have PHIs. 
+ FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; + FuncInfo->InsertPt = FuncInfo->MBB->begin(); + + if (!FastIS) { + LowerArguments(Fn); + } else { + // See if fast isel can lower the arguments. + FastIS->startNewBlock(); + if (!FastIS->lowerArguments()) { + FastISelFailed = true; + // Fast isel failed to lower these arguments + ++NumFastIselFailLowerArguments; + + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Fn.getSubprogram(), + &Fn.getEntryBlock()); + R << "FastISel didn't lower all arguments: " + << ore::NV("Prototype", Fn.getType()); + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); + + // Use SelectionDAG argument lowering + LowerArguments(Fn); + CurDAG->setRoot(SDB->getControlRoot()); + SDB->clear(); + CodeGenAndEmitDAG(); + } + + // If we inserted any instructions at the beginning, make a note of + // where they are, so we can be sure to emit subsequent instructions + // after them. + if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) + FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); + else + FastIS->setLastLocalValue(nullptr); + } + createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + + processDbgDeclares(FuncInfo); + + // Iterate over all basic blocks in the function. + for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); @@ -1384,8 +1361,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; if (!FuncInfo->MBB) continue; // Some blocks like catchpads have no code or MBB. - FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); - createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB); + + // Insert new instructions after any phi or argument setup code. + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; @@ -1396,35 +1374,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { - FastIS->startNewBlock(); - - // Emit code for any incoming arguments. This must happen before - // beginning FastISel on the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - - // Lower any arguments needed in this block if this is the entry block. - if (!FastIS->lowerArguments()) { - // Fast isel failed to lower these arguments - ++NumFastIselFailLowerArguments; - if (EnableFastISelAbort > 1) - report_fatal_error("FastISel didn't lower all arguments"); - - // Use SelectionDAG argument lowering - LowerArguments(Fn); - CurDAG->setRoot(SDB->getControlRoot()); - SDB->clear(); - CodeGenAndEmitDAG(); - } - - // If we inserted any instructions at the beginning, make a note of - // where they are, so we can be sure to emit subsequent instructions - // after them. - if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) - FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); - else - FastIS->setLastLocalValue(nullptr); - } + if (LLVMBB != &Fn.getEntryBlock()) + FastIS->startNewBlock(); unsigned NumFastIselRemaining = std::distance(Begin, End); // Do FastISel on as many instructions as possible. @@ -1432,7 +1383,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. 
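Every FastISel miss now funnels through reportFastISelFailure() with an OptimizationRemarkMissed, replacing the old dbgs() prints. For reference, a standalone sketch that mirrors the calls used in this hunk; the helper name is illustrative, while the pass and remark names are the ones the patch uses:

#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void noteArgLoweringMiss(OptimizationRemarkEmitter &ORE,
                                const Function &Fn) {
  OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
                             Fn.getSubprogram(), &Fn.getEntryBlock());
  R << "FastISel didn't lower all arguments";
  ORE.emit(R); // surfaced via -pass-remarks-missed and remark listeners
}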
- if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { + if (isFoldedOrDeadInstruction(Inst, FuncInfo) || + ElidedArgCopyInstrs.count(Inst)) { --NumFastIselRemaining; continue; } @@ -1443,6 +1395,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Try to select the instruction with FastISel. if (FastIS->selectInstruction(Inst)) { + FastISelFailed = true; --NumFastIselRemaining; ++NumFastIselSuccess; // If fast isel succeeded, skip over all the folded instructions, and @@ -1465,22 +1418,22 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } -#ifndef NDEBUG - if (EnableFastISelVerbose2) - collectFailStats(Inst); -#endif - // Then handle certain instructions as single-LLVM-Instruction blocks. if (isa<CallInst>(Inst)) { + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); - if (EnableFastISelVerbose || EnableFastISelAbort) { - dbgs() << "FastISel missed call: "; - Inst->dump(); + R << "FastISel missed call"; + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; + + R << ": " << InstStr.str(); } - if (EnableFastISelAbort > 2) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. - report_fatal_error("FastISel didn't select the entire block"); + + reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2); if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && !Inst->use_empty()) { @@ -1509,35 +1462,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { continue; } + OptimizationRemarkMissed R("sdagisel", "FastISelFailure", + Inst->getDebugLoc(), LLVMBB); + bool ShouldAbort = EnableFastISelAbort; - if (EnableFastISelVerbose || EnableFastISelAbort) { - if (isa<TerminatorInst>(Inst)) { - // Use a different message for terminator misses. - dbgs() << "FastISel missed terminator: "; - // Don't abort unless for terminator unless the level is really high - ShouldAbort = (EnableFastISelAbort > 2); - } else { - dbgs() << "FastISel miss: "; - } - Inst->dump(); + if (isa<TerminatorInst>(Inst)) { + // Use a different message for terminator misses. + R << "FastISel missed terminator"; + // Don't abort for terminator unless the level is really high + ShouldAbort = (EnableFastISelAbort > 2); + } else { + R << "FastISel missed"; } - if (ShouldAbort) - // FastISel selector couldn't handle something and bailed. - // For the purpose of debugging, just abort. - report_fatal_error("FastISel didn't select the entire block"); + + if (R.isEnabled() || EnableFastISelAbort) { + std::string InstStrStorage; + raw_string_ostream InstStr(InstStrStorage); + InstStr << *Inst; + R << ": " << InstStr.str(); + } + + reportFastISelFailure(*MF, *ORE, R, ShouldAbort); NumFastIselFailures += NumFastIselRemaining; break; } FastIS->recomputeInsertPt(); - } else { - // Lower any arguments needed in this block if this is the entry block. - if (LLVMBB == &Fn.getEntryBlock()) { - ++NumEntryBlocks; - LowerArguments(Fn); - } } + if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) { bool FunctionBasedInstrumentation = TLI->getSSPStackGuardCheck(*Fn.getParent()); @@ -1556,10 +1509,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // block. bool HadTailCall; SelectBasicBlock(Begin, BI, HadTailCall); + + // But if FastISel was run, we already selected some of the block. 
+ // If we emitted a tail-call, we need to delete any previously emitted + // instruction that follows it. + if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end()) + FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end()); } FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); + ElidedArgCopyInstrs.clear(); } propagateSwiftErrorVRegs(FuncInfo); @@ -1975,11 +1935,11 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, // either already zero or is not demanded. Check for known zero input bits. APInt NeededMask = DesiredMask & ~ActualMask; - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(LHS, KnownZero, KnownOne); + KnownBits Known; + CurDAG->computeKnownBits(LHS, Known); // If all the missing bits in the or are already known to be set, match! - if ((NeededMask & KnownOne) == NeededMask) + if (NeededMask.isSubsetOf(Known.One)) return true; // TODO: check to see if missing bits are just not demanded. @@ -2177,7 +2137,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, IgnoreChains = false; } - SmallPtrSet<SDNode*, 16> Visited; return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); } @@ -2554,7 +2513,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { // Accept if it is exactly the same as a previously recorded node. unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid CheckSame"); @@ -2564,9 +2523,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, /// CheckChildSame - Implements OP_CheckChildXSame. LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, - SDValue N, - const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, - unsigned ChildNo) { + SDValue N, + const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes, + unsigned ChildNo) { if (ChildNo >= N.getNumOperands()) return false; // Match fails if out of range child #. return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo), @@ -2688,7 +2647,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, unsigned Index, SDValue N, bool &Result, const SelectionDAGISel &SDISel, - SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { + SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) { switch (Table[Index++]) { default: Result = false; @@ -2756,6 +2715,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table, } namespace { + struct MatchScope { /// FailIndex - If this match fails, this is the index to continue with. unsigned FailIndex; @@ -2785,6 +2745,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener SDNode **NodeToMatch; SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes; SmallVectorImpl<MatchScope> &MatchScopes; + public: MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch, SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN, @@ -2816,6 +2777,7 @@ public: J.setNode(E); } }; + } // end anonymous namespace void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, @@ -2921,7 +2883,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // with an OPC_SwitchOpcode instruction. 
Populate the table now, since this // is the first time we're selecting an instruction. unsigned Idx = 1; - while (1) { + while (true) { // Get the size of this case. unsigned CaseSize = MatcherTable[Idx++]; if (CaseSize & 128) @@ -2942,7 +2904,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MatcherIndex = OpcodeOffset[N.getOpcode()]; } - while (1) { + while (true) { assert(MatcherIndex < TableSize && "Invalid index"); #ifndef NDEBUG unsigned CurrentOpcodeIndex = MatcherIndex; @@ -2957,7 +2919,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // immediately fail, don't even bother pushing a scope for them. unsigned FailIndex; - while (1) { + while (true) { unsigned NumToSkip = MatcherTable[MatcherIndex++]; if (NumToSkip & 128) NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); @@ -3118,7 +3080,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, unsigned CurNodeOpcode = N.getOpcode(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3149,7 +3111,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, MVT CurNodeVT = N.getSimpleValueType(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; - while (1) { + while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) @@ -3215,7 +3177,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // a single use. bool HasMultipleUses = false; for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) - if (!NodeStack[i].hasOneUse()) { + if (!NodeStack[i].getNode()->hasOneUse()) { HasMultipleUses = true; break; } @@ -3381,6 +3343,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr)); continue; } + case OPC_Coverage: { + // This is emitted right before MorphNode/EmitNode. + // So it should be safe to assume that this node has been selected + unsigned index = MatcherTable[MatcherIndex++]; + index |= (MatcherTable[MatcherIndex++] << 8); + dbgs() << "COVERED: " << getPatternForIndex(index) << "\n"; + dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n"; + continue; + } case OPC_EmitNode: case OPC_MorphNodeTo: case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2: @@ -3473,7 +3444,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i), nullptr)); } - } else { assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE && "NodeToMatch was removed partway through selection"); @@ -3610,7 +3580,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // find a case to check. 
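The byte-decoding conventions here are worth spelling out: OPC_Coverage reads a two-byte little-endian pattern index, while case sizes and skip counts use a continuation-bit scheme (bit 7 set means another byte follows), handled by GetVBR in the selector. A standalone sketch of that scheme, assuming the same encoding:

    #include <cstdint>

    // Toy decoder mirroring the matcher table's variable-width integers:
    // each byte contributes 7 payload bits, low bits first; bit 7 flags a
    // continuation byte.
    static uint64_t decodeVBR(const unsigned char *Table, unsigned &Idx) {
      uint64_t Val = Table[Idx++];
      if (!(Val & 128))
        return Val;        // single-byte fast path
      Val &= 127;
      unsigned Shift = 7;
      unsigned char Byte;
      do {
        Byte = Table[Idx++];
        Val |= uint64_t(Byte & 127) << Shift;
        Shift += 7;
      } while (Byte & 128);
      return Val;
    }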
DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n"); ++NumDAGIselRetries; - while (1) { + while (true) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); return; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index d27e2455978d..1c66649cae01 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -110,8 +110,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, Builder.FuncInfo.StatepointStackSlots.size() && "Broken invariant"); - StatepointMaxSlotsRequired = std::max<unsigned long>( - StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size()); + StatepointMaxSlotsRequired.updateMax( + Builder.FuncInfo.StatepointStackSlots.size()); return SpillSlot; } @@ -242,7 +242,8 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue, // Cache this slot so we find it when going through the normal // assignment loop. - SDValue Loc = Builder.DAG.getTargetFrameIndex(*Index, Incoming.getValueType()); + SDValue Loc = + Builder.DAG.getTargetFrameIndex(*Index, Builder.getFrameIndexTy()); Builder.StatepointLowering.setLocation(Incoming, Loc); } @@ -343,7 +344,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, Builder); int Index = cast<FrameIndexSDNode>(Loc)->getIndex(); // We use TargetFrameIndex so that isel will not select it into LEA - Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType()); + Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy()); // TODO: We can create TokenFactor node instead of // chaining stores one after another, this may allow @@ -391,8 +392,10 @@ static void lowerIncomingStatepointValue(SDValue Incoming, bool LiveInOnly, // This handles allocas as arguments to the statepoint (this is only // really meaningful for a deopt value. For GC, we'd be trying to // relocate the address of the alloca itself?) + assert(Incoming.getValueType() == Builder.getFrameIndexTy() && + "Incoming value is a frame index!"); Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(), - Incoming.getValueType())); + Builder.getFrameIndexTy())); } else if (LiveInOnly) { // If this value is live in (not live-on-return, or live-through), we can // treat it the same way patchpoint treats it's "live in" values. 
We'll @@ -527,8 +530,10 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, SDValue Incoming = Builder.getValue(V); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) { // This handles allocas as arguments to the statepoint + assert(Incoming.getValueType() == Builder.getFrameIndexTy() && + "Incoming value is a frame index!"); Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(), - Incoming.getValueType())); + Builder.getFrameIndexTy())); } } @@ -949,8 +954,8 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { return; } - SDValue SpillSlot = DAG.getTargetFrameIndex(*DerivedPtrLocation, - SD.getValueType()); + SDValue SpillSlot = + DAG.getTargetFrameIndex(*DerivedPtrLocation, getFrameIndexTy()); // Be conservative: flush all pending loads // TODO: Probably we can be less restrictive on this, @@ -958,7 +963,9 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { SDValue Chain = getRoot(); SDValue SpillLoad = - DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, + DAG.getLoad(DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), + Relocate.getType()), + getCurSDLoc(), Chain, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), *DerivedPtrLocation)); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 690f0d2c8082..0dffffee9976 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" @@ -55,14 +56,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, // Conservatively require the attributes of the call to match those of // the return. Ignore noalias because it doesn't affect the call sequence. - AttributeSet CallerAttrs = F->getAttributes(); - if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex) - .removeAttribute(Attribute::NoAlias).hasAttributes()) + AttributeList CallerAttrs = F->getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex) + .removeAttribute(Attribute::NoAlias) + .hasAttributes()) return false; // It's not safe to eliminate the sign / zero extension of the return value. - if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) return false; // Check if the only use is a function return node. @@ -96,19 +98,19 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI, /// \brief Set CallLoweringInfo attribute flags based on a call instruction /// and called function attributes. 
-void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, - unsigned AttrIdx) { - isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf); - isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError); - Alignment = CS->getParamAlignment(AttrIdx); +void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS, + unsigned ArgIdx) { + IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt); + IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt); + IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg); + IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet); + IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest); + IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal); + IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca); + IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned); + IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf); + IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError); + Alignment = CS->getParamAlignment(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a @@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, for (SDValue Op : Ops) { Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); - Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); Args.push_back(Entry); } @@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) - .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed) - .setSExtResult(signExtend).setZExtResult(!signExtend); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setNoReturn(doesNotReturn) + .setDiscardResult(!isReturnValueUsed) + .setSExtResult(signExtend) + .setZExtResult(!signExtend); return LowerCallTo(CLI); } @@ -334,34 +339,40 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // Optimization Methods //===----------------------------------------------------------------------===// -/// Check to see if the specified operand of the specified instruction is a -/// constant integer. If so, check to see if there are any bits set in the -/// constant that are not demanded. If so, shrink the constant and return true. 
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, - const APInt &Demanded) { - SDLoc dl(Op); +/// If the specified instruction has a constant integer operand and there are +/// bits set in that constant that are not demanded, then clear those bits and +/// return true. +bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const { + SelectionDAG &DAG = TLO.DAG; + SDLoc DL(Op); + unsigned Opcode = Op.getOpcode(); + + // Do target-specific constant optimization. + if (targetShrinkDemandedConstant(Op, Demanded, TLO)) + return TLO.New.getNode(); // FIXME: ISD::SELECT, ISD::SELECT_CC - switch (Op.getOpcode()) { - default: break; + switch (Opcode) { + default: + break; case ISD::XOR: case ISD::AND: case ISD::OR: { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!C) return false; + auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Op1C) + return false; - if (Op.getOpcode() == ISD::XOR && - (C->getAPIntValue() | (~Demanded)).isAllOnesValue()) + // If this is a 'not' op, don't touch it because that's a canonical form. + const APInt &C = Op1C->getAPIntValue(); + if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue()) return false; - // if we can expand it to have all bits set, do it - if (C->getAPIntValue().intersects(~Demanded)) { + if (C.intersects(~Demanded)) { EVT VT = Op.getValueType(); - SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0), - DAG.getConstant(Demanded & - C->getAPIntValue(), - dl, VT)); - return CombineTo(Op, New); + SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); + SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); + return TLO.CombineTo(Op, NewOp); } break; @@ -374,15 +385,17 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be /// generalized for targets with other types of implicit widening casts. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, - unsigned BitWidth, - const APInt &Demanded, - const SDLoc &dl) { +bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, + const APInt &Demanded, + TargetLoweringOpt &TLO) const { assert(Op.getNumOperands() == 2 && "ShrinkDemandedOp only supports binary operators!"); assert(Op.getNode()->getNumValues() == 1 && "ShrinkDemandedOp only supports nodes with one result!"); + SelectionDAG &DAG = TLO.DAG; + SDLoc dl(Op); + // Early return, as this function cannot handle vector types. if (Op.getValueType().isVector()) return false; @@ -404,31 +417,28 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, if (TLI.isTruncateFree(Op.getValueType(), SmallVT) && TLI.isZExtFree(SmallVT, Op.getValueType())) { // We found a type with free casts. - SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT, - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(0)), - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(1))); + SDValue X = DAG.getNode( + Op.getOpcode(), dl, SmallVT, + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)), + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1))); bool NeedZext = DemandedSize > SmallVTBits; SDValue Z = DAG.getNode(NeedZext ? 
ISD::ZERO_EXTEND : ISD::ANY_EXTEND, dl, Op.getValueType(), X); - return CombineTo(Op, Z); + return TLO.CombineTo(Op, Z); } } return false; } bool -TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, - unsigned OpIdx, - const APInt &Demanded, - DAGCombinerInfo &DCI) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +TargetLowering::SimplifyDemandedBits(SDNode *User, unsigned OpIdx, + const APInt &Demanded, + DAGCombinerInfo &DCI, + TargetLoweringOpt &TLO) const { SDValue Op = User->getOperand(OpIdx); - APInt KnownZero, KnownOne; + KnownBits Known; - if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, - *this, 0, true)) + if (!SimplifyDemandedBits(Op, Demanded, Known, TLO, 0, true)) return false; @@ -440,9 +450,9 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // with the value 'x', which will give us: // Old = i32 and x, 0xffffff // New = x - if (Old.hasOneUse()) { + if (TLO.Old.hasOneUse()) { // For the one use case, we just commit the change. - DCI.CommitTargetLoweringOpt(*this); + DCI.CommitTargetLoweringOpt(TLO); return true; } @@ -450,17 +460,17 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // AssumeSingleUse flag is not propagated to recursive calls of // SimplifyDemandedBits, so the only node with multiple uses that // it will attempt to combine will be Op. - assert(Old == Op); + assert(TLO.Old == Op); SmallVector <SDValue, 4> NewOps; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (i == OpIdx) { - NewOps.push_back(New); + NewOps.push_back(TLO.New); continue; } NewOps.push_back(User->getOperand(i)); } - DAG.UpdateNodeOperands(User, NewOps); + TLO.DAG.UpdateNodeOperands(User, NewOps); // Op has fewer users now, so we may be able to perform additional combines // with it. DCI.AddToWorklist(Op.getNode()); @@ -470,17 +480,30 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, return true; } +bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask, + DAGCombinerInfo &DCI) const { + + SelectionDAG &DAG = DCI.DAG; + TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + KnownBits Known; + + bool Simplified = SimplifyDemandedBits(Op, DemandedMask, Known, TLO); + if (Simplified) + DCI.CommitTargetLoweringOpt(TLO); + return Simplified; +} + /// Look at Op. At this point, we know that only the DemandedMask bits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning the /// original and new nodes in Old and New. Otherwise, analyze the expression and -/// return a mask of KnownOne and KnownZero bits for the expression (used to -/// simplify the caller). The KnownZero/One bits may only be accurate for those -/// bits in the DemandedMask. +/// return a mask of Known bits for the expression (used to simplify the +/// caller). The Known bits may only be accurate for those bits in the +/// DemandedMask. bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth, bool AssumeSingleUse) const { @@ -492,14 +515,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, auto &DL = TLO.DAG.getDataLayout(); // Don't know anything. - KnownZero = KnownOne = APInt(BitWidth, 0); + Known = KnownBits(BitWidth); // Other users may use these bits.
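The mechanical KnownZero/KnownOne-to-KnownBits rewrite that dominates these hunks is easiest to read with a toy value. A minimal sketch of the new bundle:

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    void knownBitsDemo() {
      llvm::KnownBits Known(8);       // width 8, every bit unknown
      Known.Zero.setHighBits(4);      // bits 7..4 known to be 0
      Known.One.setBit(0);            // bit 0 known to be 1
      assert(!Known.hasConflict());   // no bit claimed both 0 and 1
      assert(Known.getBitWidth() == 8);
    }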
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) { if (Depth != 0) { - // If not at the root, Just compute the KnownZero/KnownOne bits to + // If not at the root, Just compute the Known bits to // simplify things downstream. - TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); + TLO.DAG.computeKnownBits(Op, Known, Depth); return false; } // If this is the root being simplified, allow it to have multiple uses, @@ -514,38 +537,36 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } - APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut; + KnownBits Known2, KnownOut; switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); - KnownZero = ~KnownOne; + Known.One = cast<ConstantSDNode>(Op)->getAPIntValue(); + Known.Zero = ~Known.One; return false; // Don't fall through, will infinitely loop. case ISD::BUILD_VECTOR: // Collect the known bits that are shared by every constant vector element. - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); Known.One.setAllBits(); for (SDValue SrcOp : Op->ops()) { if (!isa<ConstantSDNode>(SrcOp)) { // We can only handle all constant values - bail out with no known bits. - KnownZero = KnownOne = APInt(BitWidth, 0); + Known = KnownBits(BitWidth); return false; } - KnownOne2 = cast<ConstantSDNode>(SrcOp)->getAPIntValue(); - KnownZero2 = ~KnownOne2; + Known2.One = cast<ConstantSDNode>(SrcOp)->getAPIntValue(); + Known2.Zero = ~Known2.One; // BUILD_VECTOR can implicitly truncate sources, we must handle this. - if (KnownOne2.getBitWidth() != BitWidth) { - assert(KnownOne2.getBitWidth() > BitWidth && - KnownZero2.getBitWidth() > BitWidth && + if (Known2.One.getBitWidth() != BitWidth) { + assert(Known2.getBitWidth() > BitWidth && "Expected BUILD_VECTOR implicit truncation"); - KnownOne2 = KnownOne2.trunc(BitWidth); - KnownZero2 = KnownZero2.trunc(BitWidth); + Known2 = Known2.trunc(BitWidth); } // Known bits are the values that are shared by every element. // TODO: support per-element known bits. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; } return false; // Don't fall through, will infinitely loop. case ISD::AND: @@ -553,18 +574,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // using the bits from the RHS. Below, we use knowledge about the RHS to // simplify the LHS, here we're using information from the LHS to simplify // the RHS. - if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (ConstantSDNode *RHSC = isConstOrConstSplat(Op.getOperand(1))) { SDValue Op0 = Op.getOperand(0); - APInt LHSZero, LHSOne; + KnownBits LHSKnown; // Do not increment Depth here; that can cause an infinite loop. - TLO.DAG.computeKnownBits(Op0, LHSZero, LHSOne, Depth); + TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth); // If the LHS already has zeros where RHSC does, this and is dead. - if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) + if ((LHSKnown.Zero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) return TLO.CombineTo(Op, Op0); // If any of the set bits in the RHS are known zero on the LHS, shrink // the constant. - if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask)) + if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & NewMask, TLO)) return true; // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its @@ -573,183 +594,191 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // the xor. 
For example, for a 32-bit X: // and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1 if (isBitwiseNot(Op0) && Op0.hasOneUse() && - LHSOne == ~RHSC->getAPIntValue()) { + LHSKnown.One == ~RHSC->getAPIntValue()) { SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, Op.getValueType(), Op0.getOperand(0), Op.getOperand(1)); return TLO.CombineTo(Op, Xor); } } - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask, - KnownZero2, KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), ~Known.Zero & NewMask, + Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. - if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask)) + if (NewMask.isSubsetOf(Known2.Zero | Known.One)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask)) + if (NewMask.isSubsetOf(Known.Zero | Known2.One)) return TLO.CombineTo(Op, Op.getOperand(1)); // If all of the demanded bits in the inputs are known zeros, return zero. - if ((NewMask & (KnownZero|KnownZero2)) == NewMask) + if (NewMask.isSubsetOf(Known.Zero | Known2.Zero)) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType())); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask)) + if (ShrinkDemandedConstant(Op, ~Known2.Zero & NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-1 bits are only known if set in both the LHS & RHS. - KnownOne &= KnownOne2; + Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. - KnownZero |= KnownZero2; + Known.Zero |= Known2.Zero; break; case ISD::OR: - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask, - KnownZero2, KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), ~Known.One & NewMask, + Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. - if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask)) - return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask)) - return TLO.CombineTo(Op, Op.getOperand(1)); - // If all of the potentially set bits on one side are known to be set on - // the other side, just use the 'other' side. 
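The rewrite repeated throughout these hunks, '(Mask & ~Z & One) == (~Z & Mask)' becoming 'Mask.isSubsetOf(Z | One)', is a pure equivalence: both ask whether every demanded bit is covered. A concrete 8-bit check (values chosen only for illustration):

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    void subsetOfDemo() {
      llvm::APInt Mask(8, 0x0F), Z(8, 0x0C), One(8, 0x03);
      bool OldForm = (Mask & ~Z & One) == (~Z & Mask);
      bool NewForm = Mask.isSubsetOf(Z | One);
      assert(OldForm == NewForm && NewForm); // 0x0F is covered by 0x0C | 0x03
    }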
- if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask)) + if (NewMask.isSubsetOf(Known2.One | Known.Zero)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask)) + if (NewMask.isSubsetOf(Known.One | Known2.Zero)) return TLO.CombineTo(Op, Op.getOperand(1)); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-0 bits are only known if clear in both the LHS & RHS. - KnownZero &= KnownZero2; + Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - KnownOne |= KnownOne2; + Known.One |= Known2.One; break; - case ISD::XOR: - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + case ISD::XOR: { + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + if (SimplifyDemandedBits(Op.getOperand(0), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. - if ((KnownZero & NewMask) == NewMask) + if (NewMask.isSubsetOf(Known.Zero)) return TLO.CombineTo(Op, Op.getOperand(0)); - if ((KnownZero2 & NewMask) == NewMask) + if (NewMask.isSubsetOf(Known2.Zero)) return TLO.CombineTo(Op, Op.getOperand(1)); // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // If all of the unknown bits are known to be zero on one side or the other // (but not both) turn this into an *inclusive* or. // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 - if ((NewMask & ~KnownZero & ~KnownZero2) == 0) + if ((NewMask & ~Known.Zero & ~Known2.Zero) == 0) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1))); // Output known-0 bits are known if clear or set in both the LHS & RHS. - KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); + KnownOut.Zero = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); + KnownOut.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); // If all of the demanded bits on one side are known, and all of the set // bits on that side are also known to be set on the other side, turn this // into an AND, as we know the bits will be cleared. // e.g. 
(X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 // NB: it is okay if more bits are known than are requested - if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side - if (KnownOne == KnownOne2) { // set bits are the same on both sides + if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // all known on one side + if (Known.One == Known2.One) { // set bits are the same on both sides EVT VT = Op.getValueType(); - SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, dl, VT); + SDValue ANDC = TLO.DAG.getConstant(~Known.One & NewMask, dl, VT); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), ANDC)); } } - // If the RHS is a constant, see if we can simplify it. - // for XOR, we prefer to force bits to 1 if they will make a -1. - // If we can't force bits, try to shrink the constant. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - APInt Expanded = C->getAPIntValue() | (~NewMask); - // If we can expand it to have all bits set, do it. - if (Expanded.isAllOnesValue()) { - if (Expanded != C->getAPIntValue()) { - EVT VT = Op.getValueType(); - SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0), - TLO.DAG.getConstant(Expanded, dl, VT)); - return TLO.CombineTo(Op, New); - } - // If it already has all the bits set, nothing to change - // but don't shrink either! - } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) { - return true; + // If the RHS is a constant, see if we can change it. Don't alter a -1 + // constant because that's a 'not' op, and that is better for combining and + // codegen. + ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1)); + if (C && !C->isAllOnesValue()) { + if (NewMask.isSubsetOf(C->getAPIntValue())) { + // We're flipping all demanded bits. Flip the undemanded bits too. + SDValue New = TLO.DAG.getNOT(dl, Op.getOperand(0), Op.getValueType()); + return TLO.CombineTo(Op, New); } + // If we can't turn this into a 'not', try to shrink the constant. + if (ShrinkDemandedConstant(Op, NewMask, TLO)) + return true; } - KnownZero = KnownZeroOut; - KnownOne = KnownOneOut; + Known = std::move(KnownOut); break; + } case ISD::SELECT: - if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known, TLO, Depth+1)) return true; - if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. 
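The new XOR rule replaces the old "force bits to 1" expansion: when every demanded bit of (x ^ C) is set in C, all demanded bits get flipped, so the node can become a canonical 'not' with the undemanded bits flipped too. An exhaustive 8-bit check of that claim:

    #include <cassert>
    #include <cstdint>

    void xorToNotDemo() {
      const uint8_t Mask = 0x0F, C = 0x5F;    // Mask is a subset of C
      for (unsigned X = 0; X < 256; ++X) {
        uint8_t Xor = uint8_t(X) ^ C;
        uint8_t Not = uint8_t(~X);
        assert((Xor & Mask) == (Not & Mask)); // identical on demanded bits
      }
    }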
- KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; break; case ISD::SELECT_CC: - if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(3), NewMask, Known, TLO, Depth+1)) return true; - if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2, - KnownOne2, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known2, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. - KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + break; + case ISD::SETCC: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // If (1) we only need the sign-bit, (2) the setcc operands are the same + // width as the setcc result, and (3) the result of a setcc conforms to 0 or + // -1, we may be able to bypass the setcc. + if (NewMask.isSignMask() && Op0.getScalarValueSizeInBits() == BitWidth && + getBooleanContents(Op.getValueType()) == + BooleanContent::ZeroOrNegativeOneBooleanContent) { + // If we're testing X < 0, then this compare isn't needed - just use X! + // FIXME: We're limiting to integer types here, but this should also work + // if we don't care about FP signed-zero. The use of SETLT with FP means + // that we don't care about NaNs. + if (CC == ISD::SETLT && Op1.getValueType().isInteger() && + (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode()))) + return TLO.CombineTo(Op, Op0); + + // TODO: Should we check for other forms of sign-bit comparisons? + // Examples: X <= -1, X >= 0 + } + if (getBooleanContents(Op0.getValueType()) == + TargetLowering::ZeroOrOneBooleanContent && + BitWidth > 1) + Known.Zero.setBitsFrom(1); break; + } case ISD::SHL: if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); @@ -781,17 +810,16 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), Known, TLO, Depth+1)) return true; // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits // are not demanded. This will likely allow the anyext to be folded away. 
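The new ISD::SETCC case is the interesting addition here: when only the sign bit is demanded, booleans are zero-or-negative-one, and the compare is 'x < 0' against zero, the setcc result and x already agree on that bit, so the compare can be bypassed. A scalar illustration of the invariant being exploited:

    #include <cassert>
    #include <cstdint>

    void setccSignBitDemo(int32_t X) {
      int32_t Cmp = (X < 0) ? -1 : 0;          // 0/-1 boolean, same width as X
      const uint32_t SignMask = UINT32_C(1) << 31;
      assert((uint32_t(Cmp) & SignMask) == (uint32_t(X) & SignMask));
    }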
if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) { - SDValue InnerOp = InOp.getNode()->getOperand(0); + SDValue InnerOp = InOp.getOperand(0); EVT InnerVT = InnerOp.getValueType(); unsigned InnerBits = InnerVT.getSizeInBits(); - if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 && + if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits && isTypeDesirableForOp(ISD::SHL, InnerVT)) { EVT ShTy = getShiftAmountTy(InnerVT, DL); if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits())) @@ -813,12 +841,12 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InnerOp.getOpcode() == ISD::SRL && InnerOp.hasOneUse() && isa<ConstantSDNode>(InnerOp.getOperand(1))) { - uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1)) + unsigned InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1)) ->getZExtValue(); if (InnerShAmt < ShAmt && InnerShAmt < InnerBits && - NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 && - NewMask.trunc(ShAmt) == 0) { + NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) && + NewMask.countTrailingZeros() >= ShAmt) { SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl, Op.getOperand(1).getValueType()); @@ -831,10 +859,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - KnownZero <<= SA->getZExtValue(); - KnownOne <<= SA->getZExtValue(); + Known.Zero <<= SA->getZExtValue(); + Known.One <<= SA->getZExtValue(); // low bits known zero. - KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue()); + Known.Zero.setLowBits(SA->getZExtValue()); } break; case ISD::SRL: @@ -852,8 +880,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). - if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + if (Op->getFlags().hasExact()) + InDemandedMask.setLowBits(ShAmt); // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a // single shift. We can do this if the top bits (which are shifted out) @@ -877,15 +905,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } // Compute the new bits that are at the top now. - if (SimplifyDemandedBits(InOp, InDemandedMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(InOp, InDemandedMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.lshr(ShAmt); - KnownOne = KnownOne.lshr(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - KnownZero |= HighBits; // High bits known zero. + Known.Zero.setHighBits(ShAmt); // High bits known zero. } break; case ISD::SRA: @@ -910,33 +936,30 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the shift is exact, then it does demand the low bits (and knows that // they are zero). - if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()) - InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt); + if (Op->getFlags().hasExact()) + InDemandedMask.setLowBits(ShAmt); // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. 
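A side theme of this patch is switching APInt to its in-place mutators: lshrInPlace instead of reassigning lshr's result, and setLowBits/setHighBits instead of OR-ing in getLowBitsSet/getHighBitsSet masks. A small before/after sketch:

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    void apintInPlaceDemo() {
      llvm::APInt K(32, 0xFF00);
      K.lshrInPlace(8);            // was: K = K.lshr(8); no temporary APInt
      assert(K == 0xFF);

      llvm::APInt M(32, 0);
      M.setLowBits(4);             // was: M |= APInt::getLowBitsSet(32, 4)
      M.setHighBits(4);            // was: M |= APInt::getHighBitsSet(32, 4)
      assert(M == 0xF000000F);
    }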
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - if (HighBits.intersects(NewMask)) - InDemandedMask |= APInt::getSignBit(VT.getScalarSizeInBits()); + if (NewMask.countLeadingZeros() < ShAmt) + InDemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, Known, TLO, + Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.lshr(ShAmt); - KnownOne = KnownOne.lshr(ShAmt); - - // Handle the sign bit, adjusted to where it is now in the mask. - APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. - if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) { + if (Known.Zero[BitWidth - ShAmt - 1] || + NewMask.countLeadingZeros() >= ShAmt) { SDNodeFlags Flags; - Flags.setExact(cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact()); + Flags.setExact(Op->getFlags().hasExact()); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op.getOperand(0), - Op.getOperand(1), &Flags)); + Op.getOperand(1), Flags)); } int Log2 = NewMask.exactLogBase2(); @@ -949,9 +972,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0), NewSA)); } - if (KnownOne.intersects(SignBit)) + if (Known.One[BitWidth - ShAmt - 1]) // New bits are known one. - KnownOne |= HighBits; + Known.One.setHighBits(ShAmt); } break; case ISD::SIGN_EXTEND_INREG: { @@ -993,7 +1016,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return TLO.CombineTo(Op, Op.getOperand(0)); APInt InSignBit = - APInt::getSignBit(ExVT.getScalarSizeInBits()).zext(BitWidth); + APInt::getSignMask(ExVT.getScalarSizeInBits()).zext(BitWidth); APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, ExVT.getScalarSizeInBits()) & @@ -1004,24 +1027,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InputDemandedBits |= InSignBit; if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits, - KnownZero, KnownOne, TLO, Depth+1)) + Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. // If the input sign bit is known zero, convert this into a zero extension. 
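The SRA rewrite keeps the existing logic but restates it through KnownBits: once the shifted-in sign bit is known zero, or no demanded bit comes from it, an arithmetic shift right behaves exactly like a logical one and the node can be relaxed to SRL. The scalar fact being relied on:

    #include <cassert>
    #include <cstdint>

    void sraIsSrlWhenNonNegative(uint32_t X) {
      X &= 0x7FFFFFFF;                              // sign bit known zero
      assert((int32_t(X) >> 5) == int32_t(X >> 5)); // SRA == SRL here
    }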
- if (KnownZero.intersects(InSignBit)) + if (Known.Zero.intersects(InSignBit)) return TLO.CombineTo(Op, TLO.DAG.getZeroExtendInReg( Op.getOperand(0), dl, ExVT.getScalarType())); - if (KnownOne.intersects(InSignBit)) { // Input sign bit known set - KnownOne |= NewBits; - KnownZero &= ~NewBits; + if (Known.One.intersects(InSignBit)) { // Input sign bit known set + Known.One |= NewBits; + Known.Zero &= ~NewBits; } else { // Input sign bit unknown - KnownZero &= ~NewBits; - KnownOne &= ~NewBits; + Known.Zero &= ~NewBits; + Known.One &= ~NewBits; } break; } @@ -1032,22 +1055,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt MaskLo = NewMask.getLoBits(HalfBitWidth).trunc(HalfBitWidth); APInt MaskHi = NewMask.getHiBits(HalfBitWidth).trunc(HalfBitWidth); - APInt KnownZeroLo, KnownOneLo; - APInt KnownZeroHi, KnownOneHi; + KnownBits KnownLo, KnownHi; - if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownZeroLo, - KnownOneLo, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1)) return true; - if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownZeroHi, - KnownOneHi, TLO, Depth + 1)) + if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1)) return true; - KnownZero = KnownZeroLo.zext(BitWidth) | - KnownZeroHi.zext(BitWidth).shl(HalfBitWidth); + Known.Zero = KnownLo.Zero.zext(BitWidth) | + KnownHi.Zero.zext(BitWidth).shl(HalfBitWidth); - KnownOne = KnownOneLo.zext(BitWidth) | - KnownOneHi.zext(BitWidth).shl(HalfBitWidth); + Known.One = KnownLo.One.zext(BitWidth) | + KnownHi.One.zext(BitWidth).shl(HalfBitWidth); break; } case ISD::ZERO_EXTEND: { @@ -1062,20 +1082,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getValueType(), Op.getOperand(0))); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - KnownZero |= NewBits; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known = Known.zext(BitWidth); + Known.Zero |= NewBits; break; } case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); APInt InMask = APInt::getLowBitsSet(BitWidth, InBits); - APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits); + APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1); APInt NewBits = ~InMask & NewMask; // If none of the top bits are demanded, convert this into an any_extend. @@ -1090,37 +1108,34 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, InDemandedBits |= InSignBit; InDemandedBits = InDemandedBits.trunc(InBits); - if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero, - KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, Known, TLO, + Depth+1)) return true; - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + Known = Known.zext(BitWidth); // If the sign bit is known zero, convert this to a zero extend. - if (KnownZero.intersects(InSignBit)) + if (Known.Zero.intersects(InSignBit)) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Op.getOperand(0))); // If the sign bit is known one, the top bits match. 
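These extension cases now funnel through KnownBits::zext/trunc instead of zero-extending the two APInts by hand. A tiny demonstration of the round trip:

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    void knownBitsExtendDemo() {
      llvm::KnownBits K(8);
      K.Zero.setHighBits(4);           // i8 value, top nibble known 0
      llvm::KnownBits W = K.zext(16);  // was: Zero.zext(16) and One.zext(16)
      assert(W.getBitWidth() == 16);
      llvm::KnownBits N = W.trunc(8);  // back to the original width
      assert(N.Zero == K.Zero && N.One == K.One);
    }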
- if (KnownOne.intersects(InSignBit)) { - KnownOne |= NewBits; - assert((KnownZero & NewBits) == 0); + if (Known.One.intersects(InSignBit)) { + Known.One |= NewBits; + assert((Known.Zero & NewBits) == 0); } else { // Otherwise, top bits aren't known. - assert((KnownOne & NewBits) == 0); - assert((KnownZero & NewBits) == 0); + assert((Known.One & NewBits) == 0); + assert((Known.Zero & NewBits) == 0); } break; } case ISD::ANY_EXTEND: { unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); APInt InMask = NewMask.trunc(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), InMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known = Known.zext(BitWidth); break; } case ISD::TRUNCATE: { @@ -1128,11 +1143,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // zero/one bits live out. unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits(); APInt TruncMask = NewMask.zext(OperandBitWidth); - if (SimplifyDemandedBits(Op.getOperand(0), TruncMask, - KnownZero, KnownOne, TLO, Depth+1)) + if (SimplifyDemandedBits(Op.getOperand(0), TruncMask, Known, TLO, Depth+1)) return true; - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); + Known = Known.trunc(BitWidth); // If the input is only used by this truncate, see if we can shrink it based // on the known demanded bits. @@ -1158,26 +1171,29 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, getShiftAmountTy(Op.getValueType(), DL)); } - APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, - OperandBitWidth - BitWidth); - HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth); - - if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) { - // None of the shifted in bits are needed. Add a truncate of the - // shift input, then shift it. - SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl, - Op.getValueType(), - In.getOperand(0)); - return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, - Op.getValueType(), - NewTrunc, - Shift)); + if (ShAmt->getZExtValue() < BitWidth) { + APInt HighBits = APInt::getHighBitsSet(OperandBitWidth, + OperandBitWidth - BitWidth); + HighBits.lshrInPlace(ShAmt->getZExtValue()); + HighBits = HighBits.trunc(BitWidth); + + if (!(HighBits & NewMask)) { + // None of the shifted in bits are needed. Add a truncate of the + // shift input, then shift it. 
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl, + Op.getValueType(), + In.getOperand(0)); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, + Op.getValueType(), + NewTrunc, + Shift)); + } } break; } } - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case ISD::AssertZext: { @@ -1187,11 +1203,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask, - KnownZero, KnownOne, TLO, Depth+1)) + Known, TLO, Depth+1)) return true; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - KnownZero |= ~InMask & NewMask; + Known.Zero |= ~InMask; break; } case ISD::BITCAST: @@ -1200,7 +1216,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (!TLO.LegalOperations() && !Op.getValueType().isVector() && !Op.getOperand(0).getValueType().isVector() && - NewMask == APInt::getSignBit(Op.getValueSizeInBits()) && + NewMask == APInt::getSignMask(Op.getValueSizeInBits()) && Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); @@ -1229,22 +1245,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // of the highest bit demanded of them. APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - NewMask.countLeadingZeros()); - if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2, - KnownOne2, TLO, Depth+1) || - SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2, - KnownOne2, TLO, Depth+1) || + if (SimplifyDemandedBits(Op.getOperand(0), LoMask, Known2, TLO, Depth+1) || + SimplifyDemandedBits(Op.getOperand(1), LoMask, Known2, TLO, Depth+1) || // See if the operation should be performed at a smaller bit width. - TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) { - const SDNodeFlags *Flags = Op.getNode()->getFlags(); - if (Flags->hasNoSignedWrap() || Flags->hasNoUnsignedWrap()) { + ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) { + SDNodeFlags Flags = Op.getNode()->getFlags(); + if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { // Disable the nsw and nuw flags. We can no longer guarantee that we // won't wrap after simplification. - SDNodeFlags NewFlags = *Flags; - NewFlags.setNoSignedWrap(false); - NewFlags.setNoUnsignedWrap(false); + Flags.setNoSignedWrap(false); + Flags.setNoUnsignedWrap(false); SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1), - &NewFlags); + Flags); return TLO.CombineTo(Op, NewOp); } return true; @@ -1253,13 +1266,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } default: // Just use computeKnownBits to compute output bits. - TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); + TLO.DAG.computeKnownBits(Op, Known, Depth); break; } // If we know the value of all of the demanded bits, return this as a // constant. - if ((NewMask & (KnownZero|KnownOne)) == NewMask) { + if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // Avoid folding to a constant if any OpaqueConstant is involved. 
const SDNode *N = Op.getNode(); for (SDNodeIterator I = SDNodeIterator::begin(N), @@ -1270,17 +1283,17 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } return TLO.CombineTo(Op, - TLO.DAG.getConstant(KnownOne, dl, Op.getValueType())); + TLO.DAG.getConstant(Known.One, dl, Op.getValueType())); } return false; } /// Determine which of the bits specified in Mask are known to be either zero or -/// one and return them in the KnownZero/KnownOne bitsets. +/// one and return them in the Known. void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1289,12 +1302,13 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Op.getOpcode() == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); + Known.resetAll(); } /// This method can be implemented by targets that want to expose additional /// information about sign bits to the DAG Combiner. unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + const APInt &, const SelectionDAG &, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || @@ -1306,31 +1320,38 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } +// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must +// work with truncating build vectors and vectors with elements of less than +// 8 bits. bool TargetLowering::isConstTrueVal(const SDNode *N) const { if (!N) return false; - const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); - if (!CN) { - const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N); - if (!BV) - return false; - - // Only interested in constant splats, we don't care about undef - // elements in identifying boolean constants and getConstantSplatNode - // returns NULL if all ops are undef; - CN = BV->getConstantSplatNode(); + APInt CVal; + if (auto *CN = dyn_cast<ConstantSDNode>(N)) { + CVal = CN->getAPIntValue(); + } else if (auto *BV = dyn_cast<BuildVectorSDNode>(N)) { + auto *CN = BV->getConstantSplatNode(); if (!CN) return false; + + // If this is a truncating build vector, truncate the splat value. + // Otherwise, we may fail to match the expected values below. 
+ unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits(); + CVal = CN->getAPIntValue(); + if (BVEltWidth < CVal.getBitWidth()) + CVal = CVal.trunc(BVEltWidth); + } else { + return false; } switch (getBooleanContents(N->getValueType(0))) { case UndefinedBooleanContent: - return CN->getAPIntValue()[0]; + return CVal[0]; case ZeroOrOneBooleanContent: - return CN->isOne(); + return CVal == 1; case ZeroOrNegativeOneBooleanContent: - return CN->isAllOnesValue(); + return CVal.isAllOnesValue(); } llvm_unreachable("Invalid boolean contents"); @@ -1656,7 +1677,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, bestWidth = width; break; } - newMask = newMask << width; + newMask <<= width; } } } @@ -2006,7 +2027,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } else { ShiftBits = C1.countTrailingZeros(); } - NewC = NewC.lshr(ShiftBits); + NewC.lshrInPlace(ShiftBits); if (ShiftBits && NewC.getMinSignedBits() <= 64 && isLegalICmpImmediate(NewC.getSExtValue())) { auto &DL = DAG.getDataLayout(); @@ -2050,6 +2071,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (Cond == ISD::SETO || Cond == ISD::SETUO) return DAG.getSetCC(dl, VT, N0, N0, Cond); + // setcc (fneg x), C -> setcc swap(pred) x, -C + if (N0.getOpcode() == ISD::FNEG) { + ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(SwapCond, N0.getSimpleValueType())) { + SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1); + return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond); + } + } + // If the condition is not legal, see if we can find an equivalent one // which is legal. if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) { @@ -2470,13 +2501,10 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr)); // Figure out which register class contains this reg. - for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), - E = RI->regclass_end(); RCI != E; ++RCI) { - const TargetRegisterClass *RC = *RCI; - + for (const TargetRegisterClass *RC : RI->regclasses()) { // If none of the value types for this register class are valid, we // can't use it. For example, 64-bit reg classes on 32-bit targets. - if (!isLegalRC(RC)) + if (!isLegalRC(*RI, *RC)) continue; for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); @@ -2488,9 +2516,9 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, // If this register class has the requested value type, return it, // otherwise keep searching and return the first class found // if no other is found which explicitly has the requested type. - if (RC->hasType(VT)) + if (RI->isTypeLegalForClass(*RC, VT)) return S; - else if (!R.second) + if (!R.second) R = S; } } @@ -2914,9 +2942,9 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, DAG.getDataLayout())); SDNodeFlags Flags; Flags.setExact(true); - Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, &Flags); + Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, Flags); Created.push_back(Op1.getNode()); - d = d.ashr(ShAmt); + d.ashrInPlace(ShAmt); } // Calculate the multiplicative inverse, using Newton's method. 
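The "multiplicative inverse, using Newton's method" comment deserves a worked instance: for odd d, iterating x = x * (2 - d*x) mod 2^32 doubles the number of correct low bits each step, so five steps from x = 1 yield the full inverse and an exact division becomes a single multiply:

    #include <cassert>
    #include <cstdint>

    void exactSDivDemo() {
      const uint32_t D = 7;
      uint32_t Inv = 1;                 // correct to 1 bit: 7*1 == 1 (mod 2)
      for (int I = 0; I < 5; ++I)
        Inv *= 2 - D * Inv;             // Newton step, doubles correct bits
      assert(D * Inv == 1);             // Inv is 7^-1 mod 2^32
      assert(uint32_t(7 * 12345) * Inv == 12345); // exact sdiv as a multiply
    }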
@@ -2933,7 +2961,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -2958,7 +2986,7 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor, return SDValue(); // If the sdiv has an 'exact' bit we can use a simpler lowering. - if (cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact()) + if (N->getFlags().hasExact()) return BuildExactSDIV(*this, N->getOperand(0), Divisor, dl, DAG, *Created); APInt::ms magics = Divisor.magic(); @@ -3297,7 +3325,7 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT); SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT); SDValue Bias = DAG.getConstant(127, dl, IntVT); - SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()), dl, + SDValue SignMask = DAG.getConstant(APInt::getSignMask(VT.getSizeInBits()), dl, IntVT); SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT); SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT); @@ -3808,7 +3836,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); - CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); + CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
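
The expandFP_TO_SINT hunk above (where APInt::getSignBit becomes getSignMask) builds the conversion out of the constants 0x7F800000, 0x007FFFFF, 23, and 127. A minimal stand-alone model of that decomposition for single precision; NaN and out-of-range inputs are deliberately not handled, and the helper name is made up:

#include <cassert>
#include <cstdint>
#include <cstring>

// Integer-only float->int truncation for IEEE-754 single precision:
// split the bits into sign, biased exponent, and mantissa, restore the
// hidden bit, then shift the mantissa into place.
static int32_t softFPToSInt(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  int32_t Sign = (int32_t)Bits >> 31;               // 0 or -1
  int32_t Exp = (int32_t)((Bits & 0x7F800000) >> 23) - 127;
  uint32_t Mant = (Bits & 0x007FFFFF) | (1u << 23); // implicit leading 1
  if (Exp < 0)
    return 0;                                       // |F| < 1 truncates to 0
  uint32_t Mag = Exp >= 23 ? Mant << (Exp - 23) : Mant >> (23 - Exp);
  return ((int32_t)Mag ^ Sign) - Sign;              // conditional negation
}

int main() {
  assert(softFPToSInt(42.75f) == 42);
  assert(softFPToSInt(-3.9f) == -3);
  assert(softFPToSInt(0.5f) == 0);
  return 0;
}
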
diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index ff7d205c1f4c..6750fde57638 100644 --- a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "shadowstackgclowering" +#define DEBUG_TYPE "shadow-stack-gc-lowering" namespace { @@ -66,10 +66,10 @@ private: }; } -INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(GCModuleInfo) -INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); } diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp index 4837495777da..aa75f5e2caa2 100644 --- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -210,13 +210,12 @@ public: char ShrinkWrap::ID = 0; char &llvm::ShrinkWrapID = ShrinkWrap::ID; -INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, - false) +INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) +INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const { @@ -282,8 +281,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, if (!Restore) Restore = &MBB; - else + else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it // means the block never returns. If that's the // case, we don't want to call // `findNearestCommonDominator`, which will // return `Restore`. Restore = MPDT->findNearestCommonDominator(Restore, &MBB); + else + Restore = nullptr; // Abort, we can't find a restore point in this case. // Make sure we would be able to insert the restore code before the // terminator. @@ -293,7 +298,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, continue; // One of the terminators needs to happen before the restore point. if (MBB.succ_empty()) { - Restore = nullptr; + Restore = nullptr; // Abort, we can't find a restore point in this case. break; } // Look for a restore point that post-dominates all the successors.
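
A pattern that recurs through the rest of this change (ShadowStackGCLowering and ShrinkWrap above, then SjLjEHPrepare, SlotIndexes, SpillPlacement, StackColoring, StackSlotColoring, TailDuplication): the name string in the INITIALIZE_PASS macros is replaced by DEBUG_TYPE, so the registered pass name and the -debug-only= key are spelled exactly once and cannot drift apart. A sketch of the idiom with a hypothetical pass; in a real LLVM tree the initialize declaration would come from InitializePasses.h:

#include "llvm/Pass.h"

// Spell the command-line name once; the macro below reuses it.
#define DEBUG_TYPE "example-pass"

namespace llvm {
void initializeExamplePassPass(PassRegistry &); // normally in InitializePasses.h
} // end namespace llvm

namespace {
struct ExamplePass : public llvm::FunctionPass {
  static char ID;
  ExamplePass() : llvm::FunctionPass(ID) {}
  bool runOnFunction(llvm::Function &) override { return false; }
};
} // end anonymous namespace

char ExamplePass::ID = 0;
INITIALIZE_PASS(ExamplePass, DEBUG_TYPE, "Example Pass", false, false)
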
@@ -419,7 +424,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF, } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty() || !isShrinkWrapEnabled(MF)) + if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) return false; DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 209bbe54ea23..09e9c3bb3354 100644 --- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -64,6 +64,7 @@ public: private: bool setupEntryBlockAndCallSites(Function &F); + bool undoSwiftErrorSelect(Function &F); void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal); Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads); void lowerIncomingArguments(Function &F); @@ -73,7 +74,7 @@ private: } // end anonymous namespace char SjLjEHPrepare::ID = 0; -INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions", +INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions", false, false) // Public Interface To the SjLjEHPrepare pass. @@ -92,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { doubleUnderDataTy, // __data VoidPtrTy, // __personality VoidPtrTy, // __lsda - doubleUnderJBufTy, // __jbuf - nullptr); + doubleUnderJBufTy // __jbuf + ); return true; } @@ -174,8 +175,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, // because the value needs to be added to the global context list. auto &DL = F.getParent()->getDataLayout(); unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); - FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", - &EntryBB->front()); + FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), + nullptr, Align, "fn_context", &EntryBB->front()); // Fill in the function context structure. for (LandingPadInst *LPI : LPads) { @@ -458,14 +459,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { return true; } +bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) { + // We have inserted dummy copies 'select true, arg, undef' in the entry block + // for arguments to simplify this pass. + // swifterror arguments cannot be used in this way. Undo the select for the + // swifterror argument. 
+ for (auto &AI : F.args()) { + if (AI.isSwiftError()) { + assert(AI.hasOneUse() && "Must have converted the argument to a select"); + auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser()); + assert(Select && "There must be a single select user"); + auto *OrigSwiftError = cast<Argument>(Select->getTrueValue()); + Select->replaceAllUsesWith(OrigSwiftError); + Select->eraseFromParent(); + return true; + } + } + return false; +} + bool SjLjEHPrepare::runOnFunction(Function &F) { Module &M = *F.getParent(); RegisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), nullptr); + PointerType::getUnqual(FunctionContextTy)); UnregisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), nullptr); + PointerType::getUnqual(FunctionContextTy)); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); @@ -476,5 +496,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) { FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); bool Res = setupEntryBlockAndCallSites(F); + if (Res) + Res |= undoSwiftErrorSelect(F); return Res; } diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp index dba103e9bfb1..3656832a7f1a 100644 --- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "slotindexes" char SlotIndexes::ID = 0; -INITIALIZE_PASS(SlotIndexes, "slotindexes", +INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE, "Slot index numbering", false, false) STATISTIC(NumLocalRenum, "Number of local renumberings"); @@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { return false; } +void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) { + assert(!MI.isBundledWithPred() && + "Use removeSingleMachineInstrFromMaps() instead"); + Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); + if (mi2iItr == mi2iMap.end()) + return; + + SlotIndex MIIndex = mi2iItr->second; + IndexListEntry &MIEntry = *MIIndex.listEntry(); + assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); + mi2iMap.erase(mi2iItr); + // FIXME: Eventually we want to actually delete these indexes. + MIEntry.setInstr(nullptr); +} + +void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) { + Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI); + if (mi2iItr == mi2iMap.end()) + return; + + SlotIndex MIIndex = mi2iItr->second; + IndexListEntry &MIEntry = *MIIndex.listEntry(); + assert(MIEntry.getInstr() == &MI && "Instruction indexes broken."); + mi2iMap.erase(mi2iItr); + + // When removing the first instruction of a bundle, update the mapping to + // the next instruction. + if (MI.isBundledWithSucc()) { + // Only the first instruction of a bundle should have an index assigned. + assert(!MI.isBundledWithPred() && "Should have first bundle instruction"); + + MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator()); + MachineInstr &NextMI = *Next; + MIEntry.setInstr(&NextMI); + mi2iMap.insert(std::make_pair(&NextMI, MIIndex)); + return; + } else { + // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr); + } +} + void SlotIndexes::renumberIndexes() { // Renumber updates the index of every element of the index list. DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n"); diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp index f10c98ef4e50..0abe1c47da55 100644 --- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp @@ -40,14 +40,14 @@ using namespace llvm; -#define DEBUG_TYPE "spillplacement" +#define DEBUG_TYPE "spill-code-placement" char SpillPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) char &llvm::SpillPlacementID = SpillPlacement::ID; @@ -310,7 +310,7 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) { bool SpillPlacement::scanActiveBundles() { RecentPositive.clear(); - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { + for (unsigned n : ActiveNodes->set_bits()) { update(n); // A node that must spill, or a node without any links is not going to // change its value ever again, so exclude it from iterations. @@ -365,7 +365,7 @@ SpillPlacement::finish() { // Write preferences back to ActiveNodes. bool Perfect = true; - for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) + for (unsigned n : ActiveNodes->set_bits()) if (!nodes[n].preferReg()) { ActiveNodes->reset(n); Perfect = false; diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp index 1c6a84e53944..3a50aaa69985 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.cpp +++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -487,12 +488,126 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { VFP = ValueForcePair(nullptr, true); } +SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + bool FirstCopy = !Def.isValid(); + MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) + .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) + | getInternalReadRegState(!FirstCopy), SubIdx) + .addReg(FromReg, 0, SubIdx); + + BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); + if (FirstCopy) { + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } else { + CopyMI->bundleWithPred(); + } + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx); + DestLI.refineSubRanges(Allocator, LaneMask, + [Def, &Allocator](LiveInterval::SubRange& SR) { + SR.createDeadDef(Def, Allocator); + }); + return Def; +} + +SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg, + LaneBitmask LaneMask, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, bool 
Late, unsigned RegIdx) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { + // The full vreg is copied. + MachineInstr *CopyMI = + BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg); + SlotIndexes &Indexes = *LIS.getSlotIndexes(); + return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); + } + + // Only a subset of lanes needs to be copied. The following is a simple + // heuristic to construct a sequence of COPYs. We could add a target + // specific callback if this turns out to be suboptimal. + LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx)); + + // First pass: Try to find a perfectly matching subregister index. If none + // exists find the one covering the most lanemask bits. + SmallVector<unsigned, 8> PossibleIndexes; + unsigned BestIdx = 0; + unsigned BestCover = 0; + const TargetRegisterClass *RC = MRI.getRegClass(FromReg); + assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class"); + for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI.getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LaneMask) { + BestIdx = Idx; + break; + } + + // The index must not cover any lanes outside \p LaneMask. + if ((SubRegMask & ~LaneMask).any()) + continue; + + unsigned PopCount = countPopulation(SubRegMask.getAsInteger()); + PossibleIndexes.push_back(Idx); + if (PopCount > BestCover) { + BestCover = PopCount; + BestIdx = Idx; + } + } + + // Abort if we cannot possibly implement the COPY with the given indexes. + if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, + BestIdx, DestLI, Late, SlotIndex()); + + // Greedy heuristic: Keep iterating keeping the best covering subreg index + // each time. + LaneBitmask LanesLeft = + LaneMask & ~(TRI.getSubRegIndexLaneMask(BestCover)); + while (LanesLeft.any()) { + unsigned BestIdx = 0; + int BestCover = INT_MIN; + for (unsigned Idx : PossibleIndexes) { + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LanesLeft) { + BestIdx = Idx; + break; + } + + // Try to cover as much of the remaining lanes as possible but + // as few of the already covered lanes as possible. 
+ int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger()) + - countPopulation((SubRegMask & ~LanesLeft).getAsInteger()); + if (Cover > BestCover) { + BestCover = Cover; + BestIdx = Idx; + } + } + + if (BestIdx == 0) + report_fatal_error("Impossible to implement partial COPY"); + + buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, + DestLI, Late, Def); + LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx); + } + + return Def; +} + VNInfo *SplitEditor::defFromParent(unsigned RegIdx, VNInfo *ParentVNI, SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineInstr *CopyMI = nullptr; SlotIndex Def; LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -505,24 +620,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); + unsigned Reg = LI->reg; bool DidRemat = false; if (OrigVNI) { LiveRangeEdit::Remat RM(ParentVNI); RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def); if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) { - Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late); + Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late); ++NumRemats; DidRemat = true; } } if (!DidRemat) { - // Can't remat, just insert a copy from parent. - CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg) - .addReg(Edit->getReg()); - Def = LIS.getSlotIndexes() - ->insertMachineInstrInMaps(*CopyMI, Late) - .getRegSlot(); + LaneBitmask LaneMask; + if (LI->hasSubRanges()) { + LaneMask = LaneBitmask::getNone(); + for (LiveInterval::SubRange &S : LI->subranges()) + LaneMask |= S.LaneMask; + } else { + LaneMask = LaneBitmask::getAll(); + } + ++NumCopies; + Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx); } // Define the value in Reg. diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h index a75738aaf446..9d409e924a3d 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.h +++ b/contrib/llvm/lib/CodeGen/SplitKit.h @@ -405,6 +405,17 @@ private: /// deleteRematVictims - Delete defs that are dead after rematerializing. void deleteRematVictims(); + /// Add a copy instruction copying \p FromReg to \p ToReg before + /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it + /// necessary to construct a sequence of copies to cover it exactly. + SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + bool Late, unsigned RegIdx); + + SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg, + MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy); + public: /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. /// Newly created intervals will be appended to newIntervals. 
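
The buildCopy heuristic above is a greedy cover of the requested lanes by sub-register copies: an index that exactly matches the still-missing lanes wins outright; otherwise the score rewards newly covered lanes and penalizes lanes that would be copied twice. The scoring works the same in isolation; here a plain uint32_t stands in for LaneBitmask and the helper is illustrative only:

#include <cassert>
#include <climits>
#include <cstdint>
#include <vector>

// Pick sub-register mask indexes until LaneMask is fully covered.
static std::vector<unsigned>
coverLanes(uint32_t LaneMask, const std::vector<uint32_t> &SubRegMasks) {
  std::vector<unsigned> Picked;
  uint32_t LanesLeft = LaneMask;
  while (LanesLeft != 0) {
    int BestIdx = -1;
    int BestCover = INT_MIN;
    for (unsigned I = 0, E = SubRegMasks.size(); I != E; ++I) {
      uint32_t M = SubRegMasks[I];
      if ((M & ~LaneMask) != 0)
        continue; // may not touch lanes outside the request
      if (M == LanesLeft) { // perfect match for the missing lanes
        BestIdx = (int)I;
        break;
      }
      int Cover = __builtin_popcount(M & LanesLeft) -
                  __builtin_popcount(M & ~LanesLeft);
      if (Cover > BestCover) {
        BestCover = Cover;
        BestIdx = (int)I;
      }
    }
    assert(BestIdx >= 0 && "impossible to implement partial copy");
    Picked.push_back((unsigned)BestIdx);
    LanesLeft &= ~SubRegMasks[BestIdx];
  }
  return Picked;
}

int main() {
  // Lanes 0-3 requested; candidate masks cover {0,1}, {2}, {3}, {2,3}.
  std::vector<unsigned> Picked = coverLanes(0xF, {0x3, 0x4, 0x8, 0xC});
  assert(Picked.size() == 2); // {0,1} first, then the perfect match {2,3}
  return 0;
}
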
diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp index 89c4b574f17f..acb3676fdd71 100644 --- a/contrib/llvm/lib/CodeGen/StackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp @@ -23,7 +23,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -54,7 +53,7 @@ using namespace llvm; -#define DEBUG_TYPE "stackcoloring" +#define DEBUG_TYPE "stack-coloring" static cl::opt<bool> DisableColoring("no-stack-coloring", @@ -372,12 +371,12 @@ private: char StackColoring::ID = 0; char &llvm::StackColoringID = StackColoring::ID; -INITIALIZE_PASS_BEGIN(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<SlotIndexes>(); @@ -385,14 +384,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -#ifndef NDEBUG - +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag, const BitVector &BV) const { - DEBUG(dbgs() << tag << " : { "); + dbgs() << tag << " : { "; for (unsigned I = 0, E = BV.size(); I != E; ++I) - DEBUG(dbgs() << BV.test(I) << " "); - DEBUG(dbgs() << "}\n"); + dbgs() << BV.test(I) << " "; + dbgs() << "}\n"; } LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { @@ -408,20 +406,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { LLVM_DUMP_METHOD void StackColoring::dump() const { for (MachineBasicBlock *MBB : depth_first(MF)) { - DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " [" - << MBB->getName() << "]\n"); - DEBUG(dumpBB(MBB)); + dbgs() << "Inspecting block #" << MBB->getNumber() << " [" + << MBB->getName() << "]\n"; + dumpBB(MBB); } } LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const { for (unsigned I = 0, E = Intervals.size(); I != E; ++I) { - DEBUG(dbgs() << "Interval[" << I << "]:\n"); - DEBUG(Intervals[I]->dump()); + dbgs() << "Interval[" << I << "]:\n"; + Intervals[I]->dump(); } } - -#endif // not NDEBUG +#endif static inline int getStartOrEndSlot(const MachineInstr &MI) { @@ -570,9 +567,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) // Step 2: compute begin/end sets for each block - // NOTE: We use a reverse-post-order iteration to ensure that we obtain a - // deterministic numbering, and because we'll need a post-order iteration - // later for solving the liveness dataflow problem. + // NOTE: We use a depth-first iteration to ensure that we obtain a + // deterministic numbering. for (MachineBasicBlock *MBB : depth_first(MF)) { // Assign a serial number to this basic block. @@ -707,12 +703,10 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) { // Create the interval of the blocks that we previously found to be 'alive'. 
BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB]; - for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1; - pos = MBBLiveness.LiveIn.find_next(pos)) { + for (unsigned pos : MBBLiveness.LiveIn.set_bits()) { Starts[pos] = Indexes->getMBBStartIdx(&MBB); } - for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1; - pos = MBBLiveness.LiveOut.find_next(pos)) { + for (unsigned pos : MBBLiveness.LiveOut.set_bits()) { Finishes[pos] = Indexes->getMBBEndIdx(&MBB); } diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp index 9b7dd400fc92..916b6f08c1b9 100644 --- a/contrib/llvm/lib/CodeGen/StackMaps.cpp +++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp @@ -1,4 +1,4 @@ -//===---------------------------- StackMaps.cpp ---------------------------===// +//===- StackMaps.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,31 +7,42 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackMaps.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> #include <iterator> +#include <utility> using namespace llvm; #define DEBUG_TYPE "stackmaps" static cl::opt<int> StackMapVersion( - "stackmap-version", cl::init(2), - cl::desc("Specify the stackmap encoding version (default = 2)")); + "stackmap-version", cl::init(3), + cl::desc("Specify the stackmap encoding version (default = 3)")); const char *StackMaps::WSMP = "Stack Maps: "; @@ -74,7 +85,7 @@ unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const { } StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) { - if (StackMapVersion != 2) + if (StackMapVersion != 3) llvm_unreachable("Unsupported stackmap version!"); } @@ -150,7 +161,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, if (SubRegIdx) Offset = TRI->getSubRegIdxOffset(SubRegIdx); - Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset); + Locs.emplace_back(Location::Register, TRI->getSpillSize(*RC), + DwarfRegNum, Offset); return ++MOI; } @@ -209,8 +221,9 @@ void StackMaps::print(raw_ostream &OS) { OS << "Constant Index " << Loc.Offset; break; } - OS << "\t[encoding: .byte " << Loc.Type << ", .byte " << Loc.Size - << ", .short " << Loc.Reg << ", .int " << Loc.Offset << "]\n"; + OS << "\t[encoding: .byte " << Loc.Type << ", .byte 0" + << ", .short " << Loc.Size << ", .short " << Loc.Reg << ", .short 0" + << ", .int " << Loc.Offset << "]\n"; Idx++; } @@ -234,7 +247,7 @@ void StackMaps::print(raw_ostream &OS) { StackMaps::LiveOutReg StackMaps::createLiveOutReg(unsigned Reg, const 
TargetRegisterInfo *TRI) const { unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg)); return LiveOutReg(Reg, DwarfRegNum, Size); } @@ -276,7 +289,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const { } LiveOuts.erase( - remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }), + llvm::remove_if(LiveOuts, + [](const LiveOutReg &LO) { return LO.Reg == 0; }), LiveOuts.end()); return LiveOuts; @@ -286,7 +300,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID, MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, bool recordResult) { - MCContext &OutContext = AP.OutStreamer->getContext(); MCSymbol *MILabel = OutContext.createTempSymbol(); AP.OutStreamer->EmitLabel(MILabel); @@ -378,6 +391,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) { } #endif } + void StackMaps::recordStatepoint(const MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint"); @@ -508,11 +522,16 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) { for (const auto &Loc : CSLocs) { OS.EmitIntValue(Loc.Type, 1); - OS.EmitIntValue(Loc.Size, 1); + OS.EmitIntValue(0, 1); // Reserved + OS.EmitIntValue(Loc.Size, 2); OS.EmitIntValue(Loc.Reg, 2); + OS.EmitIntValue(0, 2); // Reserved OS.EmitIntValue(Loc.Offset, 4); } + // Emit alignment to 8 byte. + OS.EmitValueToAlignment(8); + // Num live-out registers and padding to align to 4 byte. OS.EmitIntValue(0, 2); OS.EmitIntValue(LiveOuts.size(), 2); diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp index c2c010a29d44..ca8bde2d114a 100644 --- a/contrib/llvm/lib/CodeGen/StackProtector.cpp +++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp @@ -1,4 +1,4 @@ -//===-- StackProtector.cpp - Stack Protector Insertion --------------------===// +//===- StackProtector.cpp - Stack Protector Insertion ---------------------===// // // The LLVM Compiler Infrastructure // @@ -14,30 +14,38 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <cstdlib> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "stack-protector" @@ 
-50,12 +58,13 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp", cl::init(true), cl::Hidden); char StackProtector::ID = 0; -INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors", - false, true) +INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE, + "Insert stack protectors", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE, + "Insert stack protectors", false, true) -FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) { - return new StackProtector(TM); -} +FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); } StackProtector::SSPLayoutKind StackProtector::getSSPLayout(const AllocaInst *AI) const { @@ -89,6 +98,8 @@ bool StackProtector::runOnFunction(Function &Fn) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + Trip = TM->getTargetTriple(); TLI = TM->getSubtargetImpl(Fn)->getTargetLowering(); HasPrologue = false; HasIRCheck = false; @@ -222,7 +233,16 @@ bool StackProtector::RequiresStackProtector() { if (F->hasFnAttribute(Attribute::SafeStack)) return false; + // We are constructing the OptimizationRemarkEmitter on the fly rather than + // using the analysis pass to avoid building DominatorTree and LoopInfo which + // are not available this late in the IR pipeline. + OptimizationRemarkEmitter ORE(F); + if (F->hasFnAttribute(Attribute::StackProtectReq)) { + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a function attribute or command-line switch"); NeedsProtector = true; Strong = true; // Use the same heuristic as strong to determine SSPLayout } else if (F->hasFnAttribute(Attribute::StackProtectStrong)) @@ -236,20 +256,29 @@ bool StackProtector::RequiresStackProtector() { for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { if (AI->isArrayAllocation()) { + OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray", + &I); + Remark + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a call to alloca or use of a variable length array"; if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) { // A call to alloca with size >= SSPBufferSize requires // stack protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } else if (Strong) { // Require protectors for all alloca calls in strong mode. Layout.insert(std::make_pair(AI, SSPLK_SmallArray)); + ORE.emit(Remark); NeedsProtector = true; } } else { // A call to alloca with a variable size requires protectors. Layout.insert(std::make_pair(AI, SSPLK_LargeArray)); + ORE.emit(Remark); NeedsProtector = true; } continue; @@ -259,6 +288,11 @@ bool StackProtector::RequiresStackProtector() { if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) { Layout.insert(std::make_pair(AI, IsLarge ? 
SSPLK_LargeArray : SSPLK_SmallArray)); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to a stack allocated buffer or struct containing a " + "buffer"); NeedsProtector = true; continue; } @@ -266,6 +300,11 @@ bool StackProtector::RequiresStackProtector() { if (Strong && HasAddressTaken(AI)) { ++NumAddrTaken; Layout.insert(std::make_pair(AI, SSPLK_AddrOf)); + ORE.emit( + OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I) + << "Stack protection applied to function " + << ore::NV("Function", F) + << " due to the address of a local variable being taken"); NeedsProtector = true; } } @@ -448,13 +487,13 @@ BasicBlock *StackProtector::CreateFailBB() { Constant *StackChkFail = M->getOrInsertFunction("__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context), nullptr); + Type::getInt8PtrTy(Context)); B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { Constant *StackChkFail = - M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context), - nullptr); + M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); + B.CreateCall(StackChkFail, {}); } B.CreateUnreachable(); diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp index 234b2043a6a1..d1758ecbd79f 100644 --- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -32,7 +32,7 @@ #include <vector> using namespace llvm; -#define DEBUG_TYPE "stackslotcoloring" +#define DEBUG_TYPE "stack-slot-coloring" static cl::opt<bool> DisableSharing("no-stack-slot-sharing", @@ -116,12 +116,12 @@ namespace { char StackSlotColoring::ID = 0; char &llvm::StackSlotColoringID = StackSlotColoring::ID; -INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) namespace { diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp index e2377d89497d..ad0b04373656 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp @@ -40,8 +40,7 @@ char TailDuplicatePass::ID = 0; char &llvm::TailDuplicateID = TailDuplicatePass::ID; -INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false, - false) +INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false) bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) diff --git a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp index 7709236bbaa8..d40f7af431a9 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp @@ -725,6 +725,7 @@ bool TailDuplicator::duplicateSimpleBB( if (PredTBB == NextBB && PredFBB == nullptr) PredTBB = nullptr; + auto DL = PredBB->findBranchDebugLoc(); TII->removeBranch(*PredBB); if (!PredBB->isSuccessor(NewTarget)) @@ -735,7 +736,7 @@ bool TailDuplicator::duplicateSimpleBB( } if (PredTBB) - TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); + 
TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL); TDBBs.push_back(PredBB); } @@ -748,7 +749,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, if (PredBB->succ_size() > 1) return false; - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond)) return false; @@ -831,7 +832,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, appendCopies(PredBB, CopyInfos, Copies); // Simplify - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond); diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index f082add8c7dd..e5def6752e07 100644 --- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -73,7 +73,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, return; // Get the callee saved register list... - const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); // Early exit if there are no callee saved registers. if (!CSRegs || CSRegs[0] == 0) diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp index 01f91b96b58a..14c5adc0d898 100644 --- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -345,12 +345,12 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = RC->getSize(); + Size = TRI->getSpillSize(*RC); Offset = 0; return true; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); // Convert bit size to byte size to be consistent with // MCRegisterClass::getSize(). 
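
This hunk and the next swap TargetRegisterClass::getSize() for TargetRegisterInfo::getSpillSize(), but the surrounding offset arithmetic, including the big-endian flip visible just below, is unchanged. A small self-contained model of that computation:

#include <cassert>

// Locate a sub-register's bytes within its register's spill slot.
static bool subRegSlotRange(unsigned SpillSize, unsigned BitSize,
                            unsigned BitOffset, bool LittleEndian,
                            unsigned &Size, unsigned &Offset) {
  if (BitSize % 8 != 0 || BitOffset % 8 != 0)
    return false; // not byte-addressable
  Size = BitSize / 8;
  Offset = BitOffset / 8;
  assert(SpillSize >= Offset + Size && "bad subregister range");
  if (!LittleEndian)
    Offset = SpillSize - (Offset + Size); // high bytes are stored first
  return true;
}

int main() {
  unsigned Size, Offset;
  // The high 32-bit half of a 64-bit register in an 8-byte slot.
  assert(subRegSlotRange(8, 32, 32, /*LittleEndian=*/true, Size, Offset));
  assert(Size == 4 && Offset == 4);
  assert(subRegSlotRange(8, 32, 32, /*LittleEndian=*/false, Size, Offset));
  assert(Offset == 0); // big-endian: the high half sits at the slot's start
  return 0;
}
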
@@ -364,10 +364,10 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, Size = BitSize /= 8; Offset = (unsigned)BitOffset / 8; - assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); + assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = RC->getSize() - (Offset + Size); + Offset = TRI->getSpillSize(*RC) - (Offset + Size); } return true; } @@ -428,8 +428,8 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, return nullptr; } -void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - llvm_unreachable("Not a MachO target"); +void TargetInstrInfo::getNoop(MCInst &NopInst) const { + llvm_unreachable("Not implemented"); } static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, @@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, // No need to fold return, the meta data, and function arguments for (unsigned i = 0; i < StartIdx; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) { MachineOperand &MO = MI.getOperand(i); @@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, MIB.addImm(SpillOffset); } else - MIB.addOperand(MO); + MIB.add(MO); } return NewMI; } @@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const { unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); - if (MI.getOpcode() != FrameSetupOpcode && - MI.getOpcode() != FrameDestroyOpcode) + if (!isFrameInstr(MI)) return 0; - int SPAdj = MI.getOperand(0).getImm(); - SPAdj = TFI->alignSPAdjust(SPAdj); + int SPAdj = TFI->alignSPAdjust(getFrameSize(MI)); if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) || (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode)) diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp index 003311b157fc..5f63fd4320bb 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -53,6 +54,18 @@ static cl::opt<unsigned> MaximumJumpTableSize ("max-jump-table-size", cl::init(0), cl::Hidden, cl::desc("Set maximum size of jump tables; zero for no limit.")); +/// Minimum jump table density for normal functions. +static cl::opt<unsigned> + JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden, + cl::desc("Minimum density for building a jump table in " + "a normal function")); + +/// Minimum jump table density for -Os or -Oz functions. +static cl::opt<unsigned> OptsizeJumpTableDensity( + "optsize-jump-table-density", cl::init(40), cl::Hidden, + cl::desc("Minimum density for building a jump table in " + "an optsize function")); + // Although this default value is arbitrary, it is not random. It is assumed // that a condition that evaluates the same way by a higher percentage than this // is best represented as control flow. 
Therefore, the default value N should be @@ -838,7 +851,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { HasExtractBitsInsn = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; - MaskAndBranchFoldingIsLegal = false; EnableExtLdPromotion = false; HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; @@ -851,7 +863,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinFunctionAlignment = 0; PrefFunctionAlignment = 0; PrefLoopAlignment = 0; - GatherAllAliasesMaxDepth = 6; + GatherAllAliasesMaxDepth = 18; MinStackArgumentAlignment = 1; // TODO: the default will be switched to 0 in the next commit, along // with the Target-specific changes necessary. @@ -901,6 +913,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMAX, VT, Expand); setOperationAction(ISD::UMIN, VT, Expand); setOperationAction(ISD::UMAX, VT, Expand); + setOperationAction(ISD::ABS, VT, Expand); // Overflow operations default to expand setOperationAction(ISD::SADDO, VT, Expand); @@ -910,6 +923,10 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + // ADDCARRY operations default to expand + setOperationAction(ISD::ADDCARRY, VT, Expand); + setOperationAction(ISD::SUBCARRY, VT, Expand); + // These default to Expand so they will be expanded to CTLZ/CTTZ by default. setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); @@ -1184,12 +1201,11 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, /// isLegalRC - Return true if the value types that can be represented by the /// specified register class are all legal. -bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const { - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { +bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) const { + for (auto I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I) if (isTypeLegal(*I)) return true; - } return false; } @@ -1227,7 +1243,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // Copy operands before the frame-index. for (unsigned i = 0; i < OperIdx; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Add frame index operands recognized by stackmaps.cpp if (MFI.isStatepointSpillSlotObjectIndex(FI)) { // indirect-mem-ref tag, size, #FI, offset. @@ -1237,18 +1253,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity"); MIB.addImm(StackMaps::IndirectMemRefOp); MIB.addImm(MFI.getObjectSize(FI)); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } else { // direct-mem-ref tag, #FI, offset. // Used by patchpoint, and direct alloca arguments to statepoints MIB.addImm(StackMaps::DirectMemRefOp); - MIB.addOperand(MI->getOperand(OperIdx)); + MIB.add(MI->getOperand(OperIdx)); MIB.addImm(0); } // Copy the operands after the frame index. for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); // Inherit previous memory operands. 
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -1296,12 +1312,12 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI, // Find the first legal register class with the largest spill size. const TargetRegisterClass *BestRC = RC; - for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + for (unsigned i : SuperRegRC.set_bits()) { const TargetRegisterClass *SuperRC = TRI->getRegClass(i); // We want the largest possible spill size. - if (SuperRC->getSize() <= BestRC->getSize()) + if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC)) continue; - if (!isLegalRC(SuperRC)) + if (!isLegalRC(*TRI, *SuperRC)) continue; BestRC = SuperRC; } @@ -1589,7 +1605,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT /// type of the given function. This does not require a DAG or a return value, /// and is suitable for use before any DAGs for the function are constructed. /// TODO: Move this out of TargetLowering.cpp. -void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, +void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr, SmallVectorImpl<ISD::OutputArg> &Outs, const TargetLowering &TLI, const DataLayout &DL) { SmallVector<EVT, 4> ValueVTs; @@ -1601,9 +1617,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, EVT VT = ValueVTs[j]; ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; // FIXME: C calling convention requires the return type to be promoted to @@ -1621,13 +1637,13 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr, // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg)) Flags.setInReg(); // Propagate extension type if any - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) Flags.setSExt(); - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) @@ -1818,7 +1834,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Type *StackPtrTy = Type::getInt8PtrTy(M->getContext()); Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", - StackPtrTy->getPointerTo(0), nullptr); + StackPtrTy->getPointerTo(0)); return IRB.CreateCall(Fn); } @@ -1902,6 +1918,10 @@ void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) { MinimumJumpTableEntries = Val; } +unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const { + return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity; +} + unsigned TargetLoweringBase::getMaximumJumpTableSize() const { return MaximumJumpTableSize; } @@ -1918,11 +1938,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) { /// override the target defaults. 
static StringRef getRecipEstimateForFunc(MachineFunction &MF) { const Function *F = MF.getFunction(); - StringRef RecipAttrName = "reciprocal-estimates"; - if (!F->hasFnAttribute(RecipAttrName)) - return StringRef(); - - return F->getFnAttribute(RecipAttrName).getValueAsString(); + return F->getFnAttribute("reciprocal-estimates").getValueAsString(); } /// Construct a string for the given reciprocal operation of the given type. @@ -2097,3 +2113,7 @@ int TargetLoweringBase::getDivRefinementSteps(EVT VT, MachineFunction &MF) const { return getOpRefinementSteps(false, VT, getRecipEstimateForFunc(MF)); } + +void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const { + MF.getRegInfo().freezeReservedRegs(MF); +} diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index eb2a28f574a5..1d232c71d824 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===// +//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===// // // The LLVM Compiler Infrastructure // @@ -12,36 +12,52 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/MC/SectionKind.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <string> + using namespace llvm; using namespace dwarf; @@ -53,10 +69,10 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( const GlobalValue *GV, const TargetMachine &TM, MachineModuleInfo *MMI) const { unsigned Encoding = getPersonalityEncoding(); - if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect) + if ((Encoding & 0x80) == DW_EH_PE_indirect) return getContext().getOrCreateSymbol(StringRef("DW.ref.") + TM.getSymbol(GV)->getName()); - if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr) + if ((Encoding & 0x70) == DW_EH_PE_absptr) return 
TM.getSymbol(GV); report_fatal_error("We do not support this DWARF encoding yet!"); } @@ -86,8 +102,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue( const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { - - if (Encoding & dwarf::DW_EH_PE_indirect) { + if (Encoding & DW_EH_PE_indirect) { MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>(); MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM); @@ -102,7 +117,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -117,8 +132,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // section(".eh_frame") gcc will produce: // // .section .eh_frame,"a",@progbits - - if (Name == getInstrProfCoverageSectionName(false)) + + if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF, + /*AddSegmentInfo=*/false)) return SectionKind::getMetadata(); if (Name.empty() || Name[0] != '.') return K; @@ -149,7 +165,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { return K; } - static unsigned getELFSectionType(StringRef Name, SectionKind K) { // Use SHT_NOTE for section whose name starts with ".note" to allow // emitting ELF notes from C variable declaration. @@ -211,6 +226,24 @@ static const Comdat *getELFComdat(const GlobalValue *GV) { return C; } +static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, + const TargetMachine &TM) { + MDNode *MD = GO->getMetadata(LLVMContext::MD_associated); + if (!MD) + return nullptr; + + const MDOperand &Op = MD->getOperand(0); + if (!Op.get()) + return nullptr; + + auto *VM = dyn_cast<ValueAsMetadata>(Op); + if (!VM) + report_fatal_error("MD_associated operand is not ValueAsMetadata"); + + GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue()); + return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr; +} + MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); @@ -224,9 +257,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( Group = C->getName(); Flags |= ELF::SHF_GROUP; } - return getContext().getELFSection(SectionName, - getELFSectionType(SectionName, Kind), Flags, - /*EntrySize=*/0, Group); + + // A section can have at most one associated section. Put each global with + // MD_associated in a unique section. + unsigned UniqueID = MCContext::GenericSectionID; + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + UniqueID = NextUniqueID++; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = getContext().getELFSection( + SectionName, getELFSectionType(SectionName, Kind), Flags, + /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); + // Make sure that we did not get some other section with incompatible sh_link. + // This should not be possible due to UniqueID code above. 
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } /// Return the section prefix name used by options FunctionsSections and @@ -248,11 +295,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } -static MCSectionELF * -selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, - SectionKind Kind, Mangler &Mang, - const TargetMachine &TM, bool EmitUniqueSection, - unsigned Flags, unsigned *NextUniqueID) { +static MCSectionELF *selectELFSectionForGlobal( + MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, + unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) { unsigned EntrySize = 0; if (Kind.isMergeableCString()) { if (Kind.isMergeable2ByteCString()) { @@ -319,7 +365,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, if (Kind.isExecuteOnly()) UniqueID = 0; return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags, - EntrySize, Group, UniqueID); + EntrySize, Group, UniqueID, AssociatedSymbol); } MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( @@ -337,8 +383,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal( } EmitUniqueSection |= GO->hasComdat(); - return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, - EmitUniqueSection, Flags, &NextUniqueID); + const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM); + if (AssociatedSymbol) { + EmitUniqueSection = true; + Flags |= ELF::SHF_LINK_ORDER; + } + + MCSectionELF *Section = selectELFSectionForGlobal( + getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags, + &NextUniqueID, AssociatedSymbol); + assert(Section->getAssociatedSymbol() == AssociatedSymbol); + return Section; } MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( @@ -351,8 +406,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable( return ReadOnlySection; return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(), - getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC, - &NextUniqueID); + getMangler(), TM, EmitUniqueSection, + ELF::SHF_ALLOC, &NextUniqueID, + /* AssociatedSymbol */ nullptr); } bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( @@ -723,7 +779,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference( return TargetLoweringObjectFile:: getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()), - Encoding & ~dwarf::DW_EH_PE_indirect, Streamer); + Encoding & ~DW_EH_PE_indirect, Streamer); } return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM, @@ -1122,33 +1178,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal( raw_ostream &OS, const GlobalValue *GV) const { - if (!GV->hasDLLExportStorageClass() || GV->isDeclaration()) - return; + emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler()); +} - const Triple &TT = getTargetTriple(); +//===----------------------------------------------------------------------===// +// Wasm +//===----------------------------------------------------------------------===// - if (TT.isKnownWindowsMSVCEnvironment()) - OS << " /EXPORT:"; - else - OS << " -export:"; - - if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) { - std::string Flag; - raw_string_ostream FlagOS(Flag); - getMangler().getNameWithPrefix(FlagOS, GV, false); - FlagOS.flush(); - if (Flag[0] == 
GV->getParent()->getDataLayout().getGlobalPrefix()) - OS << Flag.substr(1); - else - OS << Flag; - } else { - getMangler().getNameWithPrefix(OS, GV, false); +static const Comdat *getWasmComdat(const GlobalValue *GV) { + const Comdat *C = GV->getComdat(); + if (!C) + return nullptr; + + if (C->getSelectionKind() != Comdat::Any) + report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" + + C->getName() + "' cannot be lowered."); + + return C; +} + +MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + llvm_unreachable("getExplicitSectionGlobal not yet implemented"); + return nullptr; +} + +static MCSectionWasm * +selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM, bool EmitUniqueSection, + unsigned Flags, unsigned *NextUniqueID) { + StringRef Group = ""; + if (getWasmComdat(GO)) + llvm_unreachable("comdat not yet supported for wasm"); + + bool UniqueSectionNames = TM.getUniqueSectionNames(); + SmallString<128> Name = getSectionPrefixForGlobal(Kind); + + if (const auto *F = dyn_cast<Function>(GO)) { + const auto &OptionalPrefix = F->getSectionPrefix(); + if (OptionalPrefix) + Name += *OptionalPrefix; } - if (!GV->getValueType()->isFunctionTy()) { - if (TT.isKnownWindowsMSVCEnvironment()) - OS << ",DATA"; - else - OS << ",data"; + if (EmitUniqueSection && UniqueSectionNames) { + Name.push_back('.'); + TM.getNameWithPrefix(Name, GO, Mang, true); + } + unsigned UniqueID = MCContext::GenericSectionID; + if (EmitUniqueSection && !UniqueSectionNames) { + UniqueID = *NextUniqueID; + (*NextUniqueID)++; } + return Ctx.getWasmSection(Name, /*Type=*/0, Flags, + Group, UniqueID); +} + +MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + + if (Kind.isCommon()) + report_fatal_error("mergeable sections not supported yet on wasm"); + + // If we have -ffunction-sections or -fdata-sections then we should emit the + // global value to a uniqued section specifically for it. + bool EmitUniqueSection = false; + if (Kind.isText()) + EmitUniqueSection = TM.getFunctionSections(); + else + EmitUniqueSection = TM.getDataSections(); + EmitUniqueSection |= GO->hasComdat(); + + return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM, + EmitUniqueSection, /*Flags=*/0, + &NextUniqueID); +} + +bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + // We can always create relative relocations, so use another section + // that can be marked non-executable. + return false; +} + +const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( + const GlobalValue *LHS, const GlobalValue *RHS, + const TargetMachine &TM) const { + // We may only use a PLT-relative relocation to refer to unnamed_addr + // functions. + if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) + return nullptr; + + // Basic sanity checks. 
+ if (LHS->getType()->getPointerAddressSpace() != 0 || + RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || + RHS->isThreadLocal()) + return nullptr; + + return MCBinaryExpr::createSub( + MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None, + getContext()), + MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext()); +} + +void +TargetLoweringObjectFileWasm::InitializeWasm() { + // TODO: Initialize StaticCtorSection and StaticDtorSection. } diff --git a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp index b6da8e0aa60d..c20d5ab814f8 100644 --- a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { return false; } -/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option -/// is specified on the command line. When this flag is off(default), the -/// code generator is not allowed to generate mad (multiply add) if the -/// result is "less precise" than doing those operations individually. -bool TargetOptions::LessPreciseFPMAD() const { - return UnsafeFPMath || LessPreciseFPMADOption; -} - /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume /// that the rounding mode of the FPU can change from its default. bool TargetOptions::HonorSignDependentRoundingFPMath() const { diff --git a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp index e7ea2b4563f9..83348058eca9 100644 --- a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -92,6 +92,9 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), cl::init(false), cl::ZeroOrMore); +static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner", + cl::Hidden, + cl::desc("Enable machine outliner")); static cl::opt<std::string> PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, @@ -261,7 +264,8 @@ TargetPassConfig::~TargetPassConfig() { TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true) { + DisableVerify(false), EnableTailMerge(true), + RequireCodeGenSCCOrder(false) { Impl = new PassConfigImpl(); @@ -279,6 +283,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) if (StringRef(PrintMachineInstrs.getValue()).equals("")) TM->Options.PrintMachineCode = true; + + if (TM->Options.EnableIPRA) + setRequiresCodeGenSCCOrder(); } CodeGenOpt::Level TargetPassConfig::getOptLevel() const { @@ -308,7 +315,9 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { TargetPassConfig::TargetPassConfig() : ImmutablePass(ID), PM(nullptr) { - llvm_unreachable("TargetPassConfig should not be constructed on-the-fly"); + report_fatal_error("Trying to construct TargetPassConfig without a target " + "machine. Scheduling a CodeGen pass without a target " + "triple set?"); } // Helper to verify the analysis is really immutable. @@ -480,6 +489,14 @@ void TargetPassConfig::addIRPasses() { // Insert calls to mcount-like functions. addPass(createCountingFunctionInserterPass()); + + // Scalarize masked memory intrinsics that the target does not support; 
+ // each unsupported intrinsic is replaced with a chain of basic blocks that + // stores/loads one element at a time, guarded by the corresponding mask bit. + addPass(createScalarizeMaskedMemIntrinPass()); + + // Expand reduction intrinsics into shuffle sequences if the target wants to. + addPass(createExpandReductionsPass()); } /// Turn exception handling constructs into something the code generators can @@ -499,14 +516,14 @@ void TargetPassConfig::addPassesToHandleExceptions() { LLVM_FALLTHROUGH; case ExceptionHandling::DwarfCFI: case ExceptionHandling::ARM: - addPass(createDwarfEHPass(TM)); + addPass(createDwarfEHPass()); break; case ExceptionHandling::WinEH: // We support using both GCC-style and MSVC-style exceptions on Windows, so // add both preparation passes. Each pass will only actually run if it // recognizes the personality function. - addPass(createWinEHPass(TM)); - addPass(createDwarfEHPass(TM)); + addPass(createWinEHPass()); + addPass(createDwarfEHPass()); break; case ExceptionHandling::None: addPass(createLowerInvokePass()); @@ -521,7 +538,7 @@ void TargetPassConfig::addPassesToHandleExceptions() { /// before exception handling preparation passes. void TargetPassConfig::addCodeGenPrepare() { if (getOptLevel() != CodeGenOpt::None && !DisableCGP) - addPass(createCodeGenPreparePass(TM)); + addPass(createCodeGenPreparePass()); addPass(createRewriteSymbolsPass()); } @@ -531,13 +548,13 @@ void TargetPassConfig::addISelPrepare() { addPreISel(); // Force codegen to run according to the callgraph. - if (TM->Options.EnableIPRA) + if (requiresCodeGenSCCOrder()) addPass(new DummyCGSCCPass); // Add both the safe stack and the stack protection passes: each of them will // only protect functions that have corresponding attributes. - addPass(createSafeStackPass(TM)); - addPass(createStackProtectorPass(TM)); + addPass(createSafeStackPass()); + addPass(createStackProtectorPass()); if (PrintISelInput) addPass(createPrintFunctionPass( @@ -549,6 +566,14 @@ void TargetPassConfig::addISelPrepare() { addPass(createVerifierPass()); } +/// -regalloc=... command line option. +static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } +static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, + RegisterPassParser<RegisterRegAlloc> > +RegAlloc("regalloc", + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use")); + /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. Stages @@ -607,8 +632,12 @@ void TargetPassConfig::addMachinePasses() { // including phi elimination and scheduling. if (getOptimizeRegAlloc()) addOptimizedRegAlloc(createRegAllocPass(true)); - else + else { + if (RegAlloc != &useDefaultRegisterAllocator && + RegAlloc != &createFastRegisterAllocator) + report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addFastRegAlloc(createRegAllocPass(false)); + } // Run post-ra passes. addPostRegAlloc(); @@ -620,7 +649,7 @@ void TargetPassConfig::addMachinePasses() { // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only // do so if it hasn't been disabled, substituted, or overridden. if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID)) - addPass(createPrologEpilogInserterPass(TM)); + addPass(createPrologEpilogInserterPass()); /// Add passes that optimize machine instructions after register allocation. 
if (getOptLevel() != CodeGenOpt::None) @@ -668,9 +697,15 @@ void TargetPassConfig::addMachinePasses() { addPass(&StackMapLivenessID, false); addPass(&LiveDebugValuesID, false); + // Insert before XRay Instrumentation. + addPass(&FEntryInserterID, false); + addPass(&XRayInstrumentationID, false); addPass(&PatchableFunctionID, false); + if (EnableMachineOutliner) + PM->add(createMachineOutlinerPass()); + AddingMachinePasses = false; } @@ -704,6 +739,10 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&MachineLICMID, false); addPass(&MachineCSEID, false); + + // Coalesce basic blocks with the same branch condition. + addPass(&BranchCoalescingID); + addPass(&MachineSinkingID); addPass(&PeepholeOptimizerID); @@ -730,20 +769,13 @@ MachinePassRegistry RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. -LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag); -static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } +static llvm::once_flag InitializeDefaultRegisterAllocatorFlag; + static RegisterRegAlloc defaultRegAlloc("default", "pick register allocator based on -O option", useDefaultRegisterAllocator); -/// -regalloc=... command line option. -static cl::opt<RegisterRegAlloc::FunctionPassCtor, false, - RegisterPassParser<RegisterRegAlloc> > -RegAlloc("regalloc", - cl::init(&useDefaultRegisterAllocator), - cl::desc("Register allocator to use")); - static void initializeDefaultRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault(); @@ -753,7 +785,6 @@ static void initializeDefaultRegisterAllocatorOnce() { } } - /// Instantiate the default register allocator pass for this target for either /// the optimized or unoptimized allocation path. This will be added to the pass /// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc @@ -903,6 +934,11 @@ void TargetPassConfig::addBlockPlacement() { //===---------------------------------------------------------------------===// /// GlobalISel Configuration //===---------------------------------------------------------------------===// + +bool TargetPassConfig::isGlobalISelEnabled() const { + return false; +} + bool TargetPassConfig::isGlobalISelAbortEnabled() const { return EnableGlobalISelAbort == 1; } diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp index cd50c5b6571d..41ec082a24cf 100644 --- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -50,8 +50,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, ArrayRef<MCPhysReg> Exceptions) const { // Check that all super registers of reserved regs are reserved as well. BitVector Checked(getNumRegs()); - for (int Reg = RegisterSet.find_first(); Reg>=0; - Reg = RegisterSet.find_next(Reg)) { + for (unsigned Reg : RegisterSet.set_bits()) { if (Checked[Reg]) continue; for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) { @@ -155,10 +154,9 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { // Pick the most specific sub register class of the right type that contains // this physreg. 
const TargetRegisterClass* BestRC = nullptr; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){ - const TargetRegisterClass* RC = *I; - if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && - (!BestRC || BestRC->hasSubClass(RC)) + for (const TargetRegisterClass* RC : regclasses()) { + if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) && + RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } @@ -185,10 +183,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, if (SubClass) getAllocatableSetForRC(MF, SubClass, Allocatable); } else { - for (TargetRegisterInfo::regclass_iterator I = regclass_begin(), - E = regclass_end(); I != E; ++I) - if ((*I)->isAllocatable()) - getAllocatableSetForRC(MF, *I, Allocatable); + for (const TargetRegisterClass *C : regclasses()) + if (C->isAllocatable()) + getAllocatableSetForRC(MF, C, Allocatable); } // Mask out the reserved registers @@ -209,7 +206,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A, if (unsigned Common = *A++ & *B++) { const TargetRegisterClass *RC = TRI->getRegClass(I + countTrailingZeros(Common)); - if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT)) return RC; } return nullptr; } @@ -267,7 +264,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, const TargetRegisterClass *BestRC = nullptr; unsigned *BestPreA = &PreA; unsigned *BestPreB = &PreB; - if (RCA->getSize() < RCB->getSize()) { + if (getRegSizeInBits(*RCA) < getRegSizeInBits(*RCB)) { std::swap(RCA, RCB); std::swap(SubA, SubB); std::swap(BestPreA, BestPreB); @@ -275,7 +272,7 @@ // Also terminate the search once we have found a register class as small as // RCA. - unsigned MinSize = RCA->getSize(); + unsigned MinSize = getRegSizeInBits(*RCA); for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) { unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA); @@ -283,7 +280,7 @@ // Check if a common super-register class exists for this index pair. const TargetRegisterClass *RC = firstCommonClass(IA.getMask(), IB.getMask(), this); - if (!RC || RC->getSize() < MinSize) + if (!RC || getRegSizeInBits(*RC) < MinSize) continue; // The indexes must compose identically: PreA+SubA == PreB+SubB. @@ -292,7 +289,7 @@ continue; // Is RC a better candidate than BestRC? - if (BestRC && RC->getSize() >= BestRC->getSize()) + if (BestRC && getRegSizeInBits(*RC) >= getRegSizeInBits(*BestRC)) continue; // Yes, RC is the smallest super-register seen so far. @@ -301,7 +298,7 @@ *BestPreA = IA.getSubReg(); *BestPreB = IB.getSubReg(); // Bail early if we reached MinSize. We won't find a better candidate. 
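// (Unit note: the old TargetRegisterClass::getSize() returned bytes while getRegSizeInBits() returns bits; every comparison in this function migrates both sides together, so the ordering and the chosen BestRC are unchanged.)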
- if (BestRC->getSize() == MinSize) + if (getRegSizeInBits(*BestRC) == MinSize) return BestRC; } } @@ -415,9 +412,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void -TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, - const TargetRegisterInfo *TRI) { +LLVM_DUMP_METHOD +void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, + const TargetRegisterInfo *TRI) { dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n"; } #endif diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp index 83e52d335354..0df34ce43112 100644 --- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp @@ -1,4 +1,4 @@ -//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===// +//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===// // // The LLVM Compiler Infrastructure // @@ -12,12 +12,22 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> using namespace llvm; @@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const { static unsigned gcd(unsigned Dividend, unsigned Divisor) { // Dividend and Divisor will be naturally swapped as needed. - while(Divisor) { + while (Divisor) { unsigned Rem = Dividend % Divisor; Dividend = Divisor; Divisor = Rem; } return Dividend; } + static unsigned lcm(unsigned A, unsigned B) { unsigned LCM = (uint64_t(A) * B) / gcd(A, B); assert((LCM >= A && LCM >= B) && "LCM overflow"); @@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm, } } +/// Returns true only if the instruction is specified as single issue. +bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->BeginGroup; + } + return false; +} + +bool TargetSchedModel::mustEndGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->EndGroup; + } + return false; +} + unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC) const { if (hasInstrItineraries()) { @@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) { /// evaluation of predicates that depend on instruction operands or flags. const MCSchedClassDesc *TargetSchedModel:: resolveSchedClass(const MachineInstr *MI) const { - // Get the definition's scheduling class descriptor from this machine model. 
unsigned SchedClass = MI->getDesc().getSchedClass(); const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); @@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { if (SCDesc->isValid() && !SCDesc->isVariant()) return computeInstrLatency(*SCDesc); - llvm_unreachable("No MI sched latency"); + if (SCDesc->isValid()) { + assert(!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()"); + return computeInstrLatency(*SCDesc); + } + return 0; } unsigned @@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, } return 0; } + +static Optional<double> +getRThroughputFromItineraries(unsigned schedClass, + const InstrItineraryData *IID) { + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const InstrStage *IS = IID->beginStage(schedClass), + *E = IID->endStage(schedClass); + IS != E; ++IS) { + unsigned Cycles = IS->getCycles(); + if (!Cycles) + continue; + Throughput = + std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles); + } + // We want the reciprocal throughput, hence the inversion. + return 1 / Throughput; +} + +static Optional<double> +getRThroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc, + const TargetSubtargetInfo *STI, + const MCSchedModel &SchedModel) { + double Unknown = std::numeric_limits<double>::infinity(); + double Throughput = Unknown; + + for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc), + *WEnd = STI->getWriteProcResEnd(SCDesc); + WPR != WEnd; ++WPR) { + unsigned Cycles = WPR->Cycles; + if (!Cycles) + return Optional<double>(); + + unsigned NumUnits = + SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits; + Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles); + } + // We want the reciprocal throughput, hence the inversion. 
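+ // Worked example with hypothetical numbers: a single write entry with Cycles = 4 on a resource with NumUnits = 2 gives Throughput = min(inf, 2.0 / 4) = 0.5 instructions per cycle, so the value returned below is 1 / 0.5 = 2.0 cycles per instruction.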
+ return 1 / Throughput; +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const { + if (hasInstrItineraries()) + return getRThroughputFromItineraries(MI->getDesc().getSchedClass(), + getInstrItineraries()); + if (hasInstrSchedModel()) + return getRThroughputFromInstrSchedModel(resolveSchedClass(MI), STI, + SchedModel); + return Optional<double>(); +} + +Optional<double> +TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const { + unsigned SchedClass = TII->get(Opcode).getSchedClass(); + if (hasInstrItineraries()) + return getRThroughputFromItineraries(SchedClass, getInstrItineraries()); + if (hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); + if (SCDesc->isValid() && !SCDesc->isVariant()) + return getRThroughputFromInstrSchedModel(SCDesc, STI, SchedModel); + } + return Optional<double>(); +} diff --git a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp index c74707d95b9e..0a444e0fff07 100644 --- a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -11,6 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -52,3 +55,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const { bool TargetSubtargetInfo::useAA() const { return false; } + +static std::string createSchedInfoStr(unsigned Latency, + Optional<double> RThroughput) { + static const char *SchedPrefix = " sched: ["; + std::string Comment; + raw_string_ostream CS(Comment); + if (Latency > 0 && RThroughput.hasValue()) + CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue()) + << "]"; + else if (Latency > 0) + CS << SchedPrefix << Latency << ":?]"; + else if (RThroughput.hasValue()) + CS << SchedPrefix << "?:" << RThroughput.getValue() << "]"; + CS.flush(); + return Comment; +} + +/// Returns a string representation of the scheduler comment. +std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { + if (MI.isPseudo() || MI.isTerminator()) + return std::string(); + // We don't cache the TSchedModel because it depends on TargetInstrInfo, + // which could change during compilation. + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI); + return createSchedInfoStr(Latency, RThroughput); +} + +/// Returns a string representation of the scheduler comment. +std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { + // We don't cache the TSchedModel because it depends on TargetInstrInfo, + // which could change during compilation. + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + if (!TSchedModel.hasInstrSchedModel()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); + Optional<double> RThroughput = + TSchedModel.computeInstrRThroughput(MCI.getOpcode()); + return createSchedInfoStr(Latency, RThroughput); +} diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 0f1b2ed994b7..552a89f76ca2 100644 --- 
a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; -#define DEBUG_TYPE "twoaddrinstr" +#define DEBUG_TYPE "twoaddressinstruction" STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions"); STATISTIC(NumCommuted, "Number of instructions commuted to coalesce"); @@ -155,7 +155,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); AU.addUsedIfAvailable<LiveVariables>(); AU.addPreserved<LiveVariables>(); AU.addPreserved<SlotIndexes>(); @@ -171,10 +171,10 @@ public: } // end anonymous namespace char TwoAddressInstructionPass::ID = 0; -INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; @@ -905,7 +905,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, ++End; } - // Check if the reschedule will not break depedencies. + // Check that the reschedule will not break dependencies. unsigned NumVisited = 0; MachineBasicBlock::iterator KillPos = KillMI; ++KillPos; @@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); LIS = getAnalysisIfAvailable<LiveIntervals>(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAPass->getAAResults(); + else + AA = nullptr; OptLevel = TM.getOptLevel(); bool MadeChange = false; @@ -1785,7 +1788,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY)) .addReg(DstReg, RegState::Define, SubIdx) - .addOperand(UseMO); + .add(UseMO); // The first def needs an <undef> flag because there is no live register // before it. 
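For context, the AAResultsWrapperPass hunk above is a reusable pattern: a pass that can exploit alias analysis but does not strictly need it declares the analysis as used-if-available and tolerates its absence at runtime. A minimal sketch of the same pattern for a hypothetical machine pass (illustrative names; the usual MachineFunctionPass boilerplate and includes are omitted):

void MyMachinePass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  // Consume alias analysis when the pipeline already provides it, without
  // forcing it to be scheduled solely for this pass.
  AU.addUsedIfAvailable<AAResultsWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MyMachinePass::runOnMachineFunction(MachineFunction &MF) {
  AAResults *AA = nullptr;
  if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAPass->getAAResults();
  // A null AA means "no aliasing information": callers must stay
  // conservative, exactly as TwoAddressInstructionPass now does.
  return false;
}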
diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp index c2db56a7657c..407fd9b162e9 100644 --- a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -195,18 +196,31 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { } if (phi->getNumOperands() == 3) { - unsigned Input = phi->getOperand(1).getReg(); - unsigned Output = phi->getOperand(0).getReg(); - - phi++->eraseFromParent(); + const MachineOperand &Input = phi->getOperand(1); + const MachineOperand &Output = phi->getOperand(0); + unsigned InputReg = Input.getReg(); + unsigned OutputReg = Output.getReg(); + assert(Output.getSubReg() == 0 && "Cannot have output subregister"); ModifiedPHI = true; - if (Input != Output) { + if (InputReg != OutputReg) { MachineRegisterInfo &MRI = F.getRegInfo(); - MRI.constrainRegClass(Input, MRI.getRegClass(Output)); - MRI.replaceRegWith(Output, Input); + unsigned InputSub = Input.getSubReg(); + if (InputSub == 0 && + MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) { + MRI.replaceRegWith(OutputReg, InputReg); + } else { + // The input register to the PHI has a subregister or it cannot be + // constrained to the proper register class: insert a COPY instead + // of simply replacing the output with the input. + const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); + BuildMI(*BB, BB->getFirstNonPHI(), phi->getDebugLoc(), + TII->get(TargetOpcode::COPY), OutputReg) + .addReg(InputReg, getRegState(Input), InputSub); + } + phi++->eraseFromParent(); } - continue; } diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp index 0d506d646659..d10ca1a7ff91 100644 --- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp @@ -73,8 +73,9 @@ void VirtRegMap::grow() { } unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { - int SS = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align); ++NumSpillSlots; return SS; } @@ -167,6 +168,7 @@ class VirtRegRewriter : public MachineFunctionPass { bool readsUndefSubreg(const MachineOperand &MO) const; void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; void handleIdentityCopy(MachineInstr &MI) const; + void expandCopyBundle(MachineInstr &MI) const; public: static char ID; @@ -367,11 +369,41 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const { } if (Indexes) - Indexes->removeMachineInstrFromMaps(MI); - MI.eraseFromParent(); + Indexes->removeSingleMachineInstrFromMaps(MI); + MI.eraseFromBundle(); DEBUG(dbgs() << " deleted.\n"); } +/// The live range splitting logic sometimes produces bundles of copies when +/// subregisters are involved. Expand these into a sequence of copy instructions +/// after processing the last one in the bundle. This does not update +/// LiveIntervals, which we shouldn't need for these instructions anymore. 
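+/// (Concretely: only the final copy of a bundle, i.e. the one bundled with a predecessor but not a successor, triggers the expansion; each bundled COPY is then detached via unbundleFromPred() and re-inserted into the SlotIndexes maps so it gets a slot index of its own.)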
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const { + if (!MI.isCopy()) + return; + + if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) { + // Only do this when the complete bundle is made out of COPYs. + MachineBasicBlock &MBB = *MI.getParent(); + for (MachineBasicBlock::reverse_instr_iterator I = + std::next(MI.getReverseIterator()), E = MBB.instr_rend(); + I != E && I->isBundledWithSucc(); ++I) { + if (!I->isCopy()) + return; + } + + for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator(); + I->isBundledWithPred(); ) { + MachineInstr &MI = *I; + ++I; + + MI.unbundleFromPred(); + if (Indexes) + Indexes->insertMachineInstrInMaps(MI); + } + } +} + void VirtRegRewriter::rewrite() { bool NoSubRegLiveness = !MRI->subRegLivenessEnabled(); SmallVector<unsigned, 8> SuperDeads; @@ -431,12 +463,14 @@ void VirtRegRewriter::rewrite() { } } - // The <def,undef> flag only makes sense for sub-register defs, and - // we are substituting a full physreg. An <imp-use,kill> operand - // from the SuperKills list will represent the partial read of the - // super-register. - if (MO.isDef()) + // The <def,undef> and <def,internal> flags only make sense for + // sub-register defs, and we are substituting a full physreg. An + // <imp-use,kill> operand from the SuperKills list will represent the + // partial read of the super-register. + if (MO.isDef()) { MO.setIsUndef(false); + MO.setIsInternalRead(false); + } // PhysReg operands cannot have subregister indexes. PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -461,6 +495,8 @@ void VirtRegRewriter::rewrite() { DEBUG(dbgs() << "> " << *MI); + expandCopyBundle(*MI); + // We can remove identity copies right now. handleIdentityCopy(*MI); } diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp index 568720c66e55..4e7542bf31e0 100644 --- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -54,7 +54,7 @@ namespace { class WinEHPrepare : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {} + WinEHPrepare() : FunctionPass(ID) {} bool runOnFunction(Function &Fn) override; @@ -86,6 +86,7 @@ private: // All fields are reset by runOnFunction. EHPersonality Personality = EHPersonality::Unknown; + const DataLayout *DL = nullptr; DenseMap<BasicBlock *, ColorVector> BlockColors; MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks; }; @@ -93,12 +94,10 @@ private: } // end anonymous namespace char WinEHPrepare::ID = 0; -INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions", - false, false) +INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions", + false, false) -FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) { - return new WinEHPrepare(TM); -} +FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); } bool WinEHPrepare::runOnFunction(Function &Fn) { if (!Fn.hasPersonalityFn()) @@ -111,6 +110,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) { if (!isFuncletEHPersonality(Personality)) return false; + DL = &Fn.getParent()->getDataLayout(); return prepareExplicitEH(Fn); } @@ -1070,7 +1070,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) { if (!isa<TerminatorInst>(EHPad)) { // If the EHPad isn't a terminator, then we can insert a load in this block // that will dominate all uses. 
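// (The change below threads the alloca address space from the module DataLayout into the AllocaInst constructor, whose signature gained an explicit address-space parameter in this LLVM version.)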
- SpillSlot = new AllocaInst(PN->getType(), nullptr, + SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(PN->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), @@ -1157,7 +1157,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot, Function &F) { // Lazily create the spill slot. if (!SpillSlot) - SpillSlot = new AllocaInst(V->getType(), nullptr, + SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr, Twine(V->getName(), ".wineh.spillslot"), &F.getEntryBlock().front()); diff --git a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp index 63bd762eeb2b..2df3602733f3 100644 --- a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -18,6 +18,8 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetInstrInfo.h" @@ -33,6 +35,14 @@ struct XRayInstrumentation : public MachineFunctionPass { initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; private: @@ -43,7 +53,7 @@ private: // This is the approach taken on CPUs that have a single RET instruction, // like x86/x86_64. void replaceRetWithPatchableRet(MachineFunction &MF, - const TargetInstrInfo *TII); + const TargetInstrInfo *TII); // Prepend the original return instruction with the exit sled code ("patchable // function exit" pseudo-instruction), preserving the original return // instruction. @@ -54,13 +64,12 @@ // have to call the trampoline and return from it to the original return // instruction of the function being instrumented. void prependRetWithPatchableExit(MachineFunction &MF, - const TargetInstrInfo *TII); + const TargetInstrInfo *TII); }; } // anonymous namespace -void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, - const TargetInstrInfo *TII) -{ +void XRayInstrumentation::replaceRetWithPatchableRet( + MachineFunction &MF, const TargetInstrInfo *TII) { // We look for *all* terminators and returns, then replace those with // PATCHABLE_RET instructions. 
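// Each PATCHABLE_RET is built with the original return's opcode as its first immediate operand, followed by the original operands, so the target can still emit the real return after the exit sled.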
SmallVector<MachineInstr *, 4> Terminators; @@ -81,7 +90,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF, auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc)) .addImm(T.getOpcode()); for (auto &MO : T.operands()) - MIB.addOperand(MO); + MIB.add(MO); Terminators.push_back(&T); } } @@ -91,9 +100,8 @@ I->eraseFromParent(); } -void XRayInstrumentation::prependRetWithPatchableExit(MachineFunction &MF, - const TargetInstrInfo *TII) -{ +void XRayInstrumentation::prependRetWithPatchableExit( + MachineFunction &MF, const TargetInstrInfo *TII) { for (auto &MBB : MF) { for (auto &T : MBB.terminators()) { unsigned Opc = 0; @@ -106,7 +114,7 @@ if (Opc != 0) { // Prepend the return instruction with PATCHABLE_FUNCTION_EXIT or // PATCHABLE_TAIL_CALL. - BuildMI(MBB, T, T.getDebugLoc(),TII->get(Opc)); + BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc)); } } } @@ -125,8 +133,13 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return false; // XRay threshold attribute not found. if (Attr.getValueAsString().getAsInteger(10, XRayThreshold)) return false; // Invalid value for threshold. - if (F.size() < XRayThreshold) - return false; // Function is too small. + + // Check if we have a loop. + // FIXME: Maybe make this smarter, and see whether the loops are dependent + // on inputs or side-effects? + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + if (MLI.empty() && F.size() < XRayThreshold) + return false; // Function is too small and has no loops. } // We look for the first non-empty MachineBasicBlock, so that we can insert @@ -142,12 +155,10 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { if (!MF.getSubtarget().isXRaySupported()) { FirstMI.emitError("An attempt to perform XRay instrumentation for an" - " unsupported target."); + " unsupported target."); return false; } - // FIXME: Do the loop triviality analysis here or in an earlier pass. - // First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the // MachineFunction. BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(), @@ -157,6 +168,11 @@ case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::ppc64le: + case Triple::ArchType::mips: + case Triple::ArchType::mipsel: + case Triple::ArchType::mips64: + case Triple::ArchType::mips64el: // For architectures that don't have a single return instruction. prependRetWithPatchableExit(MF, TII); break; @@ -171,5 +187,8 @@ char XRayInstrumentation::ID = 0; char &llvm::XRayInstrumentationID = XRayInstrumentation::ID; -INITIALIZE_PASS(XRayInstrumentation, "xray-instrumentation", "Insert XRay ops", - false, false) +INITIALIZE_PASS_BEGIN(XRayInstrumentation, "xray-instrumentation", + "Insert XRay ops", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(XRayInstrumentation, "xray-instrumentation", + "Insert XRay ops", false, false)
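For reference, the INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_DEPENDENCY / INITIALIZE_PASS_END triple used in the hunk above is the standard registration form for a pass with analysis dependencies; a minimal sketch for a hypothetical pass (illustrative names only):

char MyMachinePass::ID = 0;

INITIALIZE_PASS_BEGIN(MyMachinePass, "my-machine-pass",
                      "My machine pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(MyMachinePass, "my-machine-pass",
                    "My machine pass", false, false)

Paired with AU.addRequired<MachineLoopInfo>() in getAnalysisUsage, as XRayInstrumentation does above, this guarantees MachineLoopInfo has run before runOnMachineFunction is entered.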
