Diffstat (limited to 'contrib/llvm/lib')
72 files changed, 1006 insertions, 372 deletions
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index 90bc249bcb39..c2039e1dec2b 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -623,6 +623,7 @@ template <> struct GraphTraits<IrreducibleGraph> { typedef bfi_detail::IrreducibleGraph GraphT; typedef const GraphT::IrrNode NodeType; + typedef const GraphT::IrrNode *NodeRef; typedef GraphT::IrrNode::iterator ChildIteratorType; static const NodeType *getEntryNode(const GraphT &G) { diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp index 6c471ab45048..c9adaa7b111c 100644 --- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp +++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp @@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V, /// integer type Ty is used to select how many bits are available for the /// result. Returns null if the conversion cannot be performed, otherwise /// returns the Constant value resulting from the conversion. -Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, - Type *Ty) { +Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero, + Type *Ty) { // All of these conversion intrinsics form an integer of at most 64bits. unsigned ResultWidth = Ty->getIntegerBitWidth(); assert(ResultWidth <= 64 && @@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero, APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth, /*isSigned=*/true, mode, &isExact); - if (status != APFloat::opOK && status != APFloat::opInexact) + if (status != APFloat::opOK && + (!roundTowardZero || status != APFloat::opInexact)) return nullptr; return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true); } @@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, case Intrinsic::x86_sse2_cvtsd2si: case Intrinsic::x86_sse2_cvtsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/false, Ty); + dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/false, Ty); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (ConstantFP *FPOp = - dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), - /*roundTowardZero=*/true, Ty); + dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) + return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), + /*roundTowardZero=*/true, Ty); } } diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp index 0cb2c78afb40..aeaf9388579c 100644 --- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3400,7 +3400,10 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, return TrueVal; if (const auto *ICI = dyn_cast<ICmpInst>(CondVal)) { - unsigned BitWidth = Q.DL.getTypeSizeInBits(TrueVal->getType()); + // FIXME: This code is nearly duplicated in InstCombine. Using/refactoring + // decomposeBitTestICmp() might help. 
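For illustration only (not part of the commit): the ConstantFoldSSEConvertToInt change above now folds an inexact conversion only for the truncating cvtt* intrinsics, because the rounding cvtsd2si/cvtss2si forms honor the runtime MXCSR rounding mode and are therefore only safe to fold when the conversion is exact. A minimal host-side sketch with SSE2 intrinsics (assumes an x86 host with the default rounding mode):

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128d V = _mm_set_sd(2.7);
  int Rounded   = _mm_cvtsd_si32(V);  // cvtsd2si: uses the MXCSR rounding mode (default: to nearest) -> 3
  int Truncated = _mm_cvttsd_si32(V); // cvttsd2si: always truncates toward zero -> 2
  std::printf("%d %d\n", Rounded, Truncated); // prints "3 2"
  return 0;
}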
+ unsigned BitWidth = + Q.DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -4274,7 +4277,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); @@ -4302,7 +4306,8 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, // Gracefully handle edge cases where the instruction is not wired into any // parent block. - if (I->getParent()) + if (I->getParent() && !I->isEHPad() && !isa<TerminatorInst>(I) && + !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; diff --git a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp index f59257ab16b5..7bdf3408a581 100644 --- a/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/contrib/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -115,13 +115,19 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { // We might have a vector load from an array. FIXME: for now we just bail // out in this case, but we should be able to resolve and simplify such // loads. - if(CDS->getElementType() != I.getType()) + if (CDS->getElementType() != I.getType()) return false; - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - if (SimplifiedAddrOp->getValue().getActiveBits() >= 64) + unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; + if (SimplifiedAddrOp->getValue().getActiveBits() > 64) return false; - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; + int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue(); + if (SimplifiedAddrOpV < 0) { + // FIXME: For now we conservatively ignore out of bound accesses, but + // we're allowed to perform the optimization in this case. + return false; + } + uint64_t Index = static_cast<uint64_t>(SimplifiedAddrOpV) / ElemSize; if (Index >= CDS->getNumElements()) { // FIXME: For now we conservatively ignore out of bound accesses, but // we're allowed to perform the optimization in this case. diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp index 77e4ec7ab40c..2e45bb840946 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1610,8 +1610,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - assert(IP); - Builder.SetInsertPoint(IP); + setInsertPoint(IP); return expandCodeFor(SH, Ty); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index b0ba57122206..ebf80dea2c4b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -214,10 +214,7 @@ TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) { } TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) { - // It's possible to ask for the FuncId of a function which doesn't have a - // subprogram: inlining a function with debug info into a function with none. 
- if (!SP) - return TypeIndex::None(); + assert(SP); // Check if we've already translated this subprogram. auto I = TypeIndices.find({SP, nullptr}); @@ -621,11 +618,12 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, std::string FuncName; auto *SP = GV->getSubprogram(); + assert(SP); setCurrentSubprogram(SP); // If we have a display name, build the fully qualified name by walking the // chain of scopes. - if (SP != nullptr && !SP->getDisplayName().empty()) + if (!SP->getDisplayName().empty()) FuncName = getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName()); @@ -864,7 +862,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) { void CodeViewDebug::beginFunction(const MachineFunction *MF) { assert(!CurFn && "Can't process two functions at once!"); - if (!Asm || !MMI->hasDebugInfo()) + if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram()) return; DebugHandlerBase::beginFunction(MF); @@ -1939,7 +1937,8 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); // Ignore DBG_VALUE locations and function prologue. - if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) + if (!Asm || !CurFn || MI->isDebugValue() || + MI->getFlag(MachineInstr::FrameSetup)) return; DebugLoc DL = MI->getDebugLoc(); if (DL == PrevInstLoc || !DL) diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index fa705761645f..23e2aa70d0c7 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -996,6 +996,24 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { MachineBasicBlock *IBB = &*I; MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); + MachineLoop *ML; + + // Bail if merging after placement and IBB is the loop header because + // -- If merging predecessors that belong to the same loop as IBB, the + // common tail of merged predecessors may become the loop top if block + // placement is called again and the predecessors may branch to this common + // tail and require more branches. This can be relaxed if + // MachineBlockPlacement::findBestLoopTop is more flexible. + // --If merging predecessors that do not belong to the same loop as IBB, the + // loop info of IBB's loop and the other loops may be affected. Calling the + // block placement again may make big change to the layout and eliminate the + // reason to do tail merging here. + if (AfterBlockPlacement && MLI) { + ML = MLI->getLoopFor(IBB); + if (ML && IBB == ML->getHeader()) + continue; + } + for (MachineBasicBlock *PBB : I->predecessors()) { if (MergePotentials.size() == TailMergeThreshold) break; @@ -1015,16 +1033,12 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (PBB->hasEHPadSuccessor()) continue; - // Bail out if the loop header (IBB) is not the top of the loop chain - // after the block placement. Otherwise, the common tail of IBB's - // predecessors may become the loop top if block placement is called again - // and the predecessors may branch to this common tail. - // FIXME: Relaxed this check if the algorithm of finding loop top is - // changed in MBP. + // After block placement, only consider predecessors that belong to the + // same loop as IBB. The reason is the same as above when skipping loop + // header. 
if (AfterBlockPlacement && MLI) - if (MachineLoop *ML = MLI->getLoopFor(IBB)) - if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB)) - continue; + if (ML != MLI->getLoopFor(PBB)) + continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp index 19cd59b9dba7..4a1b9958a5b5 100644 --- a/contrib/llvm/lib/CodeGen/SafeStack.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp @@ -530,7 +530,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Align = std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), - Align, SSC.getLiveRange(StackGuardSlot)); + Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp index 709614f57e7d..795eb8d27191 100644 --- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp @@ -25,7 +25,9 @@ static cl::opt<bool> ClColoring("safe-stack-coloring", cl::Hidden, cl::init(true)); const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) { - return LiveRanges[AllocaNumbering[AI]]; + const auto IT = AllocaNumbering.find(AI); + assert(IT != AllocaNumbering.end()); + return LiveRanges[IT->second]; } bool StackColoring::readMarker(Instruction *I, bool *IsStart) { diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp index b8190e0f2153..fb433c1856a6 100644 --- a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp +++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp @@ -100,7 +100,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } // Split starting and ending regions if necessary. - for (StackRegion &R : Regions) { + for (unsigned i = 0; i < Regions.size(); ++i) { + StackRegion &R = Regions[i]; if (Start > R.Start && Start < R.End) { StackRegion R0 = R; R.Start = R0.End = Start; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d888676583f3..5ecc6da32144 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6198,13 +6198,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } } - // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0) - unsigned ElementWidth = VT.getScalarType().getSizeInBits(); + // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) + // Here, T can be 1 or -1, depending on the type of the setcc and + // getBooleanContents(). + unsigned SetCCWidth = N0.getValueType().getScalarSizeInBits(); + SDLoc DL(N); - SDValue NegOne = - DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT); + // To determine the "true" side of the select, we need to know the high bit + // of the value returned by the setcc if it evaluates to true. + // If the type of the setcc is i1, then the true case of the select is just + // sext(i1 1), that is, -1. + // If the type of the setcc is larger (say, i8) then the value of the high + // bit depends on getBooleanContents(). So, ask TLI for a real "true" value + // of the appropriate width. + SDValue ExtTrueVal = + (SetCCWidth == 1) + ? 
DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), + DL, VT) + : TLI.getConstTrueVal(DAG, VT, DL); + if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), NegOne, + DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal, DAG.getConstant(0, DL, VT), cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) return SCC; @@ -6215,10 +6229,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = DAG.getSetCC(DL, SetCCVT, - N0.getOperand(0), N0.getOperand(1), CC); - return DAG.getSelect(DL, VT, SetCC, - NegOne, DAG.getConstant(0, DL, VT)); + SDValue SetCC = + DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); + return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, + DAG.getConstant(0, DL, VT)); } } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8235522b14bd..29d11c79ac25 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6639,19 +6639,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { SDNode *FromNode = From.getNode(); SDNode *ToNode = To.getNode(); ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode); + SmallVector<SDDbgValue *, 2> ClonedDVs; for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end(); I != E; ++I) { SDDbgValue *Dbg = *I; // Only add Dbgvalues attached to same ResNo. if (Dbg->getKind() == SDDbgValue::SDNODE && - Dbg->getResNo() == From.getResNo()) { + Dbg->getSDNode() == From.getNode() && + Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) { + assert(FromNode != ToNode && + "Should not transfer Debug Values intranode"); SDDbgValue *Clone = getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); - AddDbgValue(Clone, ToNode, false); + ClonedDVs.push_back(Clone); + Dbg->setIsInvalidated(); } } + for (SDDbgValue *I : ClonedDVs) + AddDbgValue(I, ToNode, false); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f2bc88a98597..806646fbc676 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1234,6 +1234,16 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const { llvm_unreachable("Invalid boolean contents"); } +SDValue TargetLowering::getConstTrueVal(SelectionDAG &DAG, EVT VT, + const SDLoc &DL) const { + unsigned ElementWidth = VT.getScalarSizeInBits(); + APInt TrueInt = + getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent + ? 
APInt(ElementWidth, 1) + : APInt::getAllOnesValue(ElementWidth); + return DAG.getConstant(TrueInt, DL, VT); +} + bool TargetLowering::isConstFalseVal(const SDNode *N) const { if (!N) return false; diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d9a51864b6c..8feb18b4d030 100644 --- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -539,6 +539,16 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { return TRI->regsOverlap(RegA, RegB); } +// Returns true if Reg is equal or aliased to at least one register in Set. +static bool regOverlapsSet(const SmallVectorImpl<unsigned> &Set, unsigned Reg, + const TargetRegisterInfo *TRI) { + for (unsigned R : Set) + if (TRI->regsOverlap(R, Reg)) + return true; + + return false; +} + /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. bool @@ -864,9 +874,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, // FIXME: Needs more sophisticated heuristics. return false; - SmallSet<unsigned, 2> Uses; - SmallSet<unsigned, 2> Kills; - SmallSet<unsigned, 2> Defs; + SmallVector<unsigned, 2> Uses; + SmallVector<unsigned, 2> Kills; + SmallVector<unsigned, 2> Defs; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; @@ -874,12 +884,12 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) - Defs.insert(MOReg); + Defs.push_back(MOReg); else { - Uses.insert(MOReg); + Uses.push_back(MOReg); if (MOReg != Reg && (MO.isKill() || (LIS && isPlainlyKilled(MI, MOReg, LIS)))) - Kills.insert(MOReg); + Kills.push_back(MOReg); } } @@ -888,8 +898,9 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator AfterMI = std::next(Begin); MachineBasicBlock::iterator End = AfterMI; - while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) { - Defs.insert(End->getOperand(0).getReg()); + while (End->isCopy() && + regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI)) { + Defs.push_back(End->getOperand(0).getReg()); ++End; } @@ -915,21 +926,21 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, if (!MOReg) continue; if (MO.isDef()) { - if (Uses.count(MOReg)) + if (regOverlapsSet(Uses, MOReg, TRI)) // Physical register use would be clobbered. return false; - if (!MO.isDead() && Defs.count(MOReg)) + if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI)) // May clobber a physical register def. // FIXME: This may be too conservative. It's ok if the instruction // is sunken completely below the use. return false; } else { - if (Defs.count(MOReg)) + if (regOverlapsSet(Defs, MOReg, TRI)) return false; bool isKill = MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)); - if (MOReg != Reg && - ((isKill && Uses.count(MOReg)) || Kills.count(MOReg))) + if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) || + regOverlapsSet(Kills, MOReg, TRI))) // Don't want to extend other live ranges and update kills. 
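For illustration only (not part of the commit): the SmallSet-to-SmallVector switch above exists because physical registers can alias, so an exact-equality lookup (SmallSet::count) can miss a clobber of a sub- or super-register; the new regOverlapsSet helper instead does a linear scan with TargetRegisterInfo::regsOverlap. A toy standalone sketch of the same idea, with a made-up two-register alias relation standing in for regsOverlap:

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical register ids: 1 = AX, 2 = EAX, where AX is a sub-register of EAX.
static bool regsOverlap(unsigned A, unsigned B) {
  auto Canon = [](unsigned R) { return R == 1 ? 2u : R; };
  return Canon(A) == Canon(B);
}

static bool regOverlapsSet(const std::vector<unsigned> &Set, unsigned Reg) {
  for (unsigned R : Set)
    if (regsOverlap(R, Reg))
      return true;
  return false;
}

int main() {
  std::vector<unsigned> Defs = {2};                        // the instruction defines EAX
  assert(regOverlapsSet(Defs, 1));                         // a later use of AX is affected
  assert(std::count(Defs.begin(), Defs.end(), 1u) == 0);   // exact matching would miss it
  return 0;
}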
return false; if (MOReg == Reg && !isKill) diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h index 267a0dab2f25..d58bff56576d 100644 --- a/contrib/llvm/lib/IR/AttributeImpl.h +++ b/contrib/llvm/lib/IR/AttributeImpl.h @@ -19,8 +19,8 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/TrailingObjects.h" #include <climits> #include <string> @@ -142,73 +142,6 @@ public: StringRef getStringValue() const { return Val; } }; -//===----------------------------------------------------------------------===// -/// \class -/// \brief This class represents a group of attributes that apply to one -/// element: function, return type, or parameter. -class AttributeSetNode final - : public FoldingSetNode, - private TrailingObjects<AttributeSetNode, Attribute> { - friend TrailingObjects; - - unsigned NumAttrs; ///< Number of attributes in this node. - /// Bitset with a bit for each available attribute Attribute::AttrKind. - uint64_t AvailableAttrs; - - AttributeSetNode(ArrayRef<Attribute> Attrs) - : NumAttrs(Attrs.size()), AvailableAttrs(0) { - static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, - "Too many attributes for AvailableAttrs"); - // There's memory after the node where we can store the entries in. - std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>()); - - for (Attribute I : *this) { - if (!I.isStringAttribute()) { - AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } - } - } - - // AttributesSetNode is uniqued, these should not be publicly available. - void operator=(const AttributeSetNode &) = delete; - AttributeSetNode(const AttributeSetNode &) = delete; -public: - void operator delete(void *p) { ::operator delete(p); } - - static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs); - - /// \brief Return the number of attributes this AttributeSet contains. 
- unsigned getNumAttributes() const { return NumAttrs; } - - bool hasAttribute(Attribute::AttrKind Kind) const { - return AvailableAttrs & ((uint64_t)1) << Kind; - } - bool hasAttribute(StringRef Kind) const; - bool hasAttributes() const { return NumAttrs != 0; } - - Attribute getAttribute(Attribute::AttrKind Kind) const; - Attribute getAttribute(StringRef Kind) const; - - unsigned getAlignment() const; - unsigned getStackAlignment() const; - uint64_t getDereferenceableBytes() const; - uint64_t getDereferenceableOrNullBytes() const; - std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; - std::string getAsString(bool InAttrGrp) const; - - typedef const Attribute *iterator; - iterator begin() const { return getTrailingObjects<Attribute>(); } - iterator end() const { return begin() + NumAttrs; } - - void Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(begin(), end())); - } - static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) { - for (unsigned I = 0, E = AttrList.size(); I != E; ++I) - AttrList[I].Profile(ID); - } -}; - typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/IR/AttributeSetNode.h b/contrib/llvm/lib/IR/AttributeSetNode.h new file mode 100644 index 000000000000..fab1ed51e4d6 --- /dev/null +++ b/contrib/llvm/lib/IR/AttributeSetNode.h @@ -0,0 +1,98 @@ +//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the node class used internally by AttributeSet. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_ATTRIBUTESETNODE_H +#define LLVM_IR_ATTRIBUTESETNODE_H + +#include "llvm/ADT/FoldingSet.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Support/TrailingObjects.h" +#include <climits> + +namespace llvm { + +//===----------------------------------------------------------------------===// +/// \class +/// \brief This class represents a group of attributes that apply to one +/// element: function, return type, or parameter. +class AttributeSetNode final + : public FoldingSetNode, + private TrailingObjects<AttributeSetNode, Attribute> { + friend TrailingObjects; + + unsigned NumAttrs; ///< Number of attributes in this node. + /// Bitset with a bit for each available attribute Attribute::AttrKind. + uint64_t AvailableAttrs; + + AttributeSetNode(ArrayRef<Attribute> Attrs) + : NumAttrs(Attrs.size()), AvailableAttrs(0) { + static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, + "Too many attributes for AvailableAttrs"); + // There's memory after the node where we can store the entries in. + std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>()); + + for (Attribute I : *this) { + if (!I.isStringAttribute()) { + AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum(); + } + } + } + + // AttributesSetNode is uniqued, these should not be publicly available. 
+ void operator=(const AttributeSetNode &) = delete; + AttributeSetNode(const AttributeSetNode &) = delete; +public: + void operator delete(void *p) { ::operator delete(p); } + + static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs); + + static AttributeSetNode *get(AttributeSet AS, unsigned Index) { + return AS.getAttributes(Index); + } + + /// \brief Return the number of attributes this AttributeSet contains. + unsigned getNumAttributes() const { return NumAttrs; } + + bool hasAttribute(Attribute::AttrKind Kind) const { + return AvailableAttrs & ((uint64_t)1) << Kind; + } + bool hasAttribute(StringRef Kind) const; + bool hasAttributes() const { return NumAttrs != 0; } + + Attribute getAttribute(Attribute::AttrKind Kind) const; + Attribute getAttribute(StringRef Kind) const; + + unsigned getAlignment() const; + unsigned getStackAlignment() const; + uint64_t getDereferenceableBytes() const; + uint64_t getDereferenceableOrNullBytes() const; + std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; + std::string getAsString(bool InAttrGrp) const; + + typedef const Attribute *iterator; + iterator begin() const { return getTrailingObjects<Attribute>(); } + iterator end() const { return begin() + NumAttrs; } + + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, makeArrayRef(begin(), end())); + } + static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) { + for (unsigned I = 0, E = AttrList.size(); I != E; ++I) + AttrList[I].Profile(ID); + } +}; + +} // end llvm namespace + +#endif diff --git a/contrib/llvm/lib/IR/AutoUpgrade.cpp b/contrib/llvm/lib/IR/AutoUpgrade.cpp index 431e51bb4562..2e4a2f89e2c7 100644 --- a/contrib/llvm/lib/IR/AutoUpgrade.cpp +++ b/contrib/llvm/lib/IR/AutoUpgrade.cpp @@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "sse2.cvtps2pd" || Name == "avx.cvtdq2.pd.256" || Name == "avx.cvt.ps2.pd.256" || - Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt.") || Name.startswith("avx.vinsertf128.") || Name == "avx2.vinserti128" || Name.startswith("avx.vextractf128.") || @@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd"); else Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); - } else if (IsX86 && (Name == "sse2.cvttps2dq" || - Name.startswith("avx.cvtt."))) { - // Truncation (round to zero) float/double to i32 vector conversion. 
- Value *Src = CI->getArgOperand(0); - VectorType *DstTy = cast<VectorType>(CI->getType()); - Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt"); } else if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); SmallVector<Metadata *, 1> Elts; diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp index a55361489ada..3c4b0cf2f8ff 100644 --- a/contrib/llvm/lib/IR/Core.cpp +++ b/contrib/llvm/lib/IR/Core.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Attributes.h" +#include "AttributeSetNode.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -1844,6 +1845,18 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unwrap<Function>(F)->addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) { + auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { @@ -2216,6 +2229,21 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, CallSite(unwrap<Instruction>(C)).addAttribute(Idx, unwrap(A)); } +unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, + LLVMAttributeIndex Idx) { + auto CS = CallSite(unwrap<Instruction>(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + return ASN->getNumAttributes(); +} + +void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, + LLVMAttributeRef *Attrs) { + auto CS = CallSite(unwrap<Instruction>(C)); + auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx); + for (auto A: make_range(ASN->begin(), ASN->end())) + *Attrs++ = wrap(A); +} + LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID) { diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp index 5201c2ecce6a..f35c64b27b5b 100644 --- a/contrib/llvm/lib/IR/Metadata.cpp +++ b/contrib/llvm/lib/IR/Metadata.cpp @@ -675,8 +675,8 @@ void MDNode::handleChangedOperand(void *Ref, Metadata *New) { Metadata *Old = getOperand(Op); setOperand(Op, New); - // Drop uniquing for self-reference cycles. - if (New == this) { + // Drop uniquing for self-reference cycles and deleted constants. 
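For illustration only (not part of the commit): a usage sketch of the new C API entry points added in Core.cpp above, which follow a count-then-fill pattern with a caller-allocated buffer. Fn is assumed to be a valid LLVMValueRef for a function:

#include <llvm-c/Core.h>
#include <vector>

void collectFunctionAttrs(LLVMValueRef Fn,
                          std::vector<LLVMAttributeRef> &Out) {
  unsigned N = LLVMGetAttributeCountAtIndex(Fn, LLVMAttributeFunctionIndex);
  Out.resize(N);
  if (N)
    LLVMGetAttributesAtIndex(Fn, LLVMAttributeFunctionIndex, Out.data());
  // LLVMGetCallSiteAttributeCount/LLVMGetCallSiteAttributes follow the same
  // pattern for call and invoke instructions.
}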
+ if (New == this || (!New && Old && isa<ConstantAsMetadata>(Old))) { if (!isResolved()) resolve(); storeDistinctInContext(); diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp index cfa12a9f0b27..2bac2a310670 100644 --- a/contrib/llvm/lib/Support/Triple.cpp +++ b/contrib/llvm/lib/Support/Triple.cpp @@ -201,6 +201,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; case GNU: return "gnu"; + case GNUABI64: return "gnuabi64"; case GNUEABIHF: return "gnueabihf"; case GNUEABI: return "gnueabi"; case GNUX32: return "gnux32"; @@ -468,6 +469,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { return StringSwitch<Triple::EnvironmentType>(EnvironmentName) .StartsWith("eabihf", Triple::EABIHF) .StartsWith("eabi", Triple::EABI) + .StartsWith("gnuabi64", Triple::GNUABI64) .StartsWith("gnueabihf", Triple::GNUEABIHF) .StartsWith("gnueabi", Triple::GNUEABI) .StartsWith("gnux32", Triple::GNUX32) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index b1e881685b0c..b97a0f155dc2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -250,6 +250,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", FeatureMacroOpFusion, FeatureNEON, FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, HasV8_1aOps]>; def : ProcessorModel<"generic", NoSchedModel, [ diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d6f2a190d4c8..ac7de1b422e0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7685,6 +7685,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); @@ -7728,10 +7729,16 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: - ResTy = MVT::v4i32; + ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } + if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) + return SDValue(); + + assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && + "Illegal vector type after legalization"); + SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs @@ -9853,7 +9860,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - return performFpToIntCombine(N, DAG, Subtarget); + return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, Subtarget); case ISD::OR: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 7e59710a427a..d4784b5463d7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class GCNTargetMachine; struct MachineSchedContext; class MCAsmInfo; class raw_ostream; @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr); +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index cfe6346fb6b1..c9c95c796a69 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -783,15 +783,19 @@ void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, - RuntimeMD::OpenCL_C, 1); - auto Node = MD->getOperand(0); - unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) - ->getZExtValue(); - unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) - ->getZExtValue(); - emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, - Major * 100 + Minor * 10, 2); + if (MD->getNumOperands()) { + auto Node = MD->getOperand(0); + if (Node->getNumOperands() > 1) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) + ->getZExtValue(); + uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } + } } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 3b415774df49..b955e231699a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/Passes.h" @@ -30,15 +32,28 @@ using namespace llvm; namespace { class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor<AMDGPUCodeGenPrepare> { + public InstVisitor<AMDGPUCodeGenPrepare, bool> { + const GCNTargetMachine *TM; + const SISubtarget *ST; 
DivergenceAnalysis *DA; - const TargetMachine *TM; + Module *Mod; + bool HasUnsafeFPMath; public: static char ID; AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID), - TM(TM) { } + TM(static_cast<const GCNTargetMachine *>(TM)), + ST(nullptr), + DA(nullptr), + Mod(nullptr), + HasUnsafeFPMath(false) { } + + bool visitFDiv(BinaryOperator &I); + + bool visitInstruction(Instruction &I) { + return false; + } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -55,7 +70,92 @@ public: } // End anonymous namespace +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { + const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); + if (!CNum) + return false; + + // Reciprocal f32 is handled separately without denormals. + return UnsafeDiv || CNum->isExactlyValue(+1.0); +} + +// Insert an intrinsic for fast fdiv for safe math situations where we can +// reduce precision. Leave fdiv for situations where the generic node is +// expected to be optimized. +bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { + Type *Ty = FDiv.getType(); + + // TODO: Handle half + if (!Ty->getScalarType()->isFloatTy()) + return false; + + MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); + if (!FPMath) + return false; + + const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); + float ULP = FPOp->getFPAccuracy(); + if (ULP < 2.5f) + return false; + + FastMathFlags FMF = FPOp->getFastMathFlags(); + bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + FMF.allowReciprocal(); + if (ST->hasFP32Denormals() && !UnsafeDiv) + return false; + + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); + + const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); + Function *Decl + = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *NewFDiv = nullptr; + + if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + NewFDiv = UndefValue::get(VT); + + // FIXME: Doesn't do the right thing for cases where the vector is partially + // constant. This works when the scalarizer pass is run first. 
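For illustration only (not part of the commit): visitFDiv above only fires when the fdiv carries !fpmath metadata permitting at least 2.5 ULP of error (and FP32 denormals are disabled or unsafe math is enabled). A sketch of how a frontend might emit such a division with the C++ IRBuilder; the builder, operands, and accuracy value are assumptions:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"

llvm::Value *buildApproxFDiv(llvm::IRBuilder<> &B, llvm::Value *Num,
                             llvm::Value *Den) {
  llvm::LLVMContext &Ctx = B.getContext();
  // !fpmath carries the permitted error, in ULPs, as a float constant.
  llvm::MDNode *FPMath = llvm::MDNode::get(
      Ctx, llvm::ConstantAsMetadata::get(
               llvm::ConstantFP::get(llvm::Type::getFloatTy(Ctx), 2.5)));
  return B.CreateFDiv(Num, Den, "div", FPMath);
}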
+ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { + Value *NumEltI = Builder.CreateExtractElement(Num, I); + Value *DenEltI = Builder.CreateExtractElement(Den, I); + Value *NewElt; + + if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + } else { + NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); + } + + NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); + } + } else { + if (!shouldKeepFDivF32(Num, UnsafeDiv)) + NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } + + if (NewFDiv) { + FDiv.replaceAllUsesWith(NewFDiv); + NewFDiv->takeName(&FDiv); + FDiv.eraseFromParent(); + } + + return true; +} + +static bool hasUnsafeFPMath(const Function &F) { + Attribute Attr = F.getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + Mod = &M; return false; } @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + ST = &TM->getSubtarget<SISubtarget>(F); DA = &getAnalysis<DivergenceAnalysis>(); - visit(F); + HasUnsafeFPMath = hasUnsafeFPMath(F); - return true; + bool MadeChange = false; + + for (BasicBlock &BB : F) { + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + Next = std::next(I); + MadeChange |= visit(*I); + } + } + + return MadeChange; } INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, char AMDGPUCodeGenPrepare::ID = 0; -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { return new AMDGPUCodeGenPrepare(TM); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 6761b4b5df95..3944fdbd31e3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -420,9 +420,10 @@ int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; +int FP64_NEG_ONE = 0xbff0000000000000; } def CONST : Constants; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 791872a9db40..8e3471bd2083 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = { #undef GET_INTRINSIC_NAME_TABLE }; -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, + ArrayRef<Type *> Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); - return Result; + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string 
AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + switch (ID) { + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + Type *F32Ty = Type::getFloatTy(Context); + return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); + } + default: + llvm_unreachable("unhandled intrinsic"); + } } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { } Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef<Type *> Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeSet AS = getAttributes(M->getContext(), + static_cast<AMDGPUIntrinsic::ID>(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index f4173929259c..6cb8b9644642 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -34,13 +34,23 @@ enum ID { class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const; + std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; bool isOverloaded(unsigned IID) const override; Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, - unsigned numTys = 0) const override; + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef<Type *> = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef<Type*> Tys = None) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 775463809634..0bad63fa77ad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -348,9 +348,6 @@ static VectorType *arrayTypeToVecType(Type *ArrayTy) { static Value * calculateVectorIndex(Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) { - if (isa<AllocaInst>(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); auto I = GEPIdx.find(GEP); @@ -360,11 +357,11 @@ calculateVectorIndex(Value *Ptr, static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // FIXME we only support simple cases if (GEP->getNumOperands() != 3) - return NULL; + return nullptr; ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); if (!I0 || !I0->isZero()) - return NULL; + return nullptr; return GEP->getOperand(2); } @@ -398,7 +395,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // are just being conservative for now. 
if (!AllocaTy || AllocaTy->getElementType()->isVectorTy() || - AllocaTy->getNumElements() > 4) { + AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() < 2) { DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -443,9 +441,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); @@ -453,9 +453,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, Inst->getOperand(0), @@ -469,7 +471,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; default: - Inst->dump(); llvm_unreachable("Inconsistency in instructions promotable to vector"); } } @@ -477,11 +478,6 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { } static bool isCallPromotable(CallInst *CI) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) - return false; - IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (!II) return false; @@ -773,28 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; } - IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); - if (!Intr) { - // FIXME: What is this for? It doesn't make sense to promote arbitrary - // function calls. If the call is to a defined function that can also be - // promoted, we should be able to do this once that function is also - // rewritten. 
- - std::vector<Type*> ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast<Function>(C); - Call->setCalledFunction(NewF); - continue; - } - + IntrinsicInst *Intr = cast<IntrinsicInst>(Call); Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3e53f52c689f..b2d4e1144c75 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; bool addInstSelector() override; @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 8f78edd76a51..8ccd176930a6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -122,6 +122,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); @@ -832,13 +833,18 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); return; } // Fall-through. Since we don't care about out of bounds values // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint // considers some extra cases which are not necessary here. 
case ISD::FP_TO_SINT: { + if (N->getValueType(0) == MVT::i1) { + Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); + return; + } + SDValue Result; if (expandFP_TO_SINT(N, Result, DAG)) Results.push_back(Result); @@ -1052,15 +1058,24 @@ SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); } -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { +SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); +} + +SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode( ISD::SETCC, DL, MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); + Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETEQ)); } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 2fb6ee25caa9..9700ce14c6f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -72,7 +72,8 @@ private: SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 54efdc0a0466..f4b04e3631a5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -41,7 +41,8 @@ enum { WQM = 1 << 22, VGPRSpill = 1 << 23, VOPAsmPrefer32Bit = 1 << 24, - Gather4 = 1 << 25 + Gather4 = 1 << 25, + DisableWQM = 1 << 26 }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 51241cf0a432..80d44351267d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1134,9 +1134,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->LDSSize); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); MI.eraseFromParent(); return BB; } @@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. 
-SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. +SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. 
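For illustration only (not part of the commit): the two magic constants used by lowerFDIV_FAST above decode to 2^96 (0x6f800000) and 2^-32 (0x2f800000). When |denominator| exceeds 2^96, the code pre-scales it by 2^-32 before the reciprocal (presumably to keep v_rcp_f32 away from underflow, since it does not support denormals) and multiplies the quotient by the same factor afterwards. A quick host-side check of the decoding:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  assert(bitsToFloat(0x6f800000u) == std::ldexp(1.0f, 96));  // 2^96
  assert(bitsToFloat(0x2f800000u) == std::ldexp(1.0f, -32)); // 2^-32
  return 0;
}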
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index 8e055eea58c2..1d349faa592c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 2f63d4ed13b3..6163f0547bd5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string asm = "", field bits<1> DS = 0; field bits<1> MIMG = 0; field bits<1> FLAT = 0; + + // Whether WQM _must_ be enabled for this instruction. field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; @@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string asm = "", field bits<1> Gather4 = 0; + // Whether WQM _must_ be disabled for this instruction. + field bits<1> DisableWQM = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{23} = VGPRSpill; let TSFlags{24} = VOPAsmPrefer32Bit; let TSFlags{25} = Gather4; + let TSFlags{26} = DisableWQM; let SchedRW = [Write32Bit]; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d171e21c8a4f..5cc6a4e0e83e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -738,7 +738,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineBasicBlock::iterator Insert = Entry.front(); DebugLoc DL = Insert->getDebugLoc(); - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, + *MF); if (TIDReg == AMDGPU::NoRegister) return TIDReg; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 227b817227c2..fef8904a6c87 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -340,6 +340,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isDisableWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; + } + + bool isDisableWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; + } + static bool isVGPRSpill(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 253cc32b27e4..00f53e846db4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2949,6 +2949,10 @@ multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, MUBUFAddr64Table <0>; + let DisableWQM = 1 in { + def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>; + } + let addr64 = 0, isCodeGenOnly = 0 in { def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; } @@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins, multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, ValueType vt, SDPatternOperator atomic> { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1, + DisableWQM = 1 in { // No return variants let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { @@ -3423,6 +3428,7 @@ class MIMG_Store_Helper <bits<7> op, string asm, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; } multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, @@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, let mayStore = 1; let hasSideEffects = 1; let hasPostISelHook = 0; + let DisableWQM = 1; let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index 6427db87cd6f..18b7d5d62efe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2200,7 +2200,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - 
(!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; @@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), - (!cast<MUBUF>(opcode # _BOTHEN) + (!cast<MUBUF>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), @@ -3391,6 +3391,16 @@ def : Pat < (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat < + (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), + (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>; + // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector // comparisions still write to a pair of SGPRs, so treat these as diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39096e7..9d06ccfc6c7f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. 
+ def int_amdgcn_fdiv_fast : Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + >; + /* Control flow Intrinsics */ def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 4d12a1ef9a93..848be32cd515 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -203,7 +203,8 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, + *MF); if (LaneVGPR == AMDGPU::NoRegister) // We have no VGPRs left for spilling SGPRs. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0dd88ee45c58..347c33fb3760 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -957,10 +957,13 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. -unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { +unsigned +SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF) const { + for (unsigned Reg : *RC) - if (!MRI.isPhysRegUsed(Reg)) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 6e97b1b910a9..d8b2d9f4e975 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -185,7 +185,8 @@ public: unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; + const TargetRegisterClass *RC, + const MachineFunction &MF) const; unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index c1a237ea5f51..b200c153df0b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -94,12 +94,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + LiveIntervals *LIS; DenseMap<const MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; SmallVector<const MachineInstr *, 2> ExecExports; SmallVector<MachineInstr *, 1> LiveMaskQueries; + void markInstruction(MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); @@ -126,6 +129,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); 
AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -135,8 +139,11 @@ public: char SIWholeQuadMode::ID = 0; -INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, - "SI Whole Quad Mode", false, false) +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, + false) char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; @@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } +void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist) { + InstrInfo &II = Instructions[&MI]; + + assert(Flag == StateWQM || Flag == StateExact); + + // Ignore if the instruction is already marked. The typical case is that we + // mark an instruction WQM multiple times, but for atomics it can happen that + // Flag is StateWQM, but Needs is already set to StateExact. In this case, + // letting the atomic run in StateExact is correct as per the relevant specs. + if (II.Needs) + return; + + II.Needs = Flag; + Worklist.push_back(&MI); +} + // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. char SIWholeQuadMode::scanInstructions(MachineFunction &MF, @@ -161,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; - } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, continue; } - Instructions[&MI].Needs = Flags; - Worklist.push_back(&MI); + markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; } @@ -214,9 +237,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; - // Control flow-type instructions that are followed by WQM computations - // must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + // Control flow-type instructions and stores to temporary memory that are + // followed by WQM computations must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !II.Needs && + (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } @@ -249,32 +273,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (!Use.isReg() || !Use.isUse()) continue; - // At this point, physical registers appear as inputs or outputs - // and following them makes no sense (and would in fact be incorrect - // when the same VGPR is used as both an output and an input that leads - // to a NeedsWQM instruction). - // - // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we - // have to trace this, in practice it happens for 64-bit computations like - // pointers where both dwords are followed already anyway. - if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) - continue; - - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { - InstrInfo &DefII = Instructions[&DefMI]; + unsigned Reg = Use.getReg(); - // Obviously skip if DefMI is already flagged as NeedWQM. - // - // The instruction might also be flagged as NeedExact. 
This happens when - // the result of an atomic is used in a WQM computation. In this case, - // the atomic must not run for helper pixels and the WQM result is - // undefined. - if (DefII.Needs != 0) + // Handle physical registers that we need to track; this is mostly relevant + // for VCC, which can appear as the (implicit) input of a uniform branch, + // e.g. when a loop counter is stored in a VGPR. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Reg == AMDGPU::EXEC) continue; - DefII.Needs = StateWQM; - Worklist.push_back(&DefMI); + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) + continue; + + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. + if (Value->isPHIDef()) + continue; + + markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + Worklist); + } + + continue; } + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, StateWQM, Worklist); } } @@ -468,6 +495,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); + LIS = &getAnalysis<LiveIntervals>(); char GlobalFlags = analyzeFunction(MF); if (!(GlobalFlags & StateWQM)) { diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index d6e7caf98a80..3cfcb1e09f0b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3857,7 +3857,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; - if (isSaturatingConditional(Op, SatValue, SatConstant)) + if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && + isSaturatingConditional(Op, SatValue, SatConstant)) return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index 060376b0a273..c9735f3ec277 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -3650,7 +3650,8 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), def SSAT : AI<(outs GPRnopc:$Rd), (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3666,7 +3667,8 @@ def SSAT : AI<(outs GPRnopc:$Rd), def SSAT16 : AI<(outs GPRnopc:$Rd), (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; @@ -3679,7 +3681,8 @@ def SSAT16 : AI<(outs GPRnopc:$Rd), def USAT : AI<(outs GPRnopc:$Rd), (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), - SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsARM,HasV6]> { bits<4> Rd; bits<5> sat_imm; bits<4> Rn; @@ -3695,7 +3698,8 @@ def USAT : AI<(outs GPRnopc:$Rd), def USAT16 : AI<(outs GPRnopc:$Rd), 
(ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, - NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsARM,HasV6]>{ bits<4> Rd; bits<4> sat_imm; bits<4> Rn; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index 55e5308be40e..fe699b284882 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2240,7 +2240,8 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin, def t2SSAT: T2SatI< (outs rGPR:$Rd), (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2251,7 +2252,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2265,7 +2266,8 @@ def t2SSAT16: T2SatI< def t2USAT: T2SatI< (outs rGPR:$Rd), (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), - NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>, + Requires<[IsThumb2]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1110; let Inst{20} = 0; @@ -2275,7 +2277,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index cdad7ce1b73a..20c5f3691d23 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -518,6 +518,10 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return true; return false; + case ELF::R_MIPS_GOT_PAGE: + case ELF::R_MICROMIPS_GOT_PAGE: + case ELF::R_MIPS_GOT_OFST: + case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MIPS_16: case ELF::R_MIPS_32: case ELF::R_MIPS_GPREL32: @@ -539,8 +543,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MIPS_SHIFT5: case ELF::R_MIPS_SHIFT6: case ELF::R_MIPS_GOT_DISP: - case ELF::R_MIPS_GOT_PAGE: - case ELF::R_MIPS_GOT_OFST: case ELF::R_MIPS_GOT_HI16: case ELF::R_MIPS_GOT_LO16: case ELF::R_MIPS_INSERT_A: @@ -589,8 +591,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, case ELF::R_MICROMIPS_PC16_S1: case ELF::R_MICROMIPS_CALL16: case ELF::R_MICROMIPS_GOT_DISP: - case ELF::R_MICROMIPS_GOT_PAGE: - case ELF::R_MICROMIPS_GOT_OFST: case ELF::R_MICROMIPS_GOT_HI16: case ELF::R_MICROMIPS_GOT_LO16: case ELF::R_MICROMIPS_SUB: diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 1622b2212665..1ce8f07092b1 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -28,12 +28,19 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) { PointerSize = CalleeSaveStackSlotSize = 8; } + // FIXME: This condition isn't quite 
right but it's the best we can do until + // this object can identify the ABI. It will misbehave when using O32 + // on a mips64*-* triple. + if ((TheTriple.getArch() == Triple::mipsel) || + (TheTriple.getArch() == Triple::mips)) { + PrivateGlobalPrefix = "$"; + PrivateLabelPrefix = "$"; + } + AlignmentIsInBytes = false; Data16bitsDirective = "\t.2byte\t"; Data32bitsDirective = "\t.4byte\t"; Data64bitsDirective = "\t.8byte\t"; - PrivateGlobalPrefix = "$"; - PrivateLabelPrefix = "$"; CommentString = "#"; ZeroDirective = "\t.space\t"; GPRel32Directive = "\t.gpword\t"; diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index c248c3a50ac8..80641ed9bd31 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -57,7 +57,10 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, else Ret += "E"; - Ret += "-m:m"; + if (ABI.IsO32()) + Ret += "-m:m"; + else + Ret += "-m:e"; // Pointers are 32 bit on some ABIs. if (!ABI.IsN64()) diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index e54711195900..2c548384f1cb 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1187,6 +1187,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -13373,6 +13381,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, @@ -13380,6 +13389,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { + if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); @@ -13694,6 +13706,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); + if (SVT.getVectorElementType() == MVT::i1) { + if (SVT == MVT::v2i1) + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); + } + switch (SVT.SimpleTy) { default: llvm_unreachable("Custom 
UINT_TO_FP is not supported!"); diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 1672b3855b79..5f0aab9ddc68 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2661,7 +2661,8 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const { + MachineOperand &ImplicitOp, + LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { @@ -2715,13 +2716,17 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. isKill = true; isUndef = false; + + if (LV) + LV->replaceKillInstruction(SrcReg, MI, *Copy); } // We've set all the parameters without issue. @@ -2900,7 +2905,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2943,7 +2948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = @@ -2977,7 +2982,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -3016,7 +3021,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, isUndef, ImplicitOp, LV)) return nullptr; const MachineOperand &Src2 = MI.getOperand(2); @@ -3024,7 +3029,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) + SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) return nullptr; MachineInstrBuilder MIB = @@ -3087,7 +3092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) + SrcReg, isKill, 
isUndef, ImplicitOp, LV)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 858f35d1cbf0..a8a9f629fc1d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -230,7 +230,7 @@ public: bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp) const; + MachineOperand &ImplicitOp, LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 9a98f5cac2ee..f91764a67d1c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1820,7 +1820,7 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1836,7 +1836,7 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, +def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PS_RM>, 
Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), @@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index fff544085414..787f4342831d 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -332,6 +332,7 @@ struct ArgumentUsesTracker : public CaptureTracker { namespace llvm { template <> struct GraphTraits<ArgumentGraphNode *> { typedef ArgumentGraphNode NodeType; + typedef ArgumentGraphNode *NodeRef; typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; static inline NodeType *getEntryNode(NodeType *A) { return A; } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 310c29275faf..99b12d4db0d0 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -44,6 +44,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -779,7 +780,8 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, // Instructions could multiply use V. 
while (UI != E && *UI == I) ++UI; - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); } } diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index cf5b76dc365b..df6a48e05d42 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -134,6 +134,10 @@ static cl::opt<int> PreInlineThreshold( cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); +static cl::opt<bool> EnableGVNHoist( + "enable-gvn-hoist", cl::init(false), cl::Hidden, + cl::desc("Enable the experimental GVN Hoisting pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -232,7 +236,8 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - FPM.add(createGVNHoistPass()); + if(EnableGVNHoist) + FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index d7eed790e2ab..8f1ff8ac0e66 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -553,8 +553,11 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring + // decomposeBitTestICmp() might help. { - unsigned BitWidth = DL.getTypeSizeInBits(TrueVal->getType()); + unsigned BitWidth = + DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); APInt MinSignedValue = APInt::getSignBit(BitWidth); Value *X; const APInt *Y, *C; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 51c3262b5d14..377ccb9c37f7 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2830,7 +2830,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -2851,7 +2852,8 @@ bool InstCombiner::run() { // Add operands to the worklist. replaceInstUsesWith(*I, C); ++NumConstProp; - eraseInstFromFunction(*I); + if (isInstructionTriviallyDead(I, TLI)) + eraseInstFromFunction(*I); MadeIRChange = true; continue; } @@ -3007,7 +3009,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, << *Inst << '\n'); Inst->replaceAllUsesWith(C); ++NumConstProp; - Inst->eraseFromParent(); + if (isInstructionTriviallyDead(Inst, TLI)) + Inst->eraseFromParent(); continue; } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index dcb62d3ed1b5..41041c78db97 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -272,8 +272,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) { return false; } - // Check if the global is in a GCOV counter array. - if (GV->getName().startswith("__llvm_gcov_ctr")) + // Check if the global is private gcov data. 
+ if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) return false; } diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 88172d19fe5a..9e982194bac7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/IR/Constant.h" @@ -90,11 +91,13 @@ bool ConstantPropagation::runOnFunction(Function &F) { // Remove the dead instruction. WorkList.erase(I); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) { + I->eraseFromParent(); + ++NumInstKilled; + } // We made a change to the function... Changed = true; - ++NumInstKilled; } } return Changed; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 9d0ef42e0396..0b16e2703dc4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -582,6 +582,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // its simpler value. if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + bool Killed = false; if (!Inst->use_empty()) { Inst->replaceAllUsesWith(V); Changed = true; @@ -589,11 +590,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (isInstructionTriviallyDead(Inst, &TLI)) { Inst->eraseFromParent(); Changed = true; + Killed = true; } - if (Changed) { + if (Changed) ++NumSimplify; + if (Killed) continue; - } } // If this is a simple instruction that we can value number, process it. diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 542cf38e43bb..e958563e2d10 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -815,6 +815,14 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, if (!Cast->getModule()->getDataLayout().isLegalInteger(Width)) return; + // Check that `Cast` actually extends the induction variable (we rely on this + // later). This takes care of cases where `Cast` is extending a truncation of + // the narrow induction variable, and thus can end up being narrower than the + // "narrow" induction variable. + uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType()); + if (NarrowIVWidth >= Width) + return; + // Cast is either an sext or zext up to this point. // We should not widen an indvar if arithmetics on the wider indvar are more // expensive than those on the narrower indvar. 
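A recurring pattern across the Transforms changes in this import (GlobalOpt, InstructionCombining, and ConstantProp above; JumpThreading and LICM below) is that a constant-folded instruction is now erased only when it is trivially dead, so folded calls with side effects are kept. The following is a minimal, self-contained sketch of that guarded pattern — it is not taken verbatim from any one of these files, and the helper name is illustrative:

  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/IR/Constant.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"

  using namespace llvm;

  // Fold I to a constant if possible. Replace its uses either way, but only
  // erase it when no side effects remain; returns true if the IR changed.
  static bool foldAndMaybeErase(Instruction *I, const DataLayout &DL,
                                const TargetLibraryInfo *TLI) {
    Constant *C = ConstantFoldInstruction(I, DL, TLI);
    if (!C)
      return false;
    I->replaceAllUsesWith(C);            // users now see the folded constant
    if (isInstructionTriviallyDead(I, TLI))
      I->eraseFromParent();              // safe: nothing observable is lost
    return true;
  }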
We check only the cost of ADD diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b9e717cf763e..d1769fc3ebb3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -758,7 +758,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); - I->eraseFromParent(); + if (isInstructionTriviallyDead(I, TLI)) + I->eraseFromParent(); Condition = SimpleVal; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 2c0a70e44f57..cdd17fc516a8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -377,9 +377,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, &I, I.getModule()->getDataLayout(), TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); - CurAST->deleteValue(&I); I.replaceAllUsesWith(C); - I.eraseFromParent(); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } continue; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 77c77eb7d798..70bd9d3cca95 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -4442,6 +4442,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Determine an input position which will be dominated by the operands and // which will dominate the result. IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); + Rewriter.setInsertPoint(&*IP); // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. @@ -4473,7 +4474,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); } // Expand the ScaledReg portion. @@ -4491,14 +4492,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4508,11 +4509,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. 
if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4524,7 +4525,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4534,7 +4535,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4570,7 +4571,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index c5ca56360fc8..4f1052d81433 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -552,9 +553,39 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // two PHINodes, the iteration over the old PHIs remains valid, and the // mapping will just map us to the new node (which may not even be a PHI // node). + const DataLayout &DL = NewFunc->getParent()->getDataLayout(); + SmallSetVector<const Value *, 8> Worklist; for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) - if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]])) - recursivelySimplifyInstruction(PN); + if (isa<PHINode>(VMap[PHIToResolve[Idx]])) + Worklist.insert(PHIToResolve[Idx]); + + // Note that we must test the size on each iteration, the worklist can grow. + for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { + const Value *OrigV = Worklist[Idx]; + auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV)); + if (!I) + continue; + + // See if this instruction simplifies. + Value *SimpleV = SimplifyInstruction(I, DL); + if (!SimpleV) + continue; + + // Stash away all the uses of the old instruction so we can check them for + // recursive simplifications after a RAUW. This is cheaper than checking all + // uses of To on the recursive step in most cases. + for (const User *U : OrigV->users()) + Worklist.insert(cast<Instruction>(U)); + + // Replace the instruction with its simplified value. 
+ I->replaceAllUsesWith(SimpleV); + + // If the original instruction had no side effects, remove it. + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + else + VMap[OrigV] = I; + } // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index 1fbb19d2b8ad..e82c07fd7b59 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1294,6 +1294,13 @@ updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); } +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI ) { + return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + /// Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, @@ -1328,7 +1335,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast<AllocaInst>(BI)) - if (isa<Constant>(AI->getArraySize())) + if (allocaWouldBeStaticInEntry(AI)) continue; BI->setDebugLoc(TheCallDL); @@ -1626,7 +1633,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; } - if (!isa<Constant>(AI->getArraySize())) + if (!allocaWouldBeStaticInEntry(AI)) continue; // Keep track of the static allocas that we inline into the caller. @@ -1635,7 +1642,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. while (isa<AllocaInst>(I) && - isa<Constant>(cast<AllocaInst>(I)->getArraySize())) { + allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) { IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; } diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9658966779b9..0d5a25b8ebc5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -64,6 +64,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, DominatorTree &DT, LoopInfo &LI) { SmallVector<Use *, 16> UsesToRewrite; SmallVector<BasicBlock *, 8> ExitBlocks; + SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; bool Changed = false; @@ -115,7 +116,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, SmallVector<PHINode *, 16> AddedPHIs; SmallVector<PHINode *, 8> PostProcessPHIs; - SSAUpdater SSAUpdate; + SmallVector<PHINode *, 4> InsertedPHIs; + SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); // Insert the LCSSA phi's into all of the exit blocks dominated by the @@ -184,6 +186,14 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // Otherwise, do full PHI insertion. SSAUpdate.RewriteUse(*UseToRewrite); + + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need + // to post-process them to keep LCSSA form. 
+ for (PHINode *InsertedPN : InsertedPHIs) { + if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent())) + if (!L->contains(OtherLoop)) + PostProcessPHIs.push_back(InsertedPN); + } } // Post process PHI instructions that were inserted into another disjoint @@ -196,13 +206,19 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, Worklist.push_back(PostProcessPN); } - // Remove PHI nodes that did not have any uses rewritten. + // Keep track of PHI nodes that we want to remove because they did not have + // any uses rewritten. for (PHINode *PN : AddedPHIs) if (PN->use_empty()) - PN->eraseFromParent(); + PHIsToRemove.insert(PN); Changed = true; } + // Remove PHI nodes that did not have any uses rewritten. + for (PHINode *PN : PHIsToRemove) { + assert (PN->use_empty() && "Trying to remove a phi with uses."); + PN->eraseFromParent(); + } return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index b3a928bf7753..2846e8f235b7 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -327,6 +327,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, else NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I)); + SmallVector<BasicBlock *, 8> OuterLoopBlocks; + OuterLoopBlocks.push_back(NewBB); // Now that we know which blocks are in L and which need to be moved to // OuterLoop, move any blocks that need it. for (unsigned i = 0; i != L->getBlocks().size(); ++i) { @@ -334,12 +336,53 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (!BlocksInL.count(BB)) { // Move this block to the parent, updating the exit blocks sets L->removeBlockFromLoop(BB); - if ((*LI)[BB] == L) + if ((*LI)[BB] == L) { LI->changeLoopFor(BB, NewOuter); + OuterLoopBlocks.push_back(BB); + } --i; } } + // Split edges to exit blocks from the inner loop, if they emerged in the + // process of separating the outer one. + SmallVector<BasicBlock *, 8> ExitBlocks; + L->getExitBlocks(ExitBlocks); + SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + for (BasicBlock *ExitBlock : ExitBlockSet) { + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + } + } + + if (PreserveLCSSA) { + // Fix LCSSA form for L. Some values, which previously were only used inside + // L, can now be used in NewOuter loop. We need to insert phi-nodes for them + // in corresponding exit blocks. + + // Go through all instructions in OuterLoopBlocks and check if they are + // using operands from the inner loop. In this case we'll need to fix LCSSA + // for these instructions. 
+ SmallSetVector<Instruction *, 8> WorklistSet; + for (BasicBlock *OuterBB: OuterLoopBlocks) { + for (Instruction &I : *OuterBB) { + for (Value *Op : I.operands()) { + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || !L->contains(OpI)) + continue; + WorklistSet.insert(OpI); + } + } + } + SmallVector<Instruction *, 8> Worklist(WorklistSet.begin(), + WorklistSet.end()); + formLCSSAForInstructions(Worklist, *DT, *LI); + assert(NewOuter->isRecursivelyLCSSAForm(*DT) && + "LCSSA is broken after separating nested loops!"); + } + return NewOuter; } @@ -541,17 +584,12 @@ ReprocessLoop: SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); for (BasicBlock *ExitBlock : ExitBlockSet) { - for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); - PI != PE; ++PI) - // Must be exactly this loop: no subloops, parent loops, or non-loop preds - // allowed. - if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { - ++NumInserted; - Changed = true; - } - break; - } + if (any_of(predecessors(ExitBlock), + [L](BasicBlock *BB) { return !L->contains(BB); })) { + rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); + ++NumInserted; + Changed = true; + } } // If the header has more than two predecessors at this point (from the diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8b85e320d3b2..ee5733d20f4f 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -50,6 +50,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -220,6 +221,81 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +// A traits type that is intended to be used in graph algorithms. The graph it +// models starts at the loop header, and traverses the BasicBlocks that are in +// the loop body, but not the loop header. Since the loop header is skipped, +// the back edges are excluded. +struct LoopBodyTraits { + using NodeRef = std::pair<const Loop *, BasicBlock *>; + + // This wraps a const Loop * into the iterator, so we know which edges to + // filter out. 
+ class WrappedSuccIterator + : public iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits<succ_iterator>::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> { + using BaseT = iterator_adaptor_base< + WrappedSuccIterator, succ_iterator, + typename std::iterator_traits<succ_iterator>::iterator_category, + NodeRef, std::ptrdiff_t, NodeRef *, NodeRef>; + + const Loop *L; + + public: + WrappedSuccIterator(succ_iterator Begin, const Loop *L) + : BaseT(Begin), L(L) {} + + NodeRef operator*() const { return {L, *I}; } + }; + + struct LoopBodyFilter { + bool operator()(NodeRef N) const { + const Loop *L = N.first; + return N.second != L->getHeader() && L->contains(N.second); + } + }; + + using ChildIteratorType = + filter_iterator<WrappedSuccIterator, LoopBodyFilter>; + + static NodeRef getEntryNode(const Loop &G) { return {&G, G.getHeader()}; } + + static ChildIteratorType child_begin(NodeRef Node) { + return make_filter_range(make_range<WrappedSuccIterator>( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .begin(); + } + + static ChildIteratorType child_end(NodeRef Node) { + return make_filter_range(make_range<WrappedSuccIterator>( + {succ_begin(Node.second), Node.first}, + {succ_end(Node.second), Node.first}), + LoopBodyFilter{}) + .end(); + } +}; + +/// Returns true if the given loop body has a cycle, excluding the loop +/// itself. +static bool hasCyclesInLoopBody(const Loop &L) { + if (!L.empty()) + return true; + + for (const auto SCC : + make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L), + scc_iterator<Loop, LoopBodyTraits>::end(L))) { + if (SCC.size() > 1) { + DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n"); + DEBUG(L.dump()); + return true; + } + } + return false; +} + /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. class VectorizationReport : public LoopAccessReport { @@ -1782,12 +1858,14 @@ private: Instruction *UnsafeAlgebraInst; }; -static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { - if (L.empty()) - return V.push_back(&L); - +static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { + if (L.empty()) { + if (!hasCyclesInLoopBody(L)) + V.push_back(&L); + return; + } for (Loop *InnerL : L) - addInnerLoop(*InnerL, V); + addAcyclicInnerLoop(*InnerL, V); } /// The LoopVectorize Pass. @@ -4395,6 +4473,9 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // FIXME: The code is currently dead, since the loop gets sent to + // LoopVectorizationLegality is already an innermost loop. + // // We can only vectorize innermost loops. if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); @@ -6639,7 +6720,7 @@ bool LoopVectorizePass::runImpl( SmallVector<Loop *, 8> Worklist; for (Loop *L : *LI) - addInnerLoop(*L, Worklist); + addAcyclicInnerLoop(*L, Worklist); LoopsAnalyzed += Worklist.size(); |
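For reference on the amdgcn_fdiv_fast lowering added earlier in this diff (lowerFDIV_FAST in SIISelLowering.cpp): the emitted DAG scales the denominator by 2^-32 whenever its magnitude exceeds the 0x6f800000-encoded threshold so that RCP stays in its accurate range, then rescales the quotient. A scalar C++ sketch of that arithmetic follows; using 1.0f/x as a stand-in for the hardware RCP instruction is an assumption made for illustration (the real instruction is an approximation that flushes denormals):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  static float bitsToFloat(uint32_t Bits) {
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  }

  // Scalar model of the 2.5 ULP sequence built by lowerFDIV_FAST: scale the
  // denominator into RCP's accurate range, take the reciprocal, multiply by
  // the numerator, then undo the scaling.
  float fdivFastModel(float LHS, float RHS) {
    const float K0 = bitsToFloat(0x6f800000u); // ~7.9e28: above this, scale down
    const float K1 = bitsToFloat(0x2f800000u); // 2^-32: the down-scale factor
    float Scale = (std::fabs(RHS) > K0) ? K1 : 1.0f;
    float Rcp = 1.0f / (RHS * Scale);          // stands in for AMDGPUISD::RCP
    return Scale * (LHS * Rcp);
  }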