| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2018-01-24 20:23:48 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2018-01-24 20:23:48 +0000 |
| commit | a096e0bdf6cfa020569afca490d8e4c9ac8ebb01 (patch) | |
| tree | 39ef21ba905e021d44b9a5fb47336d4a864da27e /lib | |
| parent | d215fd3b74b90f5dc1964610926fcc2a20f959aa (diff) | |
Diffstat (limited to 'lib'): 22 files changed, 396 insertions, 261 deletions
```diff
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 9dc1ab4e6bb5..26ca8d4ee88c 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -2700,8 +2700,13 @@ public:
     // we still need to collect it due to original value is different.
     // And later we will need all original values as anchors during
     // finding the common Phi node.
+    // We also must reject the case when base offset is different and
+    // scale reg is not null, we cannot handle this case due to merge of
+    // different offsets will be used as ScaleReg.
     if (DifferentField != ExtAddrMode::MultipleFields &&
-        DifferentField != ExtAddrMode::ScaleField) {
+        DifferentField != ExtAddrMode::ScaleField &&
+        (DifferentField != ExtAddrMode::BaseOffsField ||
+         !NewAddrMode.ScaledReg)) {
       AddrModes.emplace_back(NewAddrMode);
       return true;
     }
```

```diff
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 8b9545da914e..3888226fa059 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -577,7 +577,8 @@ bool GlobalMerge::doInitialization(Module &M) {
   for (auto &GV : M.globals()) {
     // Merge is safe for "normal" internal or external globals only
     if (GV.isDeclaration() || GV.isThreadLocal() ||
-        GV.hasSection() || GV.hasImplicitSection())
+        GV.hasSection() || GV.hasImplicitSection() ||
+        GV.hasDLLExportStorageClass())
       continue;

     // It's not safe to merge globals that may be preempted
```
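Context for the CodeGenPrepare hunk above: the pass merges addressing modes that differ in exactly one component by PHI-ing that component. The sketch below is a hypothetical, simplified model (not the real `ExtAddrMode`) of why a differing base offset is only mergeable when no scaled index register is live.

```cpp
#include <cassert>

// Hypothetical, simplified model of an addressing mode:
//   EA = BaseReg + Scale * ScaledReg + Offset
struct AddrMode {
  const void *BaseReg = nullptr;
  const void *ScaledReg = nullptr; // null when no scaled index is used
  int Scale = 0;
  long Offset = 0;
};

// Merging {Base+4} and {Base+8} into {Base + phi(4, 8)} is representable.
// With a live ScaledReg, the PHI of the differing offsets would have to be
// folded into the scaled-index slot, which the merge cannot express, so
// that combination is rejected (mirroring the new condition in the hunk).
bool canMergeDifferingBaseOffsets(const AddrMode &A, const AddrMode &B) {
  assert(A.BaseReg == B.BaseReg && "modes are assumed to share a base");
  return A.ScaledReg == nullptr && B.ScaledReg == nullptr;
}
```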
```diff
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 45078081987a..11acbe687a31 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -719,15 +719,14 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
     CurSrcPair = Pair;
     ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
                             !DisableAdvCopyOpt, TII);
-    ValueTrackerResult Res;
-    bool ShouldRewrite = false;
-    do {
-      // Follow the chain of copies until we reach the top of the use-def chain
-      // or find a more suitable source.
-      Res = ValTracker.getNextSource();
+
+    // Follow the chain of copies until we find a more suitable source, a phi
+    // or have to abort.
+    while (true) {
+      ValueTrackerResult Res = ValTracker.getNextSource();
+      // Abort at the end of a chain (without finding a suitable source).
       if (!Res.isValid())
-        break;
+        return false;

       // Insert the Def -> Use entry for the recently found source.
       ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
@@ -763,24 +762,19 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
       if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
         return false;

+      // Keep following the chain if the value isn't any better yet.
       const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
-      ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
-                                                CurSrcPair.SubReg);
-    } while (!ShouldRewrite);
-
-    // Continue looking for new sources...
-    if (Res.isValid())
-      continue;
+      if (!TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC, CurSrcPair.SubReg))
+        continue;

-    // Do not continue searching for a new source if the there's at least
-    // one use-def which cannot be rewritten.
-    if (!ShouldRewrite)
-      return false;
-  }
+      // We currently cannot deal with subreg operands on PHI instructions
+      // (see insertPHI()).
+      if (PHICount > 0 && CurSrcPair.SubReg != 0)
+        continue;

-  if (PHICount >= RewritePHILimit) {
-    DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
-    return false;
+      // We found a suitable source, and are done with this chain.
+      break;
+    }
   }

   // If we did not find a more suitable source, there is nothing to optimize.
@@ -799,6 +793,9 @@ insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
   assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");

   const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+  // NewRC is only correct if no subregisters are involved. findNextSource()
+  // should have rejected those cases already.
+  assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
   unsigned NewVR = MRI->createVirtualRegister(NewRC);
   MachineBasicBlock *MBB = OrigPHI->getParent();
   MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
```
```diff
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 81bff4d7eefa..2c6b724c02df 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3842,9 +3842,16 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
       EVT ExtVT;
       if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
           isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
-        // Only add this load if we can make it more narrow.
-        if (ExtVT.bitsLT(Load->getMemoryVT()))
+
+        // ZEXTLOAD is already small enough.
+        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+            ExtVT.bitsGE(Load->getMemoryVT()))
+          continue;
+
+        // Use LE to convert equal sized loads to zext.
+        if (ExtVT.bitsLE(Load->getMemoryVT()))
           Loads.insert(Load);
+
         continue;
       }
       return false;
@@ -3899,11 +3906,13 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
   if (Loads.size() == 0)
     return false;

+  DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
   SDValue MaskOp = N->getOperand(1);

   // If it exists, fixup the single node we allow in the tree that needs
   // masking.
   if (FixupNode) {
+    DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
     SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                               FixupNode->getValueType(0),
                               SDValue(FixupNode, 0), MaskOp);
@@ -3914,14 +3923,21 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {

   // Narrow any constants that need it.
   for (auto *LogicN : NodesWithConsts) {
-    auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
-    SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
-                              SDValue(C, 0), MaskOp);
-    DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
+    SDValue Op0 = LogicN->getOperand(0);
+    SDValue Op1 = LogicN->getOperand(1);
+
+    if (isa<ConstantSDNode>(Op0))
+      std::swap(Op0, Op1);
+
+    SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
+                              Op1, MaskOp);
+
+    DAG.UpdateNodeOperands(LogicN, Op0, And);
   }

   // Create narrow loads.
   for (auto *Load : Loads) {
+    DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
     SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                               SDValue(Load, 0), MaskOp);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
@@ -5209,7 +5225,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
       return SDValue();

     // Loads must share the same base address
-    BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+    BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
     int64_t ByteOffsetFromBase = 0;
     if (!Base)
       Base = Ptr;
@@ -12928,7 +12944,7 @@ void DAGCombiner::getStoreMergeCandidates(
     StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
   EVT MemVT = St->getMemoryVT();
   SDValue Val = peekThroughBitcast(St->getValue());
@@ -12949,7 +12965,7 @@ void DAGCombiner::getStoreMergeCandidates(
   EVT LoadVT;
   if (IsLoadSrc) {
     auto *Ld = cast<LoadSDNode>(Val);
-    LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+    LBasePtr = BaseIndexOffset::match(Ld, DAG);
     LoadVT = Ld->getMemoryVT();
     // Load and store should be the same type.
     if (MemVT != LoadVT)
@@ -12968,7 +12984,7 @@ void DAGCombiner::getStoreMergeCandidates(
         return false;
       // The Load's Base Ptr must also match
       if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
-        auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+        auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
         if (LoadVT != OtherLd->getMemoryVT())
           return false;
         if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
@@ -12992,7 +13008,7 @@ void DAGCombiner::getStoreMergeCandidates(
           Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
         return false;
     }
-    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+    Ptr = BaseIndexOffset::match(Other, DAG);
     return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
   };
@@ -13365,7 +13381,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
         if (Ld->getMemoryVT() != MemVT)
           break;

-        BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+        BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
        // If this is not the first ptr that we check.
        int64_t LdOffset = 0;
        if (LdBasePtr.getBase().getNode()) {
@@ -17432,44 +17448,46 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
   unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();

   // Check for BaseIndexOffset matching.
-  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
-  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
+  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
   int64_t PtrDiff;
-  if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
-    return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
+  if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
+    if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
+      return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));

-  // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
-  // able to calculate their relative offset if at least one arises
-  // from an alloca. However, these allocas cannot overlap and we
-  // can infer there is no alias.
-  if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
-    if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
-      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-      // If the base are the same frame index but the we couldn't find a
-      // constant offset, (indices are different) be conservative.
-      if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
-                     !MFI.isFixedObjectIndex(B->getIndex())))
-        return false;
-    }
+    // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+    // able to calculate their relative offset if at least one arises
+    // from an alloca. However, these allocas cannot overlap and we
+    // can infer there is no alias.
+    if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
+      if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
+        MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+        // If the base are the same frame index but the we couldn't find a
+        // constant offset, (indices are different) be conservative.
+        if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
+                       !MFI.isFixedObjectIndex(B->getIndex())))
+          return false;
+      }

-  bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
-  bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
-  bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
-  bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
-  bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
-  bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+    bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
+    bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
+    bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
+    bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
+    bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
+    bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());

-  // If of mismatched base types or checkable indices we can check
-  // they do not alias.
-  if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
-       (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
-      (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
-    return false;
+    // If of mismatched base types or checkable indices we can check
+    // they do not alias.
+    if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
+         (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
+        (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
+      return false;
+  }

-  // If we know required SrcValue1 and SrcValue2 have relatively large alignment
-  // compared to the size and offset of the access, we may be able to prove they
-  // do not alias. This check is conservative for now to catch cases created by
-  // splitting vector types.
+  // If we know required SrcValue1 and SrcValue2 have relatively large
+  // alignment compared to the size and offset of the access, we may be able
+  // to prove they do not alias. This check is conservative for now to catch
+  // cases created by splitting vector types.
   int64_t SrcValOffset0 = Op0->getSrcValueOffset();
   int64_t SrcValOffset1 = Op1->getSrcValueOffset();
   unsigned OrigAlignment0 = Op0->getOriginalAlignment();
@@ -17479,8 +17497,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
     int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
     int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;

-    // There is no overlap between these relatively aligned accesses of similar
-    // size. Return no alias.
+    // There is no overlap between these relatively aligned accesses of
+    // similar size. Return no alias.
     if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
         (OffAlign1 + NumBytes1) <= OffAlign0)
       return false;
@@ -17643,7 +17661,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {

   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

   // We must have a base and an offset.
   if (!BasePtr.getBase().getNode())
@@ -17669,7 +17687,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
       break;

     // Find the base pointer and offset for this memory node.
-    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG);

     // Check that the base pointer is the same as the original one.
     if (!BasePtr.equalBaseIndex(Ptr, DAG))
```
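The rewritten `isAlias()` above now only runs its BaseIndexOffset checks when both decompositions produced a base, but the core disjointness test is unchanged. It is plain interval arithmetic; a minimal standalone restatement:

```cpp
#include <cstdint>

// Accesses of NumBytes0/NumBytes1 bytes whose start addresses differ by the
// signed distance PtrDiff overlap unless one interval ends before the other
// starts. Mirrors `!((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0))`.
bool mayOverlap(int64_t PtrDiff, int64_t NumBytes0, int64_t NumBytes1) {
  bool firstEndsBeforeSecond = NumBytes0 <= PtrDiff;     // [0,N0) vs [PtrDiff,...)
  bool secondEndsBeforeFirst = PtrDiff + NumBytes1 <= 0; // [PtrDiff,PtrDiff+N1) vs [0,...)
  return !(firstEndsBeforeSecond || secondEndsBeforeFirst);
}

// Example: two 4-byte stores 4 bytes apart are disjoint, mayOverlap(4, 4, 4)
// == false; 2 bytes apart they clash, mayOverlap(2, 4, 4) == true.
```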
```diff
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bb1dc17b7a1b..b566c232cbc3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2965,12 +2965,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     case ISD::ZERO_EXTEND:
       LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
                         DAG.getValueType(AtomicType));
-      RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+      RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
       ExtRes = LHS;
       break;
     case ISD::ANY_EXTEND:
       LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
-      RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+      RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
       break;
     default:
       llvm_unreachable("Invalid atomic op extension");
```
```diff
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4c8b63d2f239..3ffc6fa9a059 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7947,11 +7947,8 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
   if (VT.getSizeInBits() / 8 != Bytes)
     return false;

-  SDValue Loc = LD->getOperand(1);
-  SDValue BaseLoc = Base->getOperand(1);
-
-  auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
-  auto LocDecomp = BaseIndexOffset::match(Loc, *this);
+  auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
+  auto LocDecomp = BaseIndexOffset::match(LD, *this);

   int64_t Offset = 0;
   if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
```

```diff
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index d5980919d03c..da1574f60524 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -21,6 +21,9 @@ using namespace llvm;
 bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
                                      const SelectionDAG &DAG, int64_t &Off) {
+  // Conservatively fail if we a match failed..
+  if (!Base.getNode() || !Other.Base.getNode())
+    return false;
   // Initial Offset difference.
   Off = Other.Offset - Offset;

@@ -72,13 +75,29 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 }

 /// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
+                                       const SelectionDAG &DAG) {
+  SDValue Ptr = N->getBasePtr();
+
   // (((B + I*M) + c)) + c ...
   SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
   SDValue Index = SDValue();
   int64_t Offset = 0;
   bool IsIndexSignExt = false;

+  // pre-inc/pre-dec ops are components of EA.
+  if (N->getAddressingMode() == ISD::PRE_INC) {
+    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+      Offset += C->getSExtValue();
+    else // If unknown, give up now.
+      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+      Offset -= C->getSExtValue();
+    else // If unknown, give up now.
+      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  }
+
   // Consume constant adds & ors with appropriate masking.
   while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
     if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
```
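The `BaseIndexOffset::match()` change above takes the whole memory node instead of just its base pointer so that pre-increment/pre-decrement addressing can fold the step into the constant offset before the usual peeling of ADD/OR nodes. A toy model of that decomposition (illustration only, not the LLVM API):

```cpp
#include <cstdint>
#include <optional>

// Toy decomposition result: Base + Offset (the real BaseIndexOffset also
// carries an index register; omitted for brevity).
struct Decomposed {
  const void *Base;
  int64_t Offset;
};

enum class AddressingMode { Unindexed, PreInc, PreDec };

// For a pre-indexed access the effective address is base +/- step, so a
// constant step is folded into Offset up front; a non-constant step makes
// the whole match fail, which equalBaseIndex() now treats conservatively.
std::optional<Decomposed> decompose(const void *Base, AddressingMode AM,
                                    std::optional<int64_t> Step) {
  Decomposed D{Base, 0};
  if (AM == AddressingMode::PreInc || AM == AddressingMode::PreDec) {
    if (!Step)
      return std::nullopt; // unknown step: give up, nothing to compare
    D.Offset += (AM == AddressingMode::PreInc) ? *Step : -*Step;
  }
  return D;
}
```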
```diff
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 224ae1a3236a..b29a33ac1c14 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -132,9 +132,18 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
     setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");

-    // Darwin 10 and higher has an optimized __bzero.
-    if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) {
-      setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero");
+    // Some darwins have an optimized __bzero/bzero function.
+    switch (TT.getArch()) {
+    case Triple::x86:
+    case Triple::x86_64:
+      if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
+        setLibcallName(RTLIB::BZERO, "__bzero");
+      break;
+    case Triple::aarch64:
+      setLibcallName(RTLIB::BZERO, "bzero");
+      break;
+    default:
+      break;
     }

     if (darwinHasSinCos(TT)) {
```

```diff
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index ee067a912e3c..f7170e714b9b 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -954,7 +954,12 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
     NewGV->setLinkage(GlobalValue::InternalLinkage);

   Constant *C = NewGV;
-  if (DGV)
+  // Only create a bitcast if necessary. In particular, with
+  // DebugTypeODRUniquing we may reach metadata in the destination module
+  // containing a GV from the source module, in which case SGV will be
+  // the same as DGV and NewGV, and TypeMap.get() will assert since it
+  // assumes it is being invoked on a type in the source module.
+  if (DGV && NewGV != SGV)
     C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType()));

   if (DGV && NewGV != DGV) {
```
```diff
diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp
index 82b81ccc24da..5fd5bde9f1eb 100644
--- a/lib/MC/MCCodeView.cpp
+++ b/lib/MC/MCCodeView.cpp
@@ -76,6 +76,14 @@ bool CodeViewContext::addFile(MCStreamer &OS, unsigned FileNumber,
   return true;
 }

+MCCVFunctionInfo *CodeViewContext::getCVFunctionInfo(unsigned FuncId) {
+  if (FuncId >= Functions.size())
+    return nullptr;
+  if (Functions[FuncId].isUnallocatedFunctionInfo())
+    return nullptr;
+  return &Functions[FuncId];
+}
+
 bool CodeViewContext::recordFunctionId(unsigned FuncId) {
   if (FuncId >= Functions.size())
     Functions.resize(FuncId + 1);
@@ -247,6 +255,67 @@ void CodeViewContext::emitFileChecksumOffset(MCObjectStreamer &OS,
   OS.EmitValueImpl(SRE, 4);
 }

+void CodeViewContext::addLineEntry(const MCCVLineEntry &LineEntry) {
+  size_t Offset = MCCVLines.size();
+  auto I = MCCVLineStartStop.insert(
+      {LineEntry.getFunctionId(), {Offset, Offset + 1}});
+  if (!I.second)
+    I.first->second.second = Offset + 1;
+  MCCVLines.push_back(LineEntry);
+}
+
+std::vector<MCCVLineEntry>
+CodeViewContext::getFunctionLineEntries(unsigned FuncId) {
+  std::vector<MCCVLineEntry> FilteredLines;
+  auto I = MCCVLineStartStop.find(FuncId);
+  if (I != MCCVLineStartStop.end()) {
+    MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(FuncId);
+    for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
+         ++Idx) {
+      unsigned LocationFuncId = MCCVLines[Idx].getFunctionId();
+      if (LocationFuncId == FuncId) {
+        // This was a .cv_loc directly for FuncId, so record it.
+        FilteredLines.push_back(MCCVLines[Idx]);
+      } else {
+        // Check if the current location is inlined in this function. If it is,
+        // synthesize a statement .cv_loc at the original inlined call site.
+        auto I = SiteInfo->InlinedAtMap.find(LocationFuncId);
+        if (I != SiteInfo->InlinedAtMap.end()) {
+          MCCVFunctionInfo::LineInfo &IA = I->second;
+          // Only add the location if it differs from the previous location.
+          // Large inlined calls will have many .cv_loc entries and we only need
+          // one line table entry in the parent function.
+          if (FilteredLines.empty() ||
+              FilteredLines.back().getFileNum() != IA.File ||
+              FilteredLines.back().getLine() != IA.Line ||
+              FilteredLines.back().getColumn() != IA.Col) {
+            FilteredLines.push_back(MCCVLineEntry(
+                MCCVLines[Idx].getLabel(),
+                MCCVLoc(FuncId, IA.File, IA.Line, IA.Col, false, false)));
+          }
+        }
+      }
+    }
+  }
+  return FilteredLines;
+}
+
+std::pair<size_t, size_t> CodeViewContext::getLineExtent(unsigned FuncId) {
+  auto I = MCCVLineStartStop.find(FuncId);
+  // Return an empty extent if there are no cv_locs for this function id.
+  if (I == MCCVLineStartStop.end())
+    return {~0ULL, 0};
+  return I->second;
+}
+
+ArrayRef<MCCVLineEntry> CodeViewContext::getLinesForExtent(size_t L, size_t R) {
+  if (R <= L)
+    return None;
+  if (L >= MCCVLines.size())
+    return None;
+  return makeArrayRef(&MCCVLines[L], R - L);
+}
+
 void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
                                                unsigned FuncId,
                                                const MCSymbol *FuncBegin,
```
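The interesting part of `getFunctionLineEntries()` above is collapsing runs of `.cv_loc` records that resolve to the same inlined call site, so a heavily inlined callee contributes a single parent line-table entry. A minimal sketch of that dedup with hypothetical types:

```cpp
#include <vector>

// Hypothetical stand-in for an inlined-at location (file/line/column).
struct Loc {
  unsigned File, Line, Col;
  bool operator==(const Loc &O) const {
    return File == O.File && Line == O.Line && Col == O.Col;
  }
};

// Keep only the first of each consecutive run of identical call sites,
// mirroring the FilteredLines.back() comparison in the hunk above.
std::vector<Loc> collapseInlinedLocs(const std::vector<Loc> &Sites) {
  std::vector<Loc> Out;
  for (const Loc &L : Sites)
    if (Out.empty() || !(Out.back() == L))
      Out.push_back(L);
  return Out;
}
```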
```diff
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index c2d3ae31c624..b85b4e082996 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -868,6 +868,40 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
     if (OpFlags & AArch64II::MO_GOT) {
       I.setDesc(TII.get(AArch64::LOADgot));
       I.getOperand(1).setTargetFlags(OpFlags);
+    } else if (TM.getCodeModel() == CodeModel::Large) {
+      // Materialize the global using movz/movk instructions.
+      unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+      auto InsertPt = std::next(I.getIterator());
+      auto MovZ =
+          BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi))
+              .addDef(MovZDstReg);
+      MovZ->addOperand(MF, I.getOperand(1));
+      MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
+                                         AArch64II::MO_NC);
+      MovZ->addOperand(MF, MachineOperand::CreateImm(0));
+      constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
+
+      auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags,
+                           unsigned Offset, unsigned ForceDstReg) {
+        unsigned DstReg =
+            ForceDstReg ? ForceDstReg
+                        : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+        auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(),
+                            TII.get(AArch64::MOVKXi))
+                        .addDef(DstReg)
+                        .addReg(SrcReg);
+        MovI->addOperand(MF, MachineOperand::CreateGA(
+                                 GV, MovZ->getOperand(1).getOffset(), Flags));
+        MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
+        constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
+        return DstReg;
+      };
+      unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
+                                  AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
+      DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
+      BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
+      I.eraseFromParent();
+      return true;
     } else {
       I.setDesc(TII.get(AArch64::MOVaddr));
       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
```
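The selected movz/movk sequence materializes a 64-bit absolute address sixteen bits at a time: `movz` sets bits [0,16) and clears the rest, then each `movk` inserts the G1/G2/G3 chunk while preserving the others. The bit manipulation, modeled in plain C++ (an illustration of the instruction semantics, not compiler code):

```cpp
#include <cstdint>

// movz: materialize a 16-bit immediate at the given shift, zeroing the rest.
uint64_t movz(uint16_t Imm, unsigned Shift) { return (uint64_t)Imm << Shift; }

// movk: insert a 16-bit immediate at the given shift, keeping other bits.
uint64_t movk(uint64_t Reg, uint16_t Imm, unsigned Shift) {
  uint64_t Mask = 0xFFFFull << Shift;
  return (Reg & ~Mask) | ((uint64_t)Imm << Shift);
}

// The four-instruction sequence built by the hunk above (G0..G3 chunks).
uint64_t materializeLargeCodeModelAddress(uint64_t Addr) {
  uint64_t R = movz(Addr & 0xFFFF, 0);       // MOVZXi, MO_G0 | MO_NC
  R = movk(R, (Addr >> 16) & 0xFFFF, 16);    // MOVKXi, MO_G1 | MO_NC
  R = movk(R, (Addr >> 32) & 0xFFFF, 32);    // MOVKXi, MO_G2 | MO_NC
  return movk(R, (Addr >> 48) & 0xFFFF, 48); // MOVKXi, MO_G3
}
```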
```diff
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 740861851185..f08c50540656 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -821,7 +821,6 @@ namespace llvm {
                 MutableArrayRef<int> NewMask, unsigned Options = None);
     OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
                 MutableArrayRef<int> NewMask);
-    OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
     OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
                 ResultStack &Results);
     OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@@ -1139,25 +1138,6 @@ OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
   return concat(Out[0], Out[1], Results);
 }

-OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
-  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
-
-  int VecLen = SM.Mask.size();
-  SmallVector<uint8_t,128> UsedBytes(VecLen);
-  bool HasUnused = false;
-  for (int I = 0; I != VecLen; ++I) {
-    if (SM.Mask[I] != -1)
-      UsedBytes[I] = 0xFF;
-    else
-      HasUnused = true;
-  }
-  if (!HasUnused)
-    return Va;
-  SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
-  Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
-  return OpRef::res(Results.top());
-}
-
 OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
                          ResultStack &Results) {
   DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
```

```diff
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f9de65fcb1df..f0e8b11a3d9c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -142,6 +142,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

+  // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
   // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
   for (MVT VT : MVT::integer_valuetypes()) {
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -1154,6 +1157,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::Hi:              return "PPCISD::Hi";
   case PPCISD::Lo:              return "PPCISD::Lo";
   case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::ATOMIC_CMP_SWAP_8:  return "PPCISD::ATOMIC_CMP_SWAP_8";
+  case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
   case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
   case PPCISD::DYNAREAOFFSET:   return "PPCISD::DYNAREAOFFSET";
   case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
@@ -8834,6 +8839,42 @@ SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }

+// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
+// compared to a value that is atomically loaded (atomic loads zero-extend).
+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
+         "Expecting an atomic compare-and-swap here.");
+  SDLoc dl(Op);
+  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
+  EVT MemVT = AtomicNode->getMemoryVT();
+  if (MemVT.getSizeInBits() >= 32)
+    return Op;
+
+  SDValue CmpOp = Op.getOperand(2);
+  // If this is already correctly zero-extended, leave it alone.
+  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
+  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
+    return Op;
+
+  // Clear the high bits of the compare operand.
+  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
+  SDValue NewCmpOp =
+    DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
+                DAG.getConstant(MaskVal, dl, MVT::i32));
+
+  // Replace the existing compare operand with the properly zero-extended one.
+  SmallVector<SDValue, 4> Ops;
+  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
+    Ops.push_back(AtomicNode->getOperand(i));
+  Ops[2] = NewCmpOp;
+  MachineMemOperand *MMO = AtomicNode->getMemOperand();
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
+  auto NodeTy =
+    (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
+  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
+}
+
 SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -9325,6 +9366,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerREM(Op, DAG);
   case ISD::BSWAP:
     return LowerBSWAP(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP:
+    return LowerATOMIC_CMP_SWAP(Op, DAG);
   }
 }
```
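The comment in `LowerATOMIC_CMP_SWAP()` above states the whole bug: the atomic load side of a sub-word compare-and-swap zero-extends, so a sign-extended "expected" operand can never compare equal for values with the top bit set. A self-contained illustration of the masking fix:

```cpp
#include <cstdint>

// A sub-word atomic load zero-extends: the byte 0x80 becomes 0x00000080.
uint32_t atomicLoadZext8(uint8_t Mem) { return Mem; }

// If the expected value arrives sign-extended (e.g. int8_t -128 widened to
// 0xFFFFFF80), a raw 32-bit compare always fails. Masking it to the memory
// width -- the AND with (1 << bits) - 1 inserted by the patch -- repairs it.
bool cmpXchgCompare(uint8_t Mem, int8_t Expected) {
  uint32_t Loaded = atomicLoadZext8(Mem);
  uint32_t SignExt = (uint32_t)(int32_t)Expected; // wrong: 0xFFFFFF80
  uint32_t ZeroExt = SignExt & 0xFF;              // fixed: 0x00000080
  (void)SignExt; // Loaded == SignExt would be false even for Mem == 0x80
  return Loaded == ZeroExt;                       // true for Mem == 0x80
}
```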
```diff
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b119e5b4a564..b3215a84829e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -430,6 +430,11 @@ namespace llvm {
       /// The 4xf32 load used for v4i1 constants.
       QVLFSb,

+      /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
+      /// except they ensure that the compare input is zero-extended for
+      /// sub-word versions because the atomic loads zero-extend.
+      ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
+
       /// GPRC = TOC_ENTRY GA, TOC
       /// Loads the entry for GA from the TOC, where the TOC base is given by
       /// the last operand.
@@ -955,6 +960,7 @@ namespace llvm {
     SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
```

```diff
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index a932d05b24ee..43dcc4479cf0 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -257,6 +257,13 @@ def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
 def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
                            [SDNPHasChain, SDNPOptInGlue]>;

+// PPC-specific atomic operations.
+def PPCatomicCmpSwap_8 :
+  SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3,
+         [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def PPCatomicCmpSwap_16 :
+  SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3,
+         [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
 def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
                            [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
@@ -1710,6 +1717,11 @@ let usesCustomInserter = 1 in {
   }
 }

+def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
+          (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
+          (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;
+
 // Instructions to support atomic operations
 let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
 def LBARX : XForm_1<31,  52, (outs gprc:$rD), (ins memrr:$src),
```

```diff
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f1ce430f3323..f2ffba7d5418 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2375,6 +2375,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                 .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
                 .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
     Flags |= Prefix;
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      // We don't have real instr with the given prefix
+      // let's use the prefix as the instr.
+      // TODO: there could be several prefixes one after another
+      Flags = X86::IP_NO_PREFIX;
+      break;
+    }
     Name = Parser.getTok().getString();
     Parser.Lex(); // eat the prefix
     // Hack: we could have something like "rep # some comment" or
```
```diff
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a6f56877bd64..e7d9334abe14 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7893,8 +7893,14 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
     IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
                                  VT.getVectorNumElements());
   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
-  return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
-                     SDLoc(V), VT, IndicesVec, SrcVec);
+  if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
+    SrcVec =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
+                    SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
+  }
+  if (VT == MVT::v16i8)
+    return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
+  return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
 }

 SDValue
@@ -18262,6 +18268,18 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
   }

+  // For v64i1 without 64-bit support we need to split and rejoin.
+  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+    assert(Subtarget.hasBWI() && "Expected BWI to be legal");
+    SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
+    SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
+    SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
+    SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
+    SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
+    SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+  }
+
   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
     SDValue Op1Scalar;
     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
@@ -28652,13 +28670,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       }
     }

+    SDValue NewV1 = V1; // Save operand in case early exit happens.
     if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
-                                V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
-                                ShuffleVT) &&
+                                NewV1, DL, DAG, Subtarget, Shuffle,
+                                ShuffleSrcVT, ShuffleVT) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
-      Res = DAG.getBitcast(ShuffleSrcVT, V1);
+      Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
      DCI.AddToWorklist(Res.getNode());
@@ -28680,33 +28699,36 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      }
    }

+    SDValue NewV1 = V1; // Save operands in case early exit happens.
+    SDValue NewV2 = V2;
     if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
-                                 V1, V2, DL, DAG, Subtarget, Shuffle,
+                                 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
                                  ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
-      V1 = DAG.getBitcast(ShuffleSrcVT, V1);
-      DCI.AddToWorklist(V1.getNode());
-      V2 = DAG.getBitcast(ShuffleSrcVT, V2);
-      DCI.AddToWorklist(V2.getNode());
-      Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
+      NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
+      DCI.AddToWorklist(NewV1.getNode());
+      NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
+      DCI.AddToWorklist(NewV2.getNode());
+      Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
      DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }

-    if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
-                                        AllowIntDomain, V1, V2, DL, DAG,
-                                        Subtarget, Shuffle, ShuffleVT,
-                                        PermuteImm) &&
+    NewV1 = V1; // Save operands in case early exit happens.
+    NewV2 = V2;
+    if (matchBinaryPermuteVectorShuffle(
+            MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
+            NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
-      V1 = DAG.getBitcast(ShuffleVT, V1);
-      DCI.AddToWorklist(V1.getNode());
-      V2 = DAG.getBitcast(ShuffleVT, V2);
-      DCI.AddToWorklist(V2.getNode());
-      Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+      NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
+      DCI.AddToWorklist(NewV1.getNode());
+      NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
+      DCI.AddToWorklist(NewV2.getNode());
+      Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
                        DAG.getConstant(PermuteImm, DL, MVT::i8));
      DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
```
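The v64i1 SELECT path above splits the operands into two v32i1 halves, selects each, and concatenates, since 32-bit targets have no 64-bit mask registers. The same shape in a host-side sketch using std::bitset (illustration only):

```cpp
#include <bitset>
#include <cstdint>

// Select between two 64-bit masks by working on 32-bit halves and rejoining,
// mirroring extractSubVector / getSelect / CONCAT_VECTORS in the hunk above.
std::bitset<64> select64(bool Cond, std::bitset<64> A, std::bitset<64> B) {
  uint64_t ABits = A.to_ullong(), BBits = B.to_ullong();
  uint32_t Lo = Cond ? (uint32_t)ABits : (uint32_t)BBits;                 // bits [0,32)
  uint32_t Hi = Cond ? (uint32_t)(ABits >> 32) : (uint32_t)(BBits >> 32); // bits [32,64)
  return std::bitset<64>(((uint64_t)Hi << 32) | Lo);
}
```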
```diff
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 223eed3048db..967d67a84bc0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -754,7 +754,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // type remains the same.
   if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
     MVT LegalVT = LT.second;
-    if (LegalVT.getVectorElementType().getSizeInBits() ==
+    if (LegalVT.isVector() &&
+        LegalVT.getVectorElementType().getSizeInBits() ==
             Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
         LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
```

```diff
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index c0cd1ea74a74..026fab5dbd3b 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -648,7 +648,7 @@ private:
         // track in a CHI. In the PDom walk, there can be values in the
         // stack which are not control dependent e.g., nested loop.
         if (si != RenameStack.end() && si->second.size() &&
-            DT->dominates(Pred, si->second.back()->getParent())) {
+            DT->properlyDominates(Pred, si->second.back()->getParent())) {
           C.Dest = BB;                     // Assign the edge
           C.I = si->second.pop_back_val(); // Assign the argument
           DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
```
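The GVNHoist fix hinges on a standard dominator-tree subtlety: every block dominates itself, so `dominates(Pred, BB)` is trivially true when `Pred` is the block in question, while the rename-stack logic needs strict dominance. A minimal model of the two predicates (not LLVM's implementation):

```cpp
// Walk the immediate-dominator chain: A dominates B iff A appears on the
// path from B up to the root. Every node therefore dominates itself, and
// properlyDominates() is the same relation with the A == B case excluded.
struct Node {
  const Node *IDom; // immediate dominator; null at the tree's root
};

bool dominates(const Node *A, const Node *B) {
  for (const Node *N = B; N; N = N->IDom)
    if (N == A)
      return true;
  return false;
}

bool properlyDominates(const Node *A, const Node *B) {
  return A != B && dominates(A, B);
}
```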
```diff
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b6cc26..525425bd0f0c 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -14,7 +14,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@@ -177,9 +176,8 @@ class StructurizeCFG : public RegionPass {
   Region *ParentRegion;

   DominatorTree *DT;
-  LoopInfo *LI;

-  SmallVector<RegionNode *, 8> Order;
+  std::deque<RegionNode *> Order;
   BBSet Visited;

   BBPhiMap DeletedPhis;
@@ -204,7 +202,7 @@ class StructurizeCFG : public RegionPass {

   void gatherPredicates(RegionNode *N);

-  void collectInfos();
+  void analyzeNode(RegionNode *N);

   void insertConditions(bool Loops);

@@ -258,7 +256,6 @@ public:
     AU.addRequired<DivergenceAnalysis>();
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();

     AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
@@ -292,55 +289,17 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {

 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
-  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
+  assert(Visited.empty());
+  assert(Predicates.empty());
+  assert(Loops.empty());
+  assert(LoopPreds.empty());

-  // The reverse post-order traversal of the list gives us an ordering close
-  // to what we want.  The only problem with it is that sometimes backedges
-  // for outer loops will be visited before backedges for inner loops.
-  for (RegionNode *RN : RPOT) {
-    BasicBlock *BB = RN->getEntry();
-    Loop *Loop = LI->getLoopFor(BB);
-    ++LoopBlocks[Loop];
+  // This must be RPO order for the back edge detection to work
+  for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
+    // FIXME: Is there a better order to use for structurization?
+    Order.push_back(RN);
+    analyzeNode(RN);
   }
-
-  unsigned CurrentLoopDepth = 0;
-  Loop *CurrentLoop = nullptr;
-  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BasicBlock *BB = (*I)->getEntry();
-    unsigned LoopDepth = LI->getLoopDepth(BB);
-
-    if (is_contained(Order, *I))
-      continue;
-
-    if (LoopDepth < CurrentLoopDepth) {
-      // Make sure we have visited all blocks in this loop before moving back to
-      // the outer loop.
-
-      auto LoopI = I;
-      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
-        LoopI++;
-        BasicBlock *LoopBB = (*LoopI)->getEntry();
-        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
-          --BlockCount;
-          Order.push_back(*LoopI);
-        }
-      }
-    }
-
-    CurrentLoop = LI->getLoopFor(BB);
-    if (CurrentLoop)
-      LoopBlocks[CurrentLoop]--;
-
-    CurrentLoopDepth = LoopDepth;
-    Order.push_back(*I);
-  }
-
-  // This pass originally used a post-order traversal and then operated on
-  // the list in reverse. Now that we are using a reverse post-order traversal
-  // rather than re-working the whole pass to operate on the list in order,
-  // we just reverse the list and continue to operate on it in reverse.
-  std::reverse(Order.begin(), Order.end());
 }

 /// \brief Determine the end of the loops
@@ -466,32 +425,19 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 }

 /// \brief Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
-  // Reset predicate
-  Predicates.clear();
-
-  // and loop infos
-  Loops.clear();
-  LoopPreds.clear();
+void StructurizeCFG::analyzeNode(RegionNode *RN) {
+  DEBUG(dbgs() << "Visiting: "
+               << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+               << RN->getEntry()->getName() << '\n');

-  // Reset the visited nodes
-  Visited.clear();
-
-  for (RegionNode *RN : reverse(Order)) {
-    DEBUG(dbgs() << "Visiting: "
-                 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-                 << RN->getEntry()->getName() << " Loop Depth: "
-                 << LI->getLoopDepth(RN->getEntry()) << "\n");
-
-    // Analyze all the conditions leading to a node
-    gatherPredicates(RN);
+  // Analyze all the conditions leading to a node
+  gatherPredicates(RN);

-    // Remember that we've seen this node
-    Visited.insert(RN->getEntry());
+  // Remember that we've seen this node
+  Visited.insert(RN->getEntry());

-    // Find the last back edges
-    analyzeLoops(RN);
-  }
+  // Find the last back edges
+  analyzeLoops(RN);
 }

 /// \brief Insert the missing branch conditions
@@ -664,7 +610,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
 BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
   LLVMContext &Context = Func->getContext();
   BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
-                       Order.back()->getEntry();
+                       Order.front()->getEntry();

   BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, Func, Insert);
   DT->addNewBlock(Flow, Dominator);
@@ -744,7 +690,8 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {

 /// Take one node from the order vector and wire it up
 void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                               BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.pop_back_val();
+  RegionNode *Node = Order.front();
+  Order.pop_front();
   Visited.insert(Node->getEntry());

   if (isPredictableTrue(Node)) {
@@ -768,7 +715,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,
     PrevNode = Node;

     while (!Order.empty() && !Visited.count(LoopEnd) &&
-           dominatesPredicates(Entry, Order.back())) {
+           dominatesPredicates(Entry, Order.front())) {
       handleLoops(false, LoopEnd);
     }

@@ -779,7 +726,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

 void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                  BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.back();
+  RegionNode *Node = Order.front();
   BasicBlock *LoopStart = Node->getEntry();

   if (!Loops.count(LoopStart)) {
@@ -924,10 +871,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   ParentRegion = R;

   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

   orderNodes();
-  collectInfos();
+
   createFlow();
   insertConditions(false);
   insertConditions(true);
```
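With LoopInfo gone, `orderNodes()` is now just a reverse post-order walk stored in a deque that the rest of the pass consumes from the front (where it used to pop from the back of a reversed vector). A generic sketch of RPO construction, assuming a plain adjacency-list graph:

```cpp
#include <deque>
#include <functional>
#include <set>
#include <vector>

using Graph = std::vector<std::vector<int>>; // adjacency list

// Reverse post-order = DFS post-order, reversed. Pushing each node at the
// front after visiting its successors yields the reversed order directly;
// consuming from the front then sees a node before its (non-back-edge)
// successors, which is what the pass's back-edge detection relies on.
std::deque<int> reversePostOrder(const Graph &G, int Entry) {
  std::deque<int> Order;
  std::set<int> Seen;
  std::function<void(int)> Dfs = [&](int N) {
    if (!Seen.insert(N).second)
      return;
    for (int Succ : G[N])
      Dfs(Succ);
    Order.push_front(N);
  };
  Dfs(Entry);
  return Order;
}
```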
```diff
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ef54385c452..64f206ea92eb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2630,9 +2630,12 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
   Instruction *LastInduction = VecInd;
   for (unsigned Part = 0; Part < UF; ++Part) {
     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
-    recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
     if (isa<TruncInst>(EntryVal))
       addMetadata(LastInduction, EntryVal);
+    else
+      recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
     LastInduction = cast<Instruction>(addFastMathFlag(
         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
   }
@@ -2754,15 +2757,17 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {

   // If we haven't yet vectorized the induction variable, splat the scalar
   // induction variable, and build the necessary step vectors.
+  // TODO: Don't do it unless the vectorized IV is really required.
   if (!VectorizedIV) {
     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *EntryPart =
           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
-      recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
       if (Trunc)
         addMetadata(EntryPart, Trunc);
+      else
+        recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
     }
   }
```
```diff
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7ccd3faec44..f301fc361abc 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1347,7 +1347,6 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
           DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
                 Lane << " from " << *Scalar << ".\n");
           ExternalUses.emplace_back(Scalar, nullptr, Lane);
-          continue;
         }
         for (User *U : Scalar->users()) {
           DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -4417,13 +4416,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  return tryToVectorizeList(VL, R, true);
 }

 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           ArrayRef<Value *> BuildVector,
-                                           bool AllowReorder,
-                                           bool NeedExtraction) {
+                                           bool AllowReorder) {
   if (VL.size() < 2)
     return false;

@@ -4517,12 +4514,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                    << "\n");
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

-      ArrayRef<Value *> EmptyArray;
-      ArrayRef<Value *> BuildVectorSlice;
-      if (!BuildVector.empty())
-        BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
-      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+      R.buildTree(Ops);
       // TODO: check if we can allow reordering for more cases.
       if (AllowReorder && R.shouldReorder()) {
         // Conceptually, there is nothing actually preventing us from trying to
         // reorder a larger list. In practice, we only handle reordering of
         // reductions. However, at this point, we only expect to get here when
         // there are exactly two operations.
         assert(Ops.size() == 2);
-        assert(BuildVectorSlice.empty());
         Value *ReorderedOps[] = {Ops[1], Ops[0]};
         R.buildTree(ReorderedOps, None);
       }
@@ -4550,31 +4541,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                            << " and with tree size "
                                            << ore::NV("TreeSize", R.getTreeSize()));

-        Value *VectorizedRoot = R.vectorizeTree();
-
-        // Reconstruct the build vector by extracting the vectorized root. This
-        // way we handle the case where some elements of the vector are
-        // undefined.
-        //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
-        if (!BuildVectorSlice.empty()) {
-          // The insert point is the last build vector instruction. The
-          // vectorized root will precede it. This guarantees that we get an
-          // instruction. The vectorized tree could have been constant folded.
-          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
-          unsigned VecIdx = 0;
-          for (auto &V : BuildVectorSlice) {
-            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                        ++BasicBlock::iterator(InsertAfter));
-            Instruction *I = cast<Instruction>(V);
-            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
-            Instruction *Extract =
-                cast<Instruction>(Builder.CreateExtractElement(
-                    VectorizedRoot, Builder.getInt32(VecIdx++)));
-            I->setOperand(1, Extract);
-            I->moveAfter(Extract);
-            InsertAfter = I;
-          }
-        }
+        R.vectorizeTree();
         // Move to the next bundle.
         I += VF - 1;
         NextInst = I + 1;
@@ -5495,11 +5462,9 @@ private:
 ///
 /// Returns true if it matches
 static bool findBuildVector(InsertElementInst *LastInsertElem,
-                            SmallVectorImpl<Value *> &BuildVector,
                             SmallVectorImpl<Value *> &BuildVectorOpds) {
   Value *V = nullptr;
   do {
-    BuildVector.push_back(LastInsertElem);
     BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
     V = LastInsertElem->getOperand(0);
     if (isa<UndefValue>(V))
       break;
     LastInsertElem = dyn_cast<InsertElementInst>(V);
     if (!LastInsertElem || !LastInsertElem->hasOneUse())
       return false;
   } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
   return true;
 }
@@ -5517,11 +5481,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
 ///
 /// \return true if it matches.
 static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVector,
                                SmallVectorImpl<Value *> &BuildVectorOpds) {
   Value *V;
   do {
-    BuildVector.push_back(IV);
     BuildVectorOpds.push_back(IV->getInsertedValueOperand());
     V = IV->getAggregateOperand();
     if (isa<UndefValue>(V))
       break;
     IV = dyn_cast<InsertValueInst>(V);
     if (!IV || !IV->hasOneUse())
       return false;
   } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
   return true;
 }
@@ -5706,27 +5667,25 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;

-  SmallVector<Value *, 16> BuildVector;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+  if (!findBuildAggregate(IVI, BuildVectorOpds))
     return false;

   DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+  return tryToVectorizeList(BuildVectorOpds, R);
 }

 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
-  SmallVector<Value *, 16> BuildVector;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+  if (!findBuildVector(IEI, BuildVectorOpds))
     return false;

   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+  return tryToVectorizeList(BuildVectorOpds, R);
 }

 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5804,8 +5763,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            None, AllowReorder)) {
+      if (NumElts > 1 &&
+          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
```
