| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-27 19:50:45 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-27 19:50:54 +0000 |
| commit | 08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 | |
| tree | 041e72e32710b1e742516d8c9f1575bf0116d3e3 /llvm/lib | |
| parent | 4b4fe385e49bd883fd183b5f21c1ea486c722e61 | |
vendor/llvm-project/llvmorg-15-init-17827-gd77882e66779
vendor/llvm-project/llvmorg-15-init-17826-g1f8ae9d7e7e4
Diffstat (limited to 'llvm/lib')
136 files changed, 2807 insertions, 1468 deletions
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 6d9084215dee..ded842b92ae1 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -133,7 +133,8 @@ void CodeMetrics::analyzeBasicBlock(
       // When preparing for LTO, liberally consider calls as inline
       // candidates.
       if (!Call->isNoInline() && IsLoweredToCall &&
-          ((F->hasInternalLinkage() && F->hasOneUse()) || PrepareForLTO)) {
+          ((F->hasInternalLinkage() && F->hasOneLiveUse()) ||
+           PrepareForLTO)) {
         ++NumInlineCandidates;
       }
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 9f8a5e472f01..8192ed56caf0 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -185,8 +185,8 @@ private:
 public:
   InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}

-  virtual void emitInstructionAnnot(const Instruction *I,
-                                    formatted_raw_ostream &OS) override;
+  void emitInstructionAnnot(const Instruction *I,
+                            formatted_raw_ostream &OS) override;
 };

 /// Carry out call site analysis, in order to evaluate inlinability.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4691aebbdfe1..21fe448218bc 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1591,12 +1591,6 @@ static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
       !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
     return nullptr;

-  // We have (icmp Pred0, A, B) & (icmp Pred1, A, B).
-  // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
-  // can eliminate Op1 from this 'and'.
-  if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
-    return Op0;
-
   // Check for any combination of predicates that are guaranteed to be disjoint.
   if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
       (Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) ||
@@ -1616,12 +1610,6 @@ static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
       !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
     return nullptr;

-  // We have (icmp Pred0, A, B) | (icmp Pred1, A, B).
-  // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
-  // can eliminate Op0 from this 'or'.
-  if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
-    return Op1;
-
   // Check for any combination of predicates that cover the entire range of
   // possibilities.
   if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bed684b7652a..aa35f253bc5f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1500,9 +1500,7 @@ bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
   Value *Ptr0 = VL[0];

   using DistOrdPair = std::pair<int64_t, int>;
-  auto Compare = [](const DistOrdPair &L, const DistOrdPair &R) {
-    return L.first < R.first;
-  };
+  auto Compare = llvm::less_first();
   std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
   Offsets.emplace(0, 0);
   int Cnt = 1;
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 31e4380e4379..413ec6dd4b42 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -115,9 +115,7 @@ struct AllocFnsTy {
 // FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
 // know which functions are nounwind, noalias, nocapture parameters, etc.
 static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
-    {LibFunc_malloc, {MallocLike, 1, 0, -1, -1, MallocFamily::Malloc}},
     {LibFunc_vec_malloc, {MallocLike, 1, 0, -1, -1, MallocFamily::VecMalloc}},
-    {LibFunc_valloc, {MallocLike, 1, 0, -1, -1, MallocFamily::Malloc}},
     {LibFunc_Znwj, {OpNewLike, 1, 0, -1, -1, MallocFamily::CPPNew}},                    // new(unsigned int)
     {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1, MallocFamily::CPPNew}},     // new(unsigned int, nothrow)
     {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1, 1, MallocFamily::CPPNewAligned}}, // new(unsigned int, align_val_t)
@@ -142,13 +140,9 @@ static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
     {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCArrayNew}},      // new[](unsigned int, nothrow)
     {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1, -1, MallocFamily::MSVCArrayNew}},          // new[](unsigned long long)
     {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1, -1, MallocFamily::MSVCArrayNew}}, // new[](unsigned long long, nothrow)
-    {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1, 0, MallocFamily::Malloc}},
     {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1, 0, MallocFamily::Malloc}},
-    {LibFunc_calloc, {CallocLike, 2, 0, 1, -1, MallocFamily::Malloc}},
     {LibFunc_vec_calloc, {CallocLike, 2, 0, 1, -1, MallocFamily::VecMalloc}},
-    {LibFunc_realloc, {ReallocLike, 2, 1, -1, -1, MallocFamily::Malloc}},
     {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1, -1, MallocFamily::VecMalloc}},
-    {LibFunc_reallocf, {ReallocLike, 2, 1, -1, -1, MallocFamily::Malloc}},
     {LibFunc_strdup, {StrDupLike, 1, -1, -1, -1, MallocFamily::Malloc}},
     {LibFunc_dunder_strdup, {StrDupLike, 1, -1, -1, -1, MallocFamily::Malloc}},
     {LibFunc_strndup, {StrDupLike, 2, 1, -1, -1, MallocFamily::Malloc}},
@@ -488,7 +482,6 @@ struct FreeFnsTy {
 // clang-format off
 static const std::pair<LibFunc, FreeFnsTy> FreeFnData[] = {
-    {LibFunc_free, {1, MallocFamily::Malloc}},
     {LibFunc_vec_free, {1, MallocFamily::VecMalloc}},
     {LibFunc_ZdlPv, {1, MallocFamily::CPPNew}},      // operator delete(void*)
     {LibFunc_ZdaPv, {1, MallocFamily::CPPNewArray}}, // operator delete[](void*)
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index c52b27a38fe9..efe60586979a 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -164,7 +164,8 @@ static void addIntrinsicToSummary(
     SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls,
     DominatorTree &DT) {
   switch (CI->getCalledFunction()->getIntrinsicID()) {
-  case Intrinsic::type_test: {
+  case Intrinsic::type_test:
+  case Intrinsic::public_type_test: {
     auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
     auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
     if (!TypeId)
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index 7571bd0059cc..5b0fbca23891 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -21,6 +21,10 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;

+static cl::opt<bool> EnableAddPhiTranslation(
+    "gvn-add-phi-translation", cl::init(false), cl::Hidden,
+    cl::desc("Enable phi-translation of add instructions"));
+
 static bool CanPHITrans(Instruction *Inst) {
   if (isa<PHINode>(Inst) || isa<GetElementPtrInst>(Inst))
@@ -410,14 +414,14 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
     return Result;
   }

-#if 0
-  // FIXME: This code works, but it is unclear that we actually want to insert
-  // a big chain of computation in order to make a value available in a block.
-  // This needs to be evaluated carefully to consider its cost trade offs.
-  // Handle add with a constant RHS.
-  if (Inst->getOpcode() == Instruction::Add &&
+  if (EnableAddPhiTranslation && Inst->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(Inst->getOperand(1))) {
+
+    // FIXME: This code works, but it is unclear that we actually want to insert
+    // a big chain of computation in order to make a value available in a block.
+    // This needs to be evaluated carefully to consider its cost trade offs.
+
     // PHI translate the LHS.
     Value *OpVal = InsertPHITranslatedSubExpr(Inst->getOperand(0),
                                               CurBB, PredBB, DT, NewInsts);
@@ -431,7 +435,6 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
     NewInsts.push_back(Res);
     return Res;
   }
-#endif

   return nullptr;
 }
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index d46248aa3889..2958a5054afc 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11153,20 +11153,6 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
     return true;
   }

-  // Try to prove (Pred, LHS, RHS) using isImpliedViaGuard.
-  auto ProveViaGuard = [&](const BasicBlock *Block) {
-    if (isImpliedViaGuard(Block, Pred, LHS, RHS))
-      return true;
-    if (ProvingStrictComparison) {
-      auto ProofFn = [&](ICmpInst::Predicate P) {
-        return isImpliedViaGuard(Block, P, LHS, RHS);
-      };
-      if (SplitAndProve(ProofFn))
-        return true;
-    }
-    return false;
-  };
-
   // Try to prove (Pred, LHS, RHS) using isImpliedCond.
   auto ProveViaCond = [&](const Value *Condition, bool Inverse) {
     const Instruction *CtxI = &BB->front();
@@ -11193,9 +11179,6 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
   PredBB = BB->getSinglePredecessor();
   for (std::pair<const BasicBlock *, const BasicBlock *> Pair(PredBB, BB);
        Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
-    if (ProveViaGuard(Pair.first))
-      return true;
-
     const BranchInst *BlockEntryPredicate =
         dyn_cast<BranchInst>(Pair.first->getTerminator());
     if (!BlockEntryPredicate || BlockEntryPredicate->isUnconditional())
@@ -11218,6 +11201,15 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
       return true;
   }

+  // Check conditions due to any @llvm.experimental.guard intrinsics.
+  auto *GuardDecl = F.getParent()->getFunction(
+      Intrinsic::getName(Intrinsic::experimental_guard));
+  if (GuardDecl)
+    for (const auto *GU : GuardDecl->users())
+      if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
+        if (Guard->getFunction() == BB->getParent() && DT.dominates(Guard, BB))
+          if (ProveViaCond(Guard->getArgOperand(0), false))
+            return true;
   return false;
 }
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index 201e64770766..e128187bac49 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -75,7 +75,9 @@ void llvm::findDevirtualizableCallsForTypeTest(
     SmallVectorImpl<DevirtCallSite> &DevirtCalls,
     SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI,
     DominatorTree &DT) {
-  assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test);
+  assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test ||
+         CI->getCalledFunction()->getIntrinsicID() ==
+             Intrinsic::public_type_test);

   const Module *M = CI->getParent()->getParent()->getParent();
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 1f3798d1338e..2dd671b4ab9e 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -4266,9 +4266,10 @@ bool llvm::getConstantDataArrayInfo(const Value *V,
   return true;
 }

-/// This function computes the length of a null-terminated C string pointed to
-/// by V. If successful, it returns true and returns the string in Str.
-/// If unsuccessful, it returns false.
+/// Extract bytes from the initializer of the constant array V, which need
+/// not be a nul-terminated string. On success, store the bytes in Str and
+/// return true. When TrimAtNul is set, Str will contain only the bytes up
+/// to but not including the first nul. Return false on failure.
 bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
                                  uint64_t Offset, bool TrimAtNul) {
   ConstantDataArraySlice Slice;
@@ -6543,7 +6544,6 @@ bool llvm::matchSimpleRecurrence(const BinaryOperator *I, PHINode *&P,
 static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
                             const Value *RHS, const DataLayout &DL,
                             unsigned Depth) {
-  assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!");
   if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
     return true;
@@ -6656,14 +6656,12 @@ static Optional<bool> isImpliedCondMatchingOperands(CmpInst::Predicate APred,
 /// Return true if "icmp APred X, C1" implies "icmp BPred X, C2" is true.
 /// Return false if "icmp APred X, C1" implies "icmp BPred X, C2" is false.
 /// Otherwise, return None if we can't infer anything.
-static Optional<bool>
-isImpliedCondMatchingImmOperands(CmpInst::Predicate APred,
-                                 const ConstantInt *C1,
-                                 CmpInst::Predicate BPred,
-                                 const ConstantInt *C2) {
-  ConstantRange DomCR =
-      ConstantRange::makeExactICmpRegion(APred, C1->getValue());
-  ConstantRange CR = ConstantRange::makeExactICmpRegion(BPred, C2->getValue());
+static Optional<bool> isImpliedCondMatchingImmOperands(CmpInst::Predicate APred,
+                                                       const APInt &C1,
+                                                       CmpInst::Predicate BPred,
+                                                       const APInt &C2) {
+  ConstantRange DomCR = ConstantRange::makeExactICmpRegion(APred, C1);
+  ConstantRange CR = ConstantRange::makeExactICmpRegion(BPred, C2);
   ConstantRange Intersection = DomCR.intersectWith(CR);
   ConstantRange Difference = DomCR.difference(CR);
   if (Intersection.isEmptySet())
@@ -6701,14 +6699,9 @@ static Optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   // Can we infer anything when the LHS operands match and the RHS operands are
   // constants (not necessarily matching)?
-  if (ALHS == BLHS && isa<ConstantInt>(ARHS) && isa<ConstantInt>(BRHS)) {
-    if (Optional<bool> Implication = isImpliedCondMatchingImmOperands(
-            APred, cast<ConstantInt>(ARHS), BPred, cast<ConstantInt>(BRHS)))
-      return Implication;
-    // No amount of additional analysis will infer the second condition, so
-    // early exit.
-    return None;
-  }
+  const APInt *AC, *BC;
+  if (ALHS == BLHS && match(ARHS, m_APInt(AC)) && match(BRHS, m_APInt(BC)))
+    return isImpliedCondMatchingImmOperands(APred, *AC, BPred, *BC);

   if (APred == BPred)
     return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth);
@@ -6761,14 +6754,8 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
   if (RHSOp0->getType()->isVectorTy() != LHS->getType()->isVectorTy())
     return None;

-  Type *OpTy = LHS->getType();
-  assert(OpTy->isIntOrIntVectorTy(1) && "Expected integer type only!");
-
-  // FIXME: Extending the code below to handle vectors.
-  if (OpTy->isVectorTy())
-    return None;
-
-  assert(OpTy->isIntegerTy(1) && "implied by above");
+  assert(LHS->getType()->isIntOrIntVectorTy(1) &&
+         "Expected integer type only!");

   // Both LHS and RHS are icmps.
   const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 1d6c21bd66d1..1943b5db94c3 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7788,7 +7788,7 @@ static Expected<bool> getEnableSplitLTOUnitFlag(BitstreamCursor &Stream,
     case bitc::FS_FLAGS: { // [flags]
       uint64_t Flags = Record[0];
       // Scan flags.
-      assert(Flags <= 0x7f && "Unexpected bits in flag");
+      assert(Flags <= 0xff && "Unexpected bits in flag");

       return Flags & 0x8;
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index e0050a47a6f6..32a10ad41d1f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2795,12 +2795,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
         DL.getTypeAllocSize(Op->getType()).getFixedSize())
       return OpExpr;

-    // Otherwise the pointer is smaller than the resultant integer, mask off
-    // the high bits so we are sure to get a proper truncation if the input is
-    // a constant expr.
-    unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
-    const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
-    return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
+    break; // Error
   }

   case Instruction::Sub: {
diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.h b/llvm/lib/CodeGen/AsmPrinter/WasmException.h
index 2abbe37cb6d9..419b569d123c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WasmException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.h
@@ -28,7 +28,7 @@ public:
   void endModule() override;
   void beginFunction(const MachineFunction *MF) override {}
-  virtual void markFunctionEnd() override;
+  void markFunctionEnd() override;
   void endFunction(const MachineFunction *MF) override;

 protected:
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index f21c1bf4e914..ad51bab8f30b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -515,9 +515,14 @@ void AtomicExpand::expandAtomicStore(StoreInst *SI) {
   // It is the responsibility of the target to only signal expansion via
   // shouldExpandAtomicRMW in cases where this is required and possible.
   IRBuilder<> Builder(SI);
+  AtomicOrdering Ordering = SI->getOrdering();
+  assert(Ordering != AtomicOrdering::NotAtomic);
+  AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
+                                   ? AtomicOrdering::Monotonic
+                                   : Ordering;
   AtomicRMWInst *AI = Builder.CreateAtomicRMW(
       AtomicRMWInst::Xchg, SI->getPointerOperand(), SI->getValueOperand(),
-      SI->getAlign(), SI->getOrdering());
+      SI->getAlign(), RMWOrdering);
   SI->eraseFromParent();

   // Now we have an appropriate swap instruction, lower it as usual.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index b6c762b93ca5..b8f6fc9bbcde 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2568,8 +2568,6 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
   }
 };

-} // end anonymous namespace
-
 #ifndef NDEBUG
 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
   AM.print(OS);
@@ -2617,6 +2615,8 @@ LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
 }
 #endif

+} // end anonymous namespace
+
 namespace {

 /// This class provides transaction based operation on the IR.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index da054b9c14fb..05a25bc3078e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1142,7 +1142,8 @@ bool CombinerHelper::matchCombineDivRem(MachineInstr &MI,
     if (MI.getParent() == UseMI.getParent() &&
         ((IsDiv && UseMI.getOpcode() == RemOpcode) ||
          (!IsDiv && UseMI.getOpcode() == DivOpcode)) &&
-        matchEqualDefs(MI.getOperand(2), UseMI.getOperand(2))) {
+        matchEqualDefs(MI.getOperand(2), UseMI.getOperand(2)) &&
+        matchEqualDefs(MI.getOperand(1), UseMI.getOperand(1))) {
       OtherMI = &UseMI;
       return true;
     }
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index dbdcfe0b6f0b..2f9187bbf2ad 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -151,11 +151,11 @@ public:
     LLVM_DEBUG(dbgs() << "Checking DILocation from " << *CurrInst
                       << " was copied to " << MI);
 #endif
-    // We allow insts in the entry block to have a debug loc line of 0 because
+    // We allow insts in the entry block to have no debug loc because
     // they could have originated from constants, and we don't want a jumpy
     // debug experience.
     assert((CurrInst->getDebugLoc() == MI.getDebugLoc() ||
-            MI.getDebugLoc().getLine() == 0) &&
+            (MI.getParent()->isEntryBlock() && !MI.getDebugLoc())) &&
            "Line info was not transferred to all instructions");
   }
 };
@@ -3020,11 +3020,9 @@ bool IRTranslator::translate(const Instruction &Inst) {

 bool IRTranslator::translate(const Constant &C, Register Reg) {
   // We only emit constants into the entry block from here. To prevent jumpy
-  // debug behaviour set the line to 0.
+  // debug behaviour remove debug line.
   if (auto CurrInstDL = CurBuilder->getDL())
-    EntryBuilder->setDebugLoc(DILocation::get(C.getContext(), 0, 0,
-                                              CurrInstDL.getScope(),
-                                              CurrInstDL.getInlinedAt()));
+    EntryBuilder->setDebugLoc(DebugLoc());

   if (auto CI = dyn_cast<ConstantInt>(&C))
     EntryBuilder->buildConstant(Reg, *CI);
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index ef49d3888f2b..191596dbf53e 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -1330,7 +1330,7 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) {
   const MachineOperand &MO = MI.getOperand(0);
   unsigned InstrNum = MI.getOperand(1).getImm();

-  auto EmitBadPHI = [this, &MI, InstrNum](void) -> bool {
+  auto EmitBadPHI = [this, &MI, InstrNum]() -> bool {
     // Helper lambda to do any accounting when we fail to find a location for
     // a DBG_PHI. This can happen if DBG_PHIs are malformed, or refer to a
     // dead stack slot, for example.
@@ -3136,8 +3136,7 @@ bool InstrRefBasedLDV::emitTransfers(
                             MI->getDebugLoc()->getInlinedAt());
       Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI);
     }
-    llvm::sort(Insts,
-               [](const auto &A, const auto &B) { return A.first < B.first; });
+    llvm::sort(Insts, llvm::less_first());

     // Insert either before or after the designated point...
     if (P.MBB) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 2aafb746aa2c..abf36b3f4c67 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -300,13 +300,15 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
   SmallVector<unsigned, 8> RegsToErase;
   bool ReadsPhysRegs = false;
   bool isOrigDef = false;
-  unsigned Dest;
+  Register Dest;
+  unsigned DestSubReg;

   // Only optimize rematerialize case when the instruction has one def, since
   // otherwise we could leave some dead defs in the code. This case is
   // extremely rare.
   if (VRM && MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
       MI->getDesc().getNumDefs() == 1) {
     Dest = MI->getOperand(0).getReg();
+    DestSubReg = MI->getOperand(0).getSubReg();
     unsigned Original = VRM->getOriginal(Dest);
     LiveInterval &OrigLI = LIS.getInterval(Original);
     VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
@@ -384,8 +386,18 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
   if (isOrigDef && DeadRemats && !HasLiveVRegUses &&
       TII.isTriviallyReMaterializable(*MI)) {
     LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false);
-    VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
+    VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator();
+    VNInfo *VNI = NewLI.getNextValue(Idx, Alloc);
     NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
+
+    if (DestSubReg) {
+      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+      auto *SR = NewLI.createSubRange(
+          Alloc, TRI->getSubRegIndexLaneMask(DestSubReg));
+      SR->addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(),
+                                           SR->getNextValue(Idx, Alloc)));
+    }
+
     pop_back();
     DeadRemats->insert(MI);
     const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
diff --git a/llvm/lib/CodeGen/MachineFunctionPass.cpp b/llvm/lib/CodeGen/MachineFunctionPass.cpp
index 99494122d608..477310f59112 100644
--- a/llvm/lib/CodeGen/MachineFunctionPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/PrintPasses.h"

 using namespace llvm;
 using namespace ore;
@@ -70,6 +71,17 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
   if (ShouldEmitSizeRemarks)
     CountBefore = MF.getInstructionCount();

+  // For --print-changed, if the function name is a candidate, save the
+  // serialized MF to be compared later.
+  // TODO Implement --filter-passes.
+  SmallString<0> BeforeStr, AfterStr;
+  bool ShouldPrintChanged = PrintChanged != ChangePrinter::None &&
+                            isFunctionInPrintList(MF.getName());
+  if (ShouldPrintChanged) {
+    raw_svector_ostream OS(BeforeStr);
+    MF.print(OS);
+  }
+
   bool RV = runOnMachineFunction(MF);

   if (ShouldEmitSizeRemarks) {
@@ -97,6 +109,23 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
   MFProps.set(SetProperties);
   MFProps.reset(ClearedProperties);
+
+  // For --print-changed, print if the serialized MF has changed. Modes other
+  // than quiet/verbose are unimplemented and treated the same as 'quiet'.
+  if (ShouldPrintChanged) {
+    raw_svector_ostream OS(AfterStr);
+    MF.print(OS);
+    if (BeforeStr != AfterStr) {
+      StringRef Arg;
+      if (const PassInfo *PI = Pass::lookupPassInfo(getPassID()))
+        Arg = PI->getPassArgument();
+      errs() << ("*** IR Dump After " + getPassName() + " (" + Arg + ") on " +
+                 MF.getName() + " ***\n" + AfterStr);
+    } else if (PrintChanged == ChangePrinter::Verbose) {
+      errs() << ("*** IR Dump After " + getPassName() + " on " + MF.getName() +
+                 " omitted because no change ***\n");
+    }
+  }
   return RV;
 }
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 1115c2a27956..87e2f9f20021 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
@@ -69,6 +70,8 @@ static CallInst::TailCallKind getOverridingTailCallKind(const Function &F) {

 static bool lowerObjCCall(Function &F, const char *NewFn,
                           bool setNonLazyBind = false) {
+  assert(IntrinsicInst::mayLowerToFunctionCall(F.getIntrinsicID()) &&
+         "Pre-ISel intrinsics do lower into regular function calls");
   if (F.use_empty())
     return false;
@@ -107,7 +110,9 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
     IRBuilder<> Builder(CI->getParent(), CI->getIterator());
     SmallVector<Value *, 8> Args(CI->args());
-    CallInst *NewCI = Builder.CreateCall(FCache, Args);
+    SmallVector<llvm::OperandBundleDef, 1> BundleList;
+    CI->getOperandBundlesAsDefs(BundleList);
+    CallInst *NewCI = Builder.CreateCall(FCache, Args, BundleList);
     NewCI->setName(CI->getName());

     // Try to set the most appropriate TailCallKind based on both the current
diff --git a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
index 7327f9e52efc..54bb4a31ef49 100644
--- a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -47,7 +47,7 @@ public:

   bool runOnMachineFunction(MachineFunction &MF) override;

-  virtual MachineFunctionProperties getRequiredProperties() const override {
+  MachineFunctionProperties getRequiredProperties() const override {
     return MachineFunctionProperties().set(
         MachineFunctionProperties::Property::IsSSA);
   }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4a54d7ebf8a9..9c6cb7c3a4e2 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -135,6 +135,12 @@ static cl::opt<bool> GreedyRegClassPriorityTrumpsGlobalness(
              "more important then whether the range is global"),
     cl::Hidden);

+static cl::opt<bool> GreedyReverseLocalAssignment(
+    "greedy-reverse-local-assignment",
+    cl::desc("Reverse allocation order of local live ranges, such that "
+             "shorter local live ranges will tend to be allocated first"),
+    cl::Hidden);
+
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
                                        createGreedyRegisterAllocator);

@@ -297,11 +303,10 @@ void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) {
   } else {
     // Giant live ranges fall back to the global assignment heuristic, which
     // prevents excessive spilling in pathological cases.
-    bool ReverseLocal = TRI->reverseLocalAssignment();
     const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
-    bool ForceGlobal =
-        !ReverseLocal && (Size / SlotIndex::InstrDist) >
-                             (2 * RegClassInfo.getNumAllocatableRegs(&RC));
+    bool ForceGlobal = !ReverseLocalAssignment &&
+                       (Size / SlotIndex::InstrDist) >
+                           (2 * RegClassInfo.getNumAllocatableRegs(&RC));
     unsigned GlobalBit = 0;

     if (Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
@@ -309,7 +314,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) {
       // Allocate original local ranges in linear instruction order. Since they
       // are singly defined, this produces optimal coloring in the absence of
       // global interference and other constraints.
-      if (!ReverseLocal)
+      if (!ReverseLocalAssignment)
         Prio = LI->beginIndex().getInstrDistance(Indexes->getLastIndex());
       else {
         // Allocating bottom up may allow many short LRGs to be assigned first
@@ -2528,6 +2533,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
           ? GreedyRegClassPriorityTrumpsGlobalness
           : TRI->regClassPriorityTrumpsGlobalness(*MF);

+  ReverseLocalAssignment = GreedyReverseLocalAssignment.getNumOccurrences()
+                               ? GreedyReverseLocalAssignment
+                               : TRI->reverseLocalAssignment();
+
   ExtraInfo.emplace();
   EvictAdvisor =
       getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 316b12d0213b..483f59ed8e8e 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -270,6 +270,8 @@ private:
   /// machine function.
   bool RegClassPriorityTrumpsGlobalness;

+  bool ReverseLocalAssignment;
+
 public:
   RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index edb0756e8c3b..654879115ff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4877,9 +4877,16 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
   if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
     return Res;

+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);

+  // canonicalize constant to RHS (vector doesn't have to splat)
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N1, N0);
+
   // If the type is twice as wide is legal, transform the mulhu to a wider
   // multiply plus a shift.
   if (VT.isSimple() && !VT.isVector()) {
@@ -4887,8 +4894,8 @@
     unsigned SimpleSize = Simple.getSizeInBits();
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
     if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
-      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
-      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
+      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
+      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
       Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
       // Compute the high part as N1.
       Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
@@ -4908,19 +4915,26 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
   if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
     return Res;

+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);

+  // canonicalize constant to RHS (vector doesn't have to splat)
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N1, N0);
+
   // (umul_lohi N0, 0) -> (0, 0)
-  if (isNullConstant(N->getOperand(1))) {
+  if (isNullConstant(N1)) {
     SDValue Zero = DAG.getConstant(0, DL, VT);
     return CombineTo(N, Zero, Zero);
   }

   // (umul_lohi N0, 1) -> (N0, 0)
-  if (isOneConstant(N->getOperand(1))) {
+  if (isOneConstant(N1)) {
     SDValue Zero = DAG.getConstant(0, DL, VT);
-    return CombineTo(N, N->getOperand(0), Zero);
+    return CombineTo(N, N0, Zero);
   }

   // If the type is twice as wide is legal, transform the mulhu to a wider
@@ -4930,8 +4944,8 @@
     unsigned SimpleSize = Simple.getSizeInBits();
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
     if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
-      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
-      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
+      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
+      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
       Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
       // Compute the high part as N1.
       Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
@@ -7247,6 +7261,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
 // Otherwise if matching a general funnel shift, it should be clear.
 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
                            SelectionDAG &DAG, bool IsRotate) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
   // If EltSize is a power of 2 then:
   //
   //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
@@ -7278,19 +7293,20 @@
   // always invokes undefined behavior for 32-bit X.
   //
   // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
+  // This allows us to peek through any operations that only affect Mask's
+  // un-demanded bits.
   //
-  // NOTE: We can only do this when matching an AND and not a general
-  // funnel shift.
+  // NOTE: We can only do this when matching operations which won't modify the
+  // least Log2(EltSize) significant bits and not a general funnel shift.
   unsigned MaskLoBits = 0;
-  if (IsRotate && Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
-    if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
-      KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0));
-      unsigned Bits = Log2_64(EltSize);
-      if (NegC->getAPIntValue().getActiveBits() <= Bits &&
-          ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
-        Neg = Neg.getOperand(0);
-        MaskLoBits = Bits;
-      }
+  if (IsRotate && isPowerOf2_64(EltSize)) {
+    unsigned Bits = Log2_64(EltSize);
+    APInt DemandedBits =
+        APInt::getLowBitsSet(Neg.getScalarValueSizeInBits(), Bits);
+    if (SDValue Inner =
+            TLI.SimplifyMultipleUseDemandedBits(Neg, DemandedBits, DAG)) {
+      Neg = Inner;
+      MaskLoBits = Bits;
     }
   }
@@ -7302,15 +7318,15 @@
     return false;
   SDValue NegOp1 = Neg.getOperand(1);

-  // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
-  // Pos'.  The truncation is redundant for the purpose of the equality.
-  if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
-    if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
-      KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0));
-      if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
-          ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
-           MaskLoBits))
-        Pos = Pos.getOperand(0);
+  // On the RHS of [A], if Pos is the result of operation on Pos' that won't
+  // affect Mask's demanded bits, just replace Pos with Pos'. These operations
+  // are redundant for the purpose of the equality.
+  if (MaskLoBits) {
+    APInt DemandedBits =
+        APInt::getLowBitsSet(Pos.getScalarValueSizeInBits(), MaskLoBits);
+    if (SDValue Inner =
+            TLI.SimplifyMultipleUseDemandedBits(Pos, DemandedBits, DAG)) {
+      Pos = Inner;
     }
   }
@@ -14988,7 +15004,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
   // FMA nodes have flags that propagate to the created nodes.
   SelectionDAG::FlagInserter FlagsInserter(DAG, N);

-  bool UnsafeFPMath =
+  bool CanReassociate =
       Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();

   // Constant fold FMA.
@@ -15012,7 +15028,8 @@
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
     return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);

-  if (UnsafeFPMath) {
+  // FIXME: use fast math flags instead of Options.UnsafeFPMath
+  if (Options.UnsafeFPMath) {
     if (N0CFP && N0CFP->isZero())
       return N2;
     if (N1CFP && N1CFP->isZero())
@@ -15029,7 +15046,7 @@
       !DAG.isConstantFPBuildVectorOrConstantFP(N1))
     return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);

-  if (UnsafeFPMath) {
+  if (CanReassociate) {
     // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
     if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
         DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
@@ -15070,7 +15087,7 @@
     }
   }

-  if (UnsafeFPMath) {
+  if (CanReassociate) {
     // (fma x, c, x) -> (fmul x, (c+1))
     if (N1CFP && N0 == N2) {
       return DAG.getNode(
@@ -19697,8 +19714,11 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
   // extract.
   SDValue Op0 = Vec.getOperand(0);
   SDValue Op1 = Vec.getOperand(1);
+  APInt SplatVal;
   if (isAnyConstantBuildVector(Op0, true) ||
-      isAnyConstantBuildVector(Op1, true)) {
+      ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
+      isAnyConstantBuildVector(Op1, true) ||
+      ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
     // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
     // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
     SDLoc DL(ExtElt);
@@ -19775,6 +19795,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }

+  if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
+    return BO;
+
   if (VecVT.isScalableVector())
     return SDValue();
@@ -19820,9 +19843,6 @@
     }
   }

-  if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
-    return BO;
-
   // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
   // We only perform this optimization before the op legalization phase because
   // we may introduce new vector instructions which are not backed by TD
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 441437351852..195c0e6a836f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2529,8 +2529,7 @@ bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
 /// DemandedElts.  We use this predicate to simplify operations downstream.
 bool SelectionDAG::MaskedVectorIsZero(SDValue V, const APInt &DemandedElts,
                                       unsigned Depth /* = 0 */) const {
-  APInt Mask = APInt::getAllOnes(V.getScalarValueSizeInBits());
-  return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero);
+  return computeKnownBits(V, DemandedElts, Depth).isZero();
 }

 /// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask'.
@@ -9089,6 +9088,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
     }
     break;
   }
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    assert(VTList.NumVTs == 2 && Ops.size() == 2 && "Invalid mul lo/hi op!");
+    assert(VTList.VTs[0].isInteger() && VTList.VTs[0] == VTList.VTs[1] &&
+           VTList.VTs[0] == Ops[0].getValueType() &&
+           VTList.VTs[0] == Ops[1].getValueType() &&
+           "Binary operator types must match!");
+    break;
+  }
   case ISD::STRICT_FP_EXTEND:
     assert(VTList.NumVTs == 2 && Ops.size() == 2 &&
            "Invalid STRICT_FP_EXTEND!");
@@ -11682,6 +11690,35 @@ bool BuildVectorSDNode::isConstant() const {
   return true;
 }

+Optional<std::pair<APInt, APInt>>
+BuildVectorSDNode::isConstantSequence() const {
+  unsigned NumOps = getNumOperands();
+  if (NumOps < 2)
+    return None;
+
+  if (!isa<ConstantSDNode>(getOperand(0)) ||
+      !isa<ConstantSDNode>(getOperand(1)))
+    return None;
+
+  unsigned EltSize = getValueType(0).getScalarSizeInBits();
+  APInt Start = getConstantOperandAPInt(0).trunc(EltSize);
+  APInt Stride = getConstantOperandAPInt(1).trunc(EltSize) - Start;
+
+  if (Stride.isZero())
+    return None;
+
+  for (unsigned i = 2; i < NumOps; ++i) {
+    if (!isa<ConstantSDNode>(getOperand(i)))
+      return None;
+
+    APInt Val = getConstantOperandAPInt(i).trunc(EltSize);
+    if (Val != (Start + (Stride * i)))
+      return None;
+  }
+
+  return std::make_pair(Start, Stride);
+}
+
 bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
   // Find the first non-undef value in the shuffle mask.
   unsigned i, e;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 4a3ab00614b3..d1915fd4e7ae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -198,7 +198,7 @@ public:
     SDAGSwitchLowering(SelectionDAGBuilder *sdb, FunctionLoweringInfo &funcinfo)
         : SwitchCG::SwitchLowering(funcinfo), SDB(sdb) {}

-    virtual void addSuccessorWithProb(
+    void addSuccessorWithProb(
         MachineBasicBlock *Src, MachineBasicBlock *Dst,
         BranchProbability Prob = BranchProbability::getUnknown()) override {
       SDB->addSuccessorWithProb(Src, Dst, Prob);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cd4f0ae42bcd..6205e74837c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -654,6 +654,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
 SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
     SelectionDAG &DAG, unsigned Depth) const {
+  EVT VT = Op.getValueType();
+
+  // Pretend we don't know anything about scalable vectors for now.
+  // TODO: We can probably do more work on simplifying the operations for
+  // scalable vectors, but for now we just bail out.
+  if (VT.isScalableVector())
+    return SDValue();
+
   // Limit search depth.
   if (Depth >= SelectionDAG::MaxRecursionDepth)
     return SDValue();
@@ -664,7 +672,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(

   // Not demanding any bits/elts from Op.
   if (DemandedBits == 0 || DemandedElts == 0)
-    return DAG.getUNDEF(Op.getValueType());
+    return DAG.getUNDEF(VT);

   bool IsLE = DAG.getDataLayout().isLittleEndian();
   unsigned NumElts = DemandedElts.getBitWidth();
@@ -894,6 +902,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     SDValue Op, const APInt &DemandedBits, SelectionDAG &DAG,
     unsigned Depth) const {
   EVT VT = Op.getValueType();
+
+  // Pretend we don't know anything about scalable vectors for now.
+  // TODO: We can probably do more work on simplifying the operations for
+  // scalable vectors, but for now we just bail out.
+  if (VT.isScalableVector())
+    return SDValue();
+
   APInt DemandedElts = VT.isVector()
                            ? APInt::getAllOnes(VT.getVectorNumElements())
                            : APInt(1, 1);
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 62b7f629f403..3e14edb5f730 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -2343,7 +2343,7 @@ void DWARFLinker::addObjectFile(DWARFFile &File) {
     updateAccelKind(*ObjectContexts.back().File.Dwarf);
 }

-bool DWARFLinker::link() {
+Error DWARFLinker::link() {
   assert(Options.NoOutput || TheDwarfEmitter);

   // A unique ID that identifies each compile unit.
@@ -2410,6 +2410,55 @@ bool DWARFLinker::link() {
     if (!OptContext.File.Dwarf)
       continue;

+    // Check whether type units are presented.
+    if (!OptContext.File.Dwarf->types_section_units().empty()) {
+      reportWarning("type units are not currently supported: file will "
+                    "be skipped",
+                    OptContext.File);
+      OptContext.Skip = true;
+      continue;
+    }
+
+    // Check for unsupported sections. Following sections can be referenced
+    // from .debug_info section. Current DWARFLinker implementation does not
+    // support or update references to these tables. Thus we report warning
+    // and skip corresponding object file.
+    if (!OptContext.File.Dwarf->getDWARFObj()
+             .getRnglistsSection()
+             .Data.empty()) {
+      reportWarning("'.debug_rnglists' is not currently supported: file "
+                    "will be skipped",
+                    OptContext.File);
+      OptContext.Skip = true;
+      continue;
+    }
+
+    if (!OptContext.File.Dwarf->getDWARFObj()
+             .getLoclistsSection()
+             .Data.empty()) {
+      reportWarning("'.debug_loclists' is not currently supported: file "
+                    "will be skipped",
+                    OptContext.File);
+      OptContext.Skip = true;
+      continue;
+    }
+
+    if (!OptContext.File.Dwarf->getDWARFObj().getMacroSection().Data.empty()) {
+      reportWarning("'.debug_macro' is not currently supported: file "
+                    "will be skipped",
+                    OptContext.File);
+      OptContext.Skip = true;
+      continue;
+    }
+
+    if (OptContext.File.Dwarf->getDWARFObj().getMacinfoSection().size() > 1) {
+      reportWarning("'.debug_macinfo' is not currently supported: file "
+                    "will be skipped",
+                    OptContext.File);
+      OptContext.Skip = true;
+      continue;
+    }
+
     // In a first phase, just read in the debug info and load all clang modules.
     OptContext.CompileUnits.reserve(
         OptContext.File.Dwarf->getNumCompileUnits());
@@ -2660,7 +2709,7 @@ bool DWARFLinker::link() {
                 "---------------\n\n";
   }

-  return true;
+  return Error::success();
 }

 bool DWARFLinker::verify(const DWARFFile &File) {
diff --git a/llvm/lib/DWP/DWP.cpp b/llvm/lib/DWP/DWP.cpp
index 44e39c019e0c..346f4dfd290d 100644
--- a/llvm/lib/DWP/DWP.cpp
+++ b/llvm/lib/DWP/DWP.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Object/Decompressor.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <limits>

 using namespace llvm;
 using namespace llvm::object;
@@ -654,6 +655,12 @@ Error write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
                                   IndexVersion)];
       C.Offset = InfoSectionOffset;
       C.Length = Header.Length + 4;
+
+      if (std::numeric_limits<uint32_t>::max() - InfoSectionOffset <
+          C.Length)
+        return make_error<DWPError>(
+            "debug information section offset is greater than 4GB");
+
       UnitOffset += C.Length;

       if (Header.Version < 5 || Header.UnitType == dwarf::DW_UT_split_compile) {
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
index dc07eaeaf615..3a6162db75c4 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
@@ -18,13 +18,19 @@ static const char *CommonSectionName = "__common";
 namespace llvm {
 namespace jitlink {

+static Triple createTripleWithCOFFFormat(Triple T) {
+  T.setObjectFormat(Triple::COFF);
+  return T;
+}
+
 COFFLinkGraphBuilder::COFFLinkGraphBuilder(
     const object::COFFObjectFile &Obj, Triple TT,
     LinkGraph::GetEdgeKindNameFunction GetEdgeKindName)
     : Obj(Obj),
-      G(std::make_unique<LinkGraph>(
-          Obj.getFileName().str(), Triple(std::move(TT)), getPointerSize(Obj),
-          getEndianness(Obj), std::move(GetEdgeKindName))) {
+      G(std::make_unique<LinkGraph>(Obj.getFileName().str(),
+                                    createTripleWithCOFFFormat(TT),
+                                    getPointerSize(Obj), getEndianness(Obj),
+                                    std::move(GetEdgeKindName))) {
   LLVM_DEBUG({
     dbgs() << "Created COFFLinkGraphBuilder for \"" << Obj.getFileName()
            << "\"\n";
@@ -128,16 +134,6 @@ Error COFFLinkGraphBuilder::graphifySections() {
     if (Expected<StringRef> SecNameOrErr = Obj.getSectionName(*Sec))
       SectionName = *SecNameOrErr;

-    bool IsDiscardable =
-        (*Sec)->Characteristics &
-        (COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_LNK_INFO);
-    if (IsDiscardable) {
-      LLVM_DEBUG(dbgs() << "    " << SecIndex << ": \"" << SectionName
-                        << "\" is discardable: "
-                           "No graph section will be created.\n");
-      continue;
-    }
-
     // FIXME: Skip debug info sections

     LLVM_DEBUG({
       dbgs() << "    "
             << "Creating section for \"" << SectionName << "\"\n";
     });

+    // FIXME: Revisit crash when dropping IMAGE_SCN_MEM_DISCARDABLE sections
+
     // Get the section's memory protection flags.
     MemProt Prot = MemProt::None;
     if ((*Sec)->Characteristics & COFF::IMAGE_SCN_MEM_EXECUTE)
@@ -190,6 +188,7 @@ Error COFFLinkGraphBuilder::graphifySymbols() {
   LLVM_DEBUG(dbgs() << "  Creating graph symbols...\n");

   SymbolSets.resize(Obj.getNumberOfSections() + 1);
+  PendingComdatExports.resize(Obj.getNumberOfSections() + 1);
   GraphSymbols.resize(Obj.getNumberOfSymbols());

   for (COFFSymbolIndex SymIndex = 0;
@@ -232,18 +231,16 @@ Error COFFLinkGraphBuilder::graphifySymbols() {
                << getCOFFSectionName(SectionIndex, Sec, *Sym)
                << " (index: " << SectionIndex << ") \n";
       });
-      GSym =
-          &G->addExternalSymbol(SymbolName, Sym->getValue(), Linkage::Strong);
+      if (!ExternalSymbols.count(SymbolName))
+        ExternalSymbols[SymbolName] =
+            &G->addExternalSymbol(SymbolName, Sym->getValue(), Linkage::Strong);
+      GSym = ExternalSymbols[SymbolName];
     } else if (Sym->isWeakExternal()) {
-      COFFSymbolIndex TagIndex =
-          Sym->getAux<object::coff_aux_weak_external>()->TagIndex;
-      assert(Sym->getAux<object::coff_aux_weak_external>()->Characteristics !=
-                 COFF::IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY &&
-             "IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY is not supported.");
-      assert(Sym->getAux<object::coff_aux_weak_external>()->Characteristics !=
-                 COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY &&
-             "IMAGE_WEAK_EXTERN_SEARCH_LIBRARY is not supported.");
-      WeakAliasRequests.push_back({SymIndex, TagIndex, SymbolName});
+      auto *WeakExternal = Sym->getAux<object::coff_aux_weak_external>();
+      COFFSymbolIndex TagIndex = WeakExternal->TagIndex;
+      uint32_t Characteristics = WeakExternal->Characteristics;
+      WeakExternalRequests.push_back(
+          {SymIndex, TagIndex, Characteristics, SymbolName});
     } else {
       Expected<jitlink::Symbol *> NewGSym =
           createDefinedSymbol(SymIndex, SymbolName, *Sym, Sec);
@@ -279,35 +276,41 @@ Error COFFLinkGraphBuilder::graphifySymbols() {

 Error COFFLinkGraphBuilder::flushWeakAliasRequests() {
   // Export the weak external symbols and alias it
-  for (auto &WeakAlias : WeakAliasRequests) {
-    if (auto *Target = getGraphSymbol(WeakAlias.Target)) {
+  for (auto &WeakExternal : WeakExternalRequests) {
+    if (auto *Target = getGraphSymbol(WeakExternal.Target)) {
       Expected<object::COFFSymbolRef> AliasSymbol =
-          Obj.getSymbol(WeakAlias.Alias);
+          Obj.getSymbol(WeakExternal.Alias);
       if (!AliasSymbol)
         return AliasSymbol.takeError();

+      // FIXME: IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY and
+      // IMAGE_WEAK_EXTERN_SEARCH_LIBRARY are handled in the same way.
+      Scope S =
+          WeakExternal.Characteristics == COFF::IMAGE_WEAK_EXTERN_SEARCH_ALIAS
+              ? Scope::Default
+              : Scope::Local;
+
       // FIXME: Support this when there's a way to handle this.
       if (!Target->isDefined())
         return make_error<JITLinkError>("Weak external symbol with external "
                                         "symbol as alternative not supported.");

       jitlink::Symbol *NewSymbol = &G->addDefinedSymbol(
-          Target->getBlock(), Target->getOffset(), WeakAlias.SymbolName,
-          Target->getSize(), Linkage::Weak, Scope::Default,
-          Target->isCallable(), false);
-      setGraphSymbol(AliasSymbol->getSectionNumber(), WeakAlias.Alias,
+          Target->getBlock(), Target->getOffset(), WeakExternal.SymbolName,
+          Target->getSize(), Linkage::Weak, S, Target->isCallable(), false);
+      setGraphSymbol(AliasSymbol->getSectionNumber(), WeakExternal.Alias,
                      *NewSymbol);
       LLVM_DEBUG({
-        dbgs() << "    " << WeakAlias.Alias
+        dbgs() << "    " << WeakExternal.Alias
                << ": Creating weak external symbol for COFF symbol \""
-               << WeakAlias.SymbolName << "\" in section "
+               << WeakExternal.SymbolName << "\" in section "
                << AliasSymbol->getSectionNumber() << "\n";
         dbgs() << "      " << *NewSymbol << "\n";
       });
     } else
       return make_error<JITLinkError>("Weak symbol alias requested but actual "
                                       "symbol not found for symbol " +
-                                      formatv("{0:d}", WeakAlias.Alias));
+                                      formatv("{0:d}", WeakExternal.Alias));
   }
   return Error::success();
 }
@@ -324,6 +327,8 @@ Error COFFLinkGraphBuilder::calculateImplicitSizeOfSymbols() {
        SecIndex <= static_cast<COFFSectionIndex>(Obj.getNumberOfSections());
        SecIndex++) {
     auto &SymbolSet = SymbolSets[SecIndex];
+    if (SymbolSet.empty())
+      continue;
     jitlink::Block *B = getGraphBlock(SecIndex);
     orc::ExecutorAddrDiff LastOffset = B->getSize();
     orc::ExecutorAddrDiff LastDifferentOffset = B->getSize();
@@ -394,25 +399,35 @@ Expected<Symbol *> COFFLinkGraphBuilder::createDefinedSymbol(
                                     formatv("{0:d}", SymIndex));

   Block *B = getGraphBlock(Symbol.getSectionNumber());
+  if (!B) {
+    LLVM_DEBUG({
+      dbgs() << "    " << SymIndex
+             << ": Skipping graph symbol since section was not created for "
+                "COFF symbol \""
+             << SymbolName << "\" in section " << Symbol.getSectionNumber()
+             << "\n";
+    });
+    return nullptr;
+  }
+
   if (Symbol.isExternal()) {
     // This is not a comdat sequence, export the symbol as it is
-    if (!isComdatSection(Section))
+    if (!isComdatSection(Section)) {
+
       return &G->addDefinedSymbol(
           *B, Symbol.getValue(), SymbolName, 0, Linkage::Strong, Scope::Default,
           Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false);
-    else {
-      if (!PendingComdatExport)
+    } else {
+      if (!PendingComdatExports[Symbol.getSectionNumber()])
         return make_error<JITLinkError>("No pending COMDAT export for symbol " +
                                         formatv("{0:d}", SymIndex));
-      if (PendingComdatExport->SectionIndex != Symbol.getSectionNumber())
-        return make_error<JITLinkError>(
-            "COMDAT export section number mismatch for symbol " +
-            formatv("{0:d}", SymIndex));
+
       return exportCOMDATSymbol(SymIndex, SymbolName, Symbol);
     }
   }

-  if (Symbol.getStorageClass() == COFF::IMAGE_SYM_CLASS_STATIC) {
+  if (Symbol.getStorageClass() == COFF::IMAGE_SYM_CLASS_STATIC ||
+      Symbol.getStorageClass() == COFF::IMAGE_SYM_CLASS_LABEL) {
     const object::coff_aux_section_definition *Definition =
         Symbol.getSectionDefinition();
     if (!Definition || !isComdatSection(Section)) {
@@ -422,12 +437,14 @@ Expected<Symbol *> COFFLinkGraphBuilder::createDefinedSymbol(
           Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false);
     }
     if (Definition->Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
-      // FIXME: don't dead strip this when parent section is alive
-      return &G->addDefinedSymbol(
+      auto Target = Definition->getNumber(Symbol.isBigObj());
+      auto GSym = &G->addDefinedSymbol(
           *B, Symbol.getValue(), SymbolName, 0, Linkage::Strong, Scope::Local,
           Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false);
+      getGraphBlock(Target)->addEdge(Edge::KeepAlive, 0, *GSym, 0);
+      return GSym;
     }
-    if (PendingComdatExport)
+    if (PendingComdatExports[Symbol.getSectionNumber()])
       return make_error<JITLinkError>(
           "COMDAT export request already exists before symbol " +
           formatv("{0:d}", SymIndex));
@@ -474,10 +491,16 @@ Expected<Symbol *> COFFLinkGraphBuilder::createCOMDATExportRequest(
     break;
   }
   case COFF::IMAGE_COMDAT_SELECT_LARGEST: {
-    // FIXME: Support IMAGE_COMDAT_SELECT_LARGEST when LinkGraph is able to
-    // handle this.
-    return make_error<JITLinkError>(
-        "IMAGE_COMDAT_SELECT_LARGEST is not supported.");
+    // FIXME: Support IMAGE_COMDAT_SELECT_LARGEST properly when LinkGraph is
+    // able to handle this.
+    LLVM_DEBUG({
+      dbgs() << "    " << SymIndex
+             << ": Partially supported IMAGE_COMDAT_SELECT_LARGEST was used"
+                " in section "
+             << Symbol.getSectionNumber() << "\n";
+    });
+    L = Linkage::Weak;
+    break;
   }
   case COFF::IMAGE_COMDAT_SELECT_NEWEST: {
     // Even link.exe doesn't support this selection properly.
@@ -489,7 +512,7 @@ Expected<Symbol *> COFFLinkGraphBuilder::createCOMDATExportRequest(
           formatv("{0:d}", Definition->Selection));
   }
   }
-  PendingComdatExport = {SymIndex, Symbol.getSectionNumber(), L};
+  PendingComdatExports[Symbol.getSectionNumber()] = {SymIndex, L};
   return &G->addAnonymousSymbol(*B, Symbol.getValue(), Definition->Length,
                                 false, false);
 }
@@ -499,6 +522,7 @@ Expected<Symbol *>
 COFFLinkGraphBuilder::exportCOMDATSymbol(COFFSymbolIndex SymIndex,
                                          StringRef SymbolName,
                                          object::COFFSymbolRef Symbol) {
+  auto &PendingComdatExport = PendingComdatExports[Symbol.getSectionNumber()];
   COFFSymbolIndex TargetIndex = PendingComdatExport->SymbolIndex;
   Linkage L = PendingComdatExport->Linkage;
   jitlink::Symbol *Target = getGraphSymbol(TargetIndex);
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h
index 4dc1b14dc4a2..f925f6d7aeef 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h
@@ -111,19 +111,19 @@ private:
   // COMDAT sequence.
   struct ComdatExportRequest {
     COFFSymbolIndex SymbolIndex;
-    COFFSectionIndex SectionIndex;
     jitlink::Linkage Linkage;
   };
-  Optional<ComdatExportRequest> PendingComdatExport;
+  std::vector<Optional<ComdatExportRequest>> PendingComdatExports;

   // This represents a pending request to create a weak external symbol with a
   // name.
-  struct WeakAliasRequest {
+  struct WeakExternalRequest {
     COFFSymbolIndex Alias;
     COFFSymbolIndex Target;
+    uint32_t Characteristics;
     StringRef SymbolName;
   };
-  std::vector<WeakAliasRequest> WeakAliasRequests;
+  std::vector<WeakExternalRequest> WeakExternalRequests;

   // Per COFF section jitlink symbol set sorted by offset.
   // Used for calculating implicit size of defined symbols.
@@ -162,6 +162,8 @@ private:
   Section *CommonSection = nullptr;
   std::vector<Block *> GraphBlocks;
   std::vector<Symbol *> GraphSymbols;
+
+  DenseMap<StringRef, Symbol *> ExternalSymbols;
 };

 template <typename RelocHandlerFunction>
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
index 3d36ad1ed767..e2040dc95acc 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp
@@ -12,8 +12,8 @@

 #include "llvm/ExecutionEngine/JITLink/COFF_x86_64.h"
 #include "COFFLinkGraphBuilder.h"
-#include "EHFrameSupportImpl.h"
 #include "JITLinkGeneric.h"
+#include "SEHFrameSupport.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/ExecutionEngine/JITLink/x86_64.h"
 #include "llvm/Object/COFF.h"
@@ -26,6 +26,11 @@ using namespace llvm::jitlink;

 namespace {

+enum EdgeKind_coff_x86_64 : Edge::Kind {
+  PCRel32 = x86_64::FirstPlatformRelocation,
+  Pointer32NB,
+};
+
 class COFFJITLinker_x86_64 : public JITLinker<COFFJITLinker_x86_64> {
   friend class JITLinker<COFFJITLinker_x86_64>;

@@ -43,27 +48,7 @@ private:

 class COFFLinkGraphBuilder_x86_64 : public COFFLinkGraphBuilder {
 private:
-  uint64_t ImageBase = 0;
-  enum COFFX86RelocationKind {
-    COFFAddr32NB,
-    COFFRel32,
-  };
-
-  static Expected<COFFX86RelocationKind>
-  getRelocationKind(const uint32_t Type) {
-    switch (Type) {
-    case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_ADDR32NB:
-      return COFFAddr32NB;
-    case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_REL32:
-      return COFFRel32;
-    }
-
-    return make_error<JITLinkError>("Unsupported x86_64 relocation:" +
-                                    formatv("{0:d}", Type));
-  }
-
   Error addRelocations() override {
-
     LLVM_DEBUG(dbgs() << "Processing relocations:\n");

     for (const auto &RelSect : sections())
@@ -74,21 +59,9 @@ private:
     return Error::success();
   }

-  uint64_t getImageBase() {
-    if (!ImageBase) {
-      ImageBase = std::numeric_limits<uint64_t>::max();
-      for (const auto &Block : getGraph().blocks()) {
-        if (Block->getAddress().getValue())
-          ImageBase = std::min(ImageBase, Block->getAddress().getValue());
-      }
-    }
-    return ImageBase;
-  }
-
   Error addSingleRelocation(const object::RelocationRef &Rel,
                             const object::SectionRef &FixupSect,
                             Block &BlockToFix) {
-    const object::coff_relocation *COFFRel = getObject().getCOFFRelocation(Rel);
     auto SymbolIt = Rel.getSymbol();
     if (SymbolIt == getObject().symbol_end()) {
@@ -110,62 +83,122 @@ private:
                       SymIndex, FixupSect.getIndex()),
           inconvertibleErrorCode());

-    Expected<COFFX86RelocationKind> RelocKind =
-        getRelocationKind(Rel.getType());
-    if (!RelocKind)
-      return RelocKind.takeError();
-
     int64_t Addend = 0;
     orc::ExecutorAddr FixupAddress =
         orc::ExecutorAddr(FixupSect.getAddress()) + Rel.getOffset();
     Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();

     Edge::Kind Kind = Edge::Invalid;
+    const char *FixupPtr = BlockToFix.getContent().data() + Offset;

-    switch (*RelocKind) {
-    case COFFAddr32NB: {
-      Kind = x86_64::Pointer32;
-      Offset -= getImageBase();
+    switch (Rel.getType()) {
+    case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_ADDR32NB: {
+      Kind = EdgeKind_coff_x86_64::Pointer32NB;
+      Addend = *reinterpret_cast<const support::little32_t *>(FixupPtr);
       break;
     }
-    case COFFRel32: {
-      Kind = x86_64::BranchPCRel32;
+    case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_REL32: {
+      Kind = EdgeKind_coff_x86_64::PCRel32;
+      Addend = *reinterpret_cast<const support::little32_t *>(FixupPtr);
       break;
     }
+    case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_REL32_1: {
+      Kind = EdgeKind_coff_x86_64::PCRel32;
+      Addend =
*reinterpret_cast<const support::little32_t *>(FixupPtr); + Addend -= 1; + break; + } + default: { + return make_error<JITLinkError>("Unsupported x86_64 relocation:" + + formatv("{0:d}", Rel.getType())); + } }; Edge GE(Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), BlockToFix, GE, x86_64::getEdgeKindName(Kind)); + printEdge(dbgs(), BlockToFix, GE, getCOFFX86RelocationKindName(Kind)); dbgs() << "\n"; }); BlockToFix.addEdge(std::move(GE)); + return Error::success(); } - /// Return the string name of the given COFF x86_64 edge kind. - const char *getCOFFX86RelocationKindName(COFFX86RelocationKind R) { - switch (R) { - case COFFAddr32NB: - return "COFFAddr32NB"; - case COFFRel32: - return "COFFRel32"; +public: + COFFLinkGraphBuilder_x86_64(const object::COFFObjectFile &Obj, const Triple T) + : COFFLinkGraphBuilder(Obj, std::move(T), getCOFFX86RelocationKindName) {} +}; + +class COFFLinkGraphLowering_x86_64 { +public: + // Lowers COFF x86_64 specific edges to generic x86_64 edges. + Error lowerCOFFRelocationEdges(LinkGraph &G, JITLinkContext &Ctx) { + for (auto *B : G.blocks()) { + for (auto &E : B->edges()) { + switch (E.getKind()) { + case EdgeKind_coff_x86_64::Pointer32NB: { + auto ImageBase = getImageBaseAddress(G, Ctx); + if (!ImageBase) + return ImageBase.takeError(); + E.setAddend(E.getAddend() - *ImageBase); + E.setKind(x86_64::Pointer32); + break; + } + case EdgeKind_coff_x86_64::PCRel32: { + E.setKind(x86_64::PCRel32); + break; + } + default: + break; + } + } } + return Error::success(); } -public: - COFFLinkGraphBuilder_x86_64(const object::COFFObjectFile &Obj, const Triple T) - : COFFLinkGraphBuilder(Obj, std::move(T), x86_64::getEdgeKindName) {} +private: + static StringRef getImageBaseSymbolName() { return "__ImageBase"; } + Expected<JITTargetAddress> getImageBaseAddress(LinkGraph &G, + JITLinkContext &Ctx) { + if (this->ImageBase) + return this->ImageBase; + for (auto *S : G.defined_symbols()) + if (S->getName() == getImageBaseSymbolName()) { + this->ImageBase = S->getAddress().getValue(); + return this->ImageBase; + } + + JITLinkContext::LookupMap Symbols; + Symbols[getImageBaseSymbolName()] = SymbolLookupFlags::RequiredSymbol; + JITTargetAddress ImageBase; + Error Err = Error::success(); + Ctx.lookup(Symbols, + createLookupContinuation([&](Expected<AsyncLookupResult> LR) { + ErrorAsOutParameter EAO(&Err); + if (!LR) { + Err = LR.takeError(); + return; + } + auto &ImageBaseSymbol = LR->begin()->second; + ImageBase = ImageBaseSymbol.getAddress(); + })); + if (Err) + return std::move(Err); + this->ImageBase = ImageBase; + return ImageBase; + } + JITTargetAddress ImageBase = 0; }; -Error buildTables_COFF_x86_64(LinkGraph &G) { - LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); +Error lowerEdges_COFF_x86_64(LinkGraph &G, JITLinkContext *Ctx) { + LLVM_DEBUG(dbgs() << "Lowering COFF x86_64 edges:\n"); + COFFLinkGraphLowering_x86_64 GraphLowering; + + if (auto Err = GraphLowering.lowerCOFFRelocationEdges(G, *Ctx)) + return Err; - x86_64::GOTTableManager GOT; - x86_64::PLTTableManager PLT(GOT); - visitExistingEdges(G, GOT, PLT); return Error::success(); } } // namespace @@ -173,6 +206,18 @@ Error buildTables_COFF_x86_64(LinkGraph &G) { namespace llvm { namespace jitlink { +/// Return the string name of the given COFF x86_64 edge kind. 
+const char *getCOFFX86RelocationKindName(Edge::Kind R) {
+  switch (R) {
+  case PCRel32:
+    return "PCRel32";
+  case Pointer32NB:
+    return "Pointer32NB";
+  default:
+    return x86_64::getEdgeKindName(R);
+  }
+}
+
 Expected<std::unique_ptr<LinkGraph>>
 createLinkGraphFromCOFFObject_x86_64(MemoryBufferRef ObjectBuffer) {
   LLVM_DEBUG({
@@ -194,16 +239,16 @@ void link_COFF_x86_64(std::unique_ptr<LinkGraph> G,
   const Triple &TT = G->getTargetTriple();
   if (Ctx->shouldAddDefaultTargetPasses(TT)) {
     // Add a mark-live pass.
-    if (auto MarkLive = Ctx->getMarkLivePass(TT))
+    if (auto MarkLive = Ctx->getMarkLivePass(TT)) {
       Config.PrePrunePasses.push_back(std::move(MarkLive));
-    else
+      Config.PrePrunePasses.push_back(SEHFrameKeepAlivePass(".pdata"));
+    } else
       Config.PrePrunePasses.push_back(markAllSymbolsLive);
 
-    // Add an in-place GOT/Stubs/TLSInfoEntry build pass.
-    Config.PostPrunePasses.push_back(buildTables_COFF_x86_64);
-
-    // Add GOT/Stubs optimizer pass.
-    Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses);
+    // Add COFF edge lowering passes.
+    JITLinkContext *CtxPtr = Ctx.get();
+    Config.PreFixupPasses.push_back(
+        [CtxPtr](LinkGraph &G) { return lowerEdges_COFF_x86_64(G, CtxPtr); });
   }
 
   if (auto Err = Ctx->modifyPassConfig(*G, Config))
diff --git a/llvm/lib/ExecutionEngine/JITLink/SEHFrameSupport.h b/llvm/lib/ExecutionEngine/JITLink/SEHFrameSupport.h
new file mode 100644
index 000000000000..f7689e4e4043
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/SEHFrameSupport.h
@@ -0,0 +1,61 @@
+//===------- SEHFrameSupport.h - JITLink seh-frame utils --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SEHFrame utils for JITLink.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_JITLINK_SEHFRAMESUPPORT_H
+#define LLVM_EXECUTIONENGINE_JITLINK_SEHFRAMESUPPORT_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace jitlink {
+/// This pass adds keep-alive edges from SEH frame sections
+/// to the parent function content blocks.
+class SEHFrameKeepAlivePass {
+public:
+  SEHFrameKeepAlivePass(StringRef SEHFrameSectionName)
+      : SEHFrameSectionName(SEHFrameSectionName) {}
+
+  Error operator()(LinkGraph &G) {
+    auto *S = G.findSectionByName(SEHFrameSectionName);
+    if (!S)
+      return Error::success();
+
+    // Simply consider every block pointed to by an SEH frame block as a
+    // parent. This adds some unnecessary keep-alive edges to unwind info
+    // (xdata) blocks, but those blocks are usually dead by default, so they
+    // do not affect the fate of the SEH frame block.
+    for (auto *B : S->blocks()) {
+      auto &DummySymbol = G.addAnonymousSymbol(*B, 0, 0, false, false);
+      DenseSet<Block *> Children;
+      for (auto &E : B->edges()) {
+        auto &Sym = E.getTarget();
+        if (!Sym.isDefined())
+          continue;
+        Children.insert(&Sym.getBlock());
+      }
+      for (auto *Child : Children)
+        Child->addEdge(Edge(Edge::KeepAlive, 0, DummySymbol, 0));
+    }
+    return Error::success();
+  }
+
+private:
+  StringRef SEHFrameSectionName;
+};
+
+} // end namespace jitlink
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_JITLINK_SEHFRAMESUPPORT_H
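The pass above exists because SEH unwind records have their reference direction backwards for dead-stripping purposes: .pdata blocks point at the functions they unwind, but nothing points at .pdata, so pruning would discard unwind info even for live functions. The KeepAlive edges added from each referenced function block back to an anonymous symbol in its .pdata block close that loop. A simplified model of why this suffices, assuming liveness is a plain reachability walk (invented names, not the JITLink API):

#include <set>
#include <vector>

struct Block {
  bool Live = false;
  std::vector<Block *> Edges; // ordinary and KeepAlive edges alike: both
                              // mean "if I survive, keep the target"
};

// Mark phase: anything a live block points at survives.
static void markLive(Block *B, std::set<Block *> &Seen) {
  if (!Seen.insert(B).second)
    return;
  B->Live = true;
  for (Block *T : B->Edges)
    markLive(T, Seen);
}

// Without the pass: Function has no edge to Pdata, so Pdata dies even when
// Function is live. With the pass: Function -> KeepAlive -> Pdata, so a live
// function drags its unwind record along.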
\ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index df9979b47e88..393250a5578b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -36,6 +36,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "NegDelta32"; case Delta64FromGOT: return "Delta64FromGOT"; + case PCRel32: + return "PCRel32"; case BranchPCRel32: return "BranchPCRel32"; case BranchPCRel32ToPtrJumpStub: diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp index 356b81b4f1c5..3de15db3f1c6 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp @@ -150,16 +150,39 @@ static Expected<MaterializationUnit::Interface> getCOFFObjectFileSymbolInfo(ExecutionSession &ES, const object::COFFObjectFile &Obj) { MaterializationUnit::Interface I; - + std::vector<Optional<object::coff_aux_section_definition>> ComdatDefs( + Obj.getNumberOfSections() + 1); for (auto &Sym : Obj.symbols()) { Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); if (!SymFlagsOrErr) // TODO: Test this error. return SymFlagsOrErr.takeError(); - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; + // Handle comdat symbols + auto COFFSym = Obj.getCOFFSymbol(Sym); + bool IsWeak = false; + if (auto *Def = COFFSym.getSectionDefinition()) { + auto Sec = Obj.getSection(COFFSym.getSectionNumber()); + if (!Sec) + return Sec.takeError(); + if (((*Sec)->Characteristics & COFF::IMAGE_SCN_LNK_COMDAT) && + Def->Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + ComdatDefs[COFFSym.getSectionNumber()] = *Def; + continue; + } + } + if (!COFF::isReservedSectionNumber(COFFSym.getSectionNumber()) && + ComdatDefs[COFFSym.getSectionNumber()]) { + auto Def = ComdatDefs[COFFSym.getSectionNumber()]; + if (Def->Selection != COFF::IMAGE_COMDAT_SELECT_NODUPLICATES) { + IsWeak = true; + } + ComdatDefs[COFFSym.getSectionNumber()] = None; + } else { + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + } // Skip symbols that are not global. if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) @@ -180,12 +203,13 @@ getCOFFObjectFileSymbolInfo(ExecutionSession &ES, if (!SymFlags) return SymFlags.takeError(); *SymFlags |= JITSymbolFlags::Exported; - auto COFFSym = Obj.getCOFFSymbol(Sym); // Weak external is always a function - if (COFFSym.isWeakExternal()) { + if (COFFSym.isWeakExternal()) *SymFlags |= JITSymbolFlags::Callable; - } + + if (IsWeak) + *SymFlags |= JITSymbolFlags::Weak; I.SymbolFlags[ES.intern(*Name)] = std::move(*SymFlags); } diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index bf13b6c325ec..5d4cfceefb3e 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1424,6 +1424,8 @@ void Pattern::printVariableDefs(const SourceMgr &SM, // Sort variable captures by the order in which they matched the input. // Ranges shouldn't be overlapping, so we can just compare the start. 
llvm::sort(VarCaptures, [](const VarCapture &A, const VarCapture &B) { + if (&A == &B) + return false; assert(A.Range.Start != B.Range.Start && "unexpected overlapping variable captures"); return A.Range.Start.getPointer() < B.Range.Start.getPointer(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 26171f537244..f5039eb5126c 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1627,6 +1627,10 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID) { + assert(Ordering != AtomicOrdering::NotAtomic && + "atomicrmw instructions can only be atomic."); + assert(Ordering != AtomicOrdering::Unordered && + "atomicrmw instructions cannot be unordered."); Op<0>() = Ptr; Op<1>() = Val; setOperation(Operation); diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index c50d6901c9da..8ca75f58e403 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -32,6 +32,39 @@ using namespace llvm; +bool IntrinsicInst::mayLowerToFunctionCall(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::objc_autorelease: + case Intrinsic::objc_autoreleasePoolPop: + case Intrinsic::objc_autoreleasePoolPush: + case Intrinsic::objc_autoreleaseReturnValue: + case Intrinsic::objc_copyWeak: + case Intrinsic::objc_destroyWeak: + case Intrinsic::objc_initWeak: + case Intrinsic::objc_loadWeak: + case Intrinsic::objc_loadWeakRetained: + case Intrinsic::objc_moveWeak: + case Intrinsic::objc_release: + case Intrinsic::objc_retain: + case Intrinsic::objc_retainAutorelease: + case Intrinsic::objc_retainAutoreleaseReturnValue: + case Intrinsic::objc_retainAutoreleasedReturnValue: + case Intrinsic::objc_retainBlock: + case Intrinsic::objc_storeStrong: + case Intrinsic::objc_storeWeak: + case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue: + case Intrinsic::objc_retainedObject: + case Intrinsic::objc_unretainedObject: + case Intrinsic::objc_unretainedPointer: + case Intrinsic::objc_retain_autorelease: + case Intrinsic::objc_sync_enter: + case Intrinsic::objc_sync_exit: + return true; + default: + return false; + } +} + //===----------------------------------------------------------------------===// /// DbgVariableIntrinsic - This is the common base class for debug info /// intrinsics for variables. diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 0ca40a675fe4..3e82987801c7 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -105,11 +105,13 @@ uint64_t ModuleSummaryIndex::getFlags() const { Flags |= 0x20; if (withDSOLocalPropagation()) Flags |= 0x40; + if (withWholeProgramVisibility()) + Flags |= 0x80; return Flags; } void ModuleSummaryIndex::setFlags(uint64_t Flags) { - assert(Flags <= 0x7f && "Unexpected bits in flag"); + assert(Flags <= 0xff && "Unexpected bits in flag"); // 1 bit: WithGlobalValueDeadStripping flag. // Set on combined index only. if (Flags & 0x1) @@ -139,6 +141,10 @@ void ModuleSummaryIndex::setFlags(uint64_t Flags) { // Set on combined index only. if (Flags & 0x40) setWithDSOLocalPropagation(); + // 1 bit: WithWholeProgramVisibility flag. + // Set on combined index only. 
+  if (Flags & 0x80)
+    setWithWholeProgramVisibility();
 }
 
 // Collect for the given module the list of function it defines
diff --git a/llvm/lib/IR/PrintPasses.cpp b/llvm/lib/IR/PrintPasses.cpp
index 83b8c93e766f..fe2da5ca114f 100644
--- a/llvm/lib/IR/PrintPasses.cpp
+++ b/llvm/lib/IR/PrintPasses.cpp
@@ -29,6 +29,50 @@ static cl::opt<bool>
     PrintAfterAll("print-after-all",
                   llvm::cl::desc("Print IR after each pass"),
                   cl::init(false), cl::Hidden);
 
+// Print out the IR after passes, similar to -print-after-all except that it
+// only prints the IR after passes that change the IR. Those passes that do not
+// make changes to the IR are reported as not making any changes. In addition,
+// the initial IR is also reported. Other hidden options affect the output from
+// this option. -filter-passes will limit the output to the named passes that
+// actually change the IR and other passes are reported as filtered out. The
+// specified passes will either be reported as making no changes (with no IR
+// reported) or the changed IR will be reported. Also, the -filter-print-funcs
+// and -print-module-scope options will do similar filtering based on function
+// name, reporting changed IRs as functions (or modules if -print-module-scope
+// is specified) for a particular function or indicating that the IR has been
+// filtered out. The extra options can be combined, allowing only changed IRs
+// for certain passes on certain functions to be reported in different formats,
+// with the rest being reported as filtered out. The -print-before-changed
+// option will print the IR as it was before each pass that changed it. The
+// optional value of quiet will only report when the IR changes, suppressing all
+// other messages, including the initial IR. The values "diff" and "diff-quiet"
+// will present the changes in a form similar to a patch, in either verbose or
+// quiet mode, respectively. The lines that are removed and added are prefixed
+// with '-' and '+', respectively. The -filter-print-funcs and -filter-passes
+// can be used to filter the output. This reporter relies on the Linux diff
+// utility to do comparisons and insert the prefixes. For systems that do not
+// have the necessary facilities, the error message will be shown in place of
+// the expected output.
+cl::opt<ChangePrinter> llvm::PrintChanged(
+    "print-changed", cl::desc("Print changed IRs"), cl::Hidden,
+    cl::ValueOptional, cl::init(ChangePrinter::None),
+    cl::values(
+        clEnumValN(ChangePrinter::Quiet, "quiet", "Run in quiet mode"),
+        clEnumValN(ChangePrinter::DiffVerbose, "diff",
+                   "Display patch-like changes"),
+        clEnumValN(ChangePrinter::DiffQuiet, "diff-quiet",
+                   "Display patch-like changes in quiet mode"),
+        clEnumValN(ChangePrinter::ColourDiffVerbose, "cdiff",
+                   "Display patch-like changes with color"),
+        clEnumValN(ChangePrinter::ColourDiffQuiet, "cdiff-quiet",
+                   "Display patch-like changes in quiet mode with color"),
+        clEnumValN(ChangePrinter::DotCfgVerbose, "dot-cfg",
+                   "Create a website with graphical changes"),
+        clEnumValN(ChangePrinter::DotCfgQuiet, "dot-cfg-quiet",
+                   "Create a website with graphical changes in quiet mode"),
+        // Sentinel value for unspecified option.
+ clEnumValN(ChangePrinter::Verbose, "", ""))); + static cl::opt<bool> PrintModuleScope("print-module-scope", cl::desc("When printing IR for print-[before|after]{-all} " diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index a9e04ba760ca..cc7be24c1dbd 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1103,6 +1103,8 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { updateVCallVisibilityInModule(*RegularLTO.CombinedModule, Conf.HasWholeProgramVisibility, DynamicExportSymbols); + updatePublicTypeTestCalls(*RegularLTO.CombinedModule, + Conf.HasWholeProgramVisibility); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1482,6 +1484,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set<GlobalValue::GUID> ExportedGUIDs; + if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index e248e58e4e4e..2e32469b4926 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" @@ -560,6 +561,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // the module, if applicable. Mod.setPartialSampleProfileRatio(CombinedIndex); + updatePublicTypeTestCalls(Mod, CombinedIndex.withWholeProgramVisibility()); + if (Conf.CodeGenOnly) { codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 2abf249cbd62..2f7c485b9fc8 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -520,6 +520,8 @@ bool LTOCodeGenerator::optimize() { // linker option in the old LTO API, but this call allows it to be specified // via the internal option. Must be done before WPD invoked via the optimizer // pipeline run below. + updatePublicTypeTestCalls(*MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false); updateVCallVisibilityInModule(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false, // FIXME: This needs linker information via a @@ -539,6 +541,16 @@ bool LTOCodeGenerator::optimize() { // Add an appropriate DataLayout instance for this module... 
   MergedModule->setDataLayout(TargetMach->createDataLayout());
 
+  if (!SaveIRBeforeOptPath.empty()) {
+    std::error_code EC;
+    raw_fd_ostream OS(SaveIRBeforeOptPath, EC, sys::fs::OF_None);
+    if (EC)
+      report_fatal_error(Twine("Failed to open ") + SaveIRBeforeOptPath +
+                         " to save bitcode before optimization\n");
+    WriteBitcodeToFile(*MergedModule, OS,
+                       /* ShouldPreserveUseListOrder */ true);
+  }
+
   ModuleSummaryIndex CombinedIndex(false);
   TargetMach = createTargetMachine();
   if (!opt(Config, TargetMach.get(), 0, *MergedModule, /*IsThinLTO=*/false,
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index a1041b3c85f5..2c723bef7d12 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -452,6 +452,10 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
                      bool DisableCodeGen, StringRef SaveTempsDir,
                      bool Freestanding, unsigned OptLevel, unsigned count,
                      bool DebugPassManager) {
+  // See comment at call to updateVCallVisibilityInIndex() for why
+  // WholeProgramVisibilityEnabledInLTO is false.
+  updatePublicTypeTestCalls(TheModule,
+                            /* WholeProgramVisibilityEnabledInLTO */ false);
 
   // "Benchmark"-like optimization: single-source case
   bool SingleModule = (ModuleMap.size() == 1);
@@ -1047,6 +1051,8 @@ void ThinLTOCodeGenerator::run() {
   // Currently there is no support for enabling whole program visibility via a
   // linker option in the old LTO API, but this call allows it to be specified
   // via the internal option. Must be done before WPD below.
+  if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false))
+    Index->setWithWholeProgramVisibility();
   updateVCallVisibilityInIndex(*Index,
                                /* WholeProgramVisibilityEnabledInLTO */ false,
                                // FIXME: This needs linker information via a
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 0b4e9866d50a..f6360c4e2f21 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -293,9 +293,8 @@ public:
       : ELFObjectWriter(std::move(MOTW)), OS(OS), DwoOS(DwoOS),
         IsLittleEndian(IsLittleEndian) {}
 
-  virtual bool checkRelocation(MCContext &Ctx, SMLoc Loc,
-                               const MCSectionELF *From,
-                               const MCSectionELF *To) override {
+  bool checkRelocation(MCContext &Ctx, SMLoc Loc, const MCSectionELF *From,
+                       const MCSectionELF *To) override {
     if (isDwoSection(*From)) {
       Ctx.reportError(Loc, "A dwo section may not contain relocations");
       return false;
diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
index cf98cb8ff59f..3ee43398ff65 100644
--- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -20,6 +20,11 @@ MCDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
   return None;
 }
 
+uint64_t MCDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                            uint64_t Address) const {
+  return 1;
+}
+
 bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
                                               uint64_t Address, bool IsBranch,
                                               uint64_t Offset, uint64_t OpSize,
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index d46ae2247535..8a43a477c1c7 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -253,7 +253,7 @@ class XCOFFObjectWriter : public MCObjectWriter {
 
   CsectGroup &getCsectGroup(const MCSectionXCOFF *MCSec);
 
-  virtual void reset() override;
+  void reset() override;
 
   void executePostLayoutBinding(MCAssembler &, const MCAsmLayout &) override;
 
diff --git
a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp index 8b44c09023f1..b127e1b43b8e 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp @@ -434,41 +434,13 @@ Error SectionWriter::visit(const OwnedDataSection &Sec) { return Error::success(); } -static constexpr std::array<uint8_t, 4> ZlibGnuMagic = {{'Z', 'L', 'I', 'B'}}; - -static bool isDataGnuCompressed(ArrayRef<uint8_t> Data) { - return Data.size() > ZlibGnuMagic.size() && - std::equal(ZlibGnuMagic.begin(), ZlibGnuMagic.end(), Data.data()); -} - -template <class ELFT> -static std::tuple<uint64_t, uint64_t> -getDecompressedSizeAndAlignment(ArrayRef<uint8_t> Data) { - const bool IsGnuDebug = isDataGnuCompressed(Data); - const uint64_t DecompressedSize = - IsGnuDebug - ? support::endian::read64be(Data.data() + ZlibGnuMagic.size()) - : reinterpret_cast<const Elf_Chdr_Impl<ELFT> *>(Data.data())->ch_size; - const uint64_t DecompressedAlign = - IsGnuDebug ? 1 - : reinterpret_cast<const Elf_Chdr_Impl<ELFT> *>(Data.data()) - ->ch_addralign; - - return std::make_tuple(DecompressedSize, DecompressedAlign); -} - template <class ELFT> Error ELFSectionWriter<ELFT>::visit(const DecompressedSection &Sec) { - const size_t DataOffset = isDataGnuCompressed(Sec.OriginalData) - ? (ZlibGnuMagic.size() + sizeof(Sec.Size)) - : sizeof(Elf_Chdr_Impl<ELFT>); - - ArrayRef<uint8_t> CompressedContent(Sec.OriginalData.data() + DataOffset, - Sec.OriginalData.size() - DataOffset); + ArrayRef<uint8_t> Compressed = + Sec.OriginalData.slice(sizeof(Elf_Chdr_Impl<ELFT>)); SmallVector<uint8_t, 128> DecompressedContent; - if (Error Err = - compression::zlib::uncompress(CompressedContent, DecompressedContent, - static_cast<size_t>(Sec.Size))) + if (Error Err = compression::zlib::uncompress(Compressed, DecompressedContent, + static_cast<size_t>(Sec.Size))) return createStringError(errc::invalid_argument, "'" + Sec.Name + "': " + toString(std::move(Err))); @@ -518,7 +490,7 @@ Error BinarySectionWriter::visit(const CompressedSection &Sec) { template <class ELFT> Error ELFSectionWriter<ELFT>::visit(const CompressedSection &Sec) { uint8_t *Buf = reinterpret_cast<uint8_t *>(Out.getBufferStart()) + Sec.Offset; - Elf_Chdr_Impl<ELFT> Chdr; + Elf_Chdr_Impl<ELFT> Chdr = {}; switch (Sec.CompressionType) { case DebugCompressionType::None: std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), Buf); @@ -1731,15 +1703,11 @@ Expected<SectionBase &> ELFBuilder<ELFT>::makeSection(const Elf_Shdr &Shdr) { if (!Name) return Name.takeError(); - if (Name->startswith(".zdebug") || (Shdr.sh_flags & ELF::SHF_COMPRESSED)) { - uint64_t DecompressedSize, DecompressedAlign; - std::tie(DecompressedSize, DecompressedAlign) = - getDecompressedSizeAndAlignment<ELFT>(*Data); - return Obj.addSection<CompressedSection>( - CompressedSection(*Data, DecompressedSize, DecompressedAlign)); - } - - return Obj.addSection<Section>(*Data); + if (!(Shdr.sh_flags & ELF::SHF_COMPRESSED)) + return Obj.addSection<Section>(*Data); + auto *Chdr = reinterpret_cast<const Elf_Chdr_Impl<ELFT> *>(Data->data()); + return Obj.addSection<CompressedSection>( + CompressedSection(*Data, Chdr->ch_size, Chdr->ch_addralign)); } } } diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index 799db5034532..2c3ea3a5f6d6 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -115,13 +115,13 @@ public: Error visit(const OwnedDataSection &Sec) override; Error visit(const StringTableSection &Sec) override; Error visit(const 
DynamicRelocationSection &Sec) override; - virtual Error visit(const SymbolTableSection &Sec) override = 0; - virtual Error visit(const RelocationSection &Sec) override = 0; - virtual Error visit(const GnuDebugLinkSection &Sec) override = 0; - virtual Error visit(const GroupSection &Sec) override = 0; - virtual Error visit(const SectionIndexSection &Sec) override = 0; - virtual Error visit(const CompressedSection &Sec) override = 0; - virtual Error visit(const DecompressedSection &Sec) override = 0; + Error visit(const SymbolTableSection &Sec) override = 0; + Error visit(const RelocationSection &Sec) override = 0; + Error visit(const GnuDebugLinkSection &Sec) override = 0; + Error visit(const GroupSection &Sec) override = 0; + Error visit(const SectionIndexSection &Sec) override = 0; + Error visit(const CompressedSection &Sec) override = 0; + Error visit(const DecompressedSection &Sec) override = 0; explicit SectionWriter(WritableMemoryBuffer &Buf) : Out(Buf) {} }; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index b778006cf66e..9ad2c4135167 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -518,6 +518,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AVR_ARCH_XMEGA7, EF_AVR_ARCH_MASK); BCase(EF_AVR_LINKRELAX_PREPARED); break; + case ELF::EM_LOONGARCH: + BCaseMask(EF_LOONGARCH_BASE_ABI_ILP32S, EF_LOONGARCH_BASE_ABI_MASK); + BCaseMask(EF_LOONGARCH_BASE_ABI_ILP32F, EF_LOONGARCH_BASE_ABI_MASK); + BCaseMask(EF_LOONGARCH_BASE_ABI_ILP32D, EF_LOONGARCH_BASE_ABI_MASK); + BCaseMask(EF_LOONGARCH_BASE_ABI_LP64S, EF_LOONGARCH_BASE_ABI_MASK); + BCaseMask(EF_LOONGARCH_BASE_ABI_LP64F, EF_LOONGARCH_BASE_ABI_MASK); + BCaseMask(EF_LOONGARCH_BASE_ABI_LP64D, EF_LOONGARCH_BASE_ABI_MASK); + break; case ELF::EM_RISCV: BCase(EF_RISCV_RVC); BCaseMask(EF_RISCV_FLOAT_ABI_SOFT, EF_RISCV_FLOAT_ABI); diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 3b3eefcc29ca..945ef512391b 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1249,6 +1249,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // flattening of blocks. OptimizePM.addPass(DivRemPairsPass()); + // Try to annotate calls that were created during optimization. + OptimizePM.addPass(TailCallElimPass()); + // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. OptimizePM.addPass( diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index baea0eb53ef9..a0c63fb33369 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -53,64 +53,6 @@ cl::opt<bool> PreservedCFGCheckerInstrumentation::VerifyPreservedCFG( #endif ); -// An option that prints out the IR after passes, similar to -// -print-after-all except that it only prints the IR after passes that -// change the IR. Those passes that do not make changes to the IR are -// reported as not making any changes. In addition, the initial IR is -// also reported. Other hidden options affect the output from this -// option. -filter-passes will limit the output to the named passes -// that actually change the IR and other passes are reported as filtered out. -// The specified passes will either be reported as making no changes (with -// no IR reported) or the changed IR will be reported. 
Also, the -// -filter-print-funcs and -print-module-scope options will do similar -// filtering based on function name, reporting changed IRs as functions(or -// modules if -print-module-scope is specified) for a particular function -// or indicating that the IR has been filtered out. The extra options -// can be combined, allowing only changed IRs for certain passes on certain -// functions to be reported in different formats, with the rest being -// reported as filtered out. The -print-before-changed option will print -// the IR as it was before each pass that changed it. The optional -// value of quiet will only report when the IR changes, suppressing -// all other messages, including the initial IR. The values "diff" and -// "diff-quiet" will present the changes in a form similar to a patch, in -// either verbose or quiet mode, respectively. The lines that are removed -// and added are prefixed with '-' and '+', respectively. The -// -filter-print-funcs and -filter-passes can be used to filter the output. -// This reporter relies on the linux diff utility to do comparisons and -// insert the prefixes. For systems that do not have the necessary -// facilities, the error message will be shown in place of the expected output. -// -enum class ChangePrinter { - None, - Verbose, - Quiet, - DiffVerbose, - DiffQuiet, - ColourDiffVerbose, - ColourDiffQuiet, - DotCfgVerbose, - DotCfgQuiet, -}; -static cl::opt<ChangePrinter> PrintChanged( - "print-changed", cl::desc("Print changed IRs"), cl::Hidden, - cl::ValueOptional, cl::init(ChangePrinter::None), - cl::values( - clEnumValN(ChangePrinter::Quiet, "quiet", "Run in quiet mode"), - clEnumValN(ChangePrinter::DiffVerbose, "diff", - "Display patch-like changes"), - clEnumValN(ChangePrinter::DiffQuiet, "diff-quiet", - "Display patch-like changes in quiet mode"), - clEnumValN(ChangePrinter::ColourDiffVerbose, "cdiff", - "Display patch-like changes with color"), - clEnumValN(ChangePrinter::ColourDiffQuiet, "cdiff-quiet", - "Display patch-like changes in quiet mode with color"), - clEnumValN(ChangePrinter::DotCfgVerbose, "dot-cfg", - "Create a website with graphical changes"), - clEnumValN(ChangePrinter::DotCfgQuiet, "dot-cfg-quiet", - "Create a website with graphical changes in quiet mode"), - // Sentinel value for unspecified option. - clEnumValN(ChangePrinter::Verbose, "", ""))); - // An option that supports the -print-changed option. See // the description for -print-changed for an explanation of the use // of this option. Note that this option has no effect without -print-changed. 
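One detail worth calling out in the relocated -print-changed option: with cl::ValueOptional, the clEnumValN entry whose value string is empty is what a bare -print-changed (no =value) selects, which is why ChangePrinter::Verbose is registered as the sentinel. A reduced sketch of the same pattern, using a hypothetical option name:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

enum class Verbosity { None, Verbose, Quiet };

// -my-trace        -> Verbosity::Verbose (empty-string enumerator)
// -my-trace=quiet  -> Verbosity::Quiet
// flag absent      -> Verbosity::None (the cl::init default)
static cl::opt<Verbosity> MyTrace(
    "my-trace", cl::desc("Example of an optional-value enum option"),
    cl::Hidden, cl::ValueOptional, cl::init(Verbosity::None),
    cl::values(clEnumValN(Verbosity::Quiet, "quiet", "Run in quiet mode"),
               // Sentinel selected when the flag is given without a value.
               clEnumValN(Verbosity::Verbose, "", "")));

Without the empty-string sentinel, a plain -my-trace would be rejected instead of selecting the verbose mode.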
diff --git a/llvm/lib/Support/ARMAttributeParser.cpp b/llvm/lib/Support/ARMAttributeParser.cpp index adb5d3f0964d..03c0c7aac423 100644 --- a/llvm/lib/Support/ARMAttributeParser.cpp +++ b/llvm/lib/Support/ARMAttributeParser.cpp @@ -85,7 +85,7 @@ Error ARMAttributeParser::CPU_arch(AttrType tag) { static const char *strings[] = { "Pre-v4", "ARM v4", "ARM v4T", "ARM v5T", "ARM v5TE", "ARM v5TEJ", "ARM v6", "ARM v6KZ", "ARM v6T2", "ARM v6K", "ARM v7", "ARM v6-M", "ARM v6S-M", - "ARM v7E-M", "ARM v8", nullptr, + "ARM v7E-M", "ARM v8-A", "ARM v8-R", "ARM v8-M Baseline", "ARM v8-M Mainline", nullptr, nullptr, nullptr, "ARM v8.1-M Mainline", "ARM v9-A" }; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e070ce2efa6b..72f0fc94940c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -255,6 +255,12 @@ static bool isZeroingInactiveLanes(SDValue Op) { return false; case Intrinsic::aarch64_sve_ptrue: case Intrinsic::aarch64_sve_pnext: + case Intrinsic::aarch64_sve_cmpeq: + case Intrinsic::aarch64_sve_cmpne: + case Intrinsic::aarch64_sve_cmpge: + case Intrinsic::aarch64_sve_cmpgt: + case Intrinsic::aarch64_sve_cmphs: + case Intrinsic::aarch64_sve_cmphi: case Intrinsic::aarch64_sve_cmpeq_wide: case Intrinsic::aarch64_sve_cmpne_wide: case Intrinsic::aarch64_sve_cmpge_wide: @@ -265,6 +271,11 @@ static bool isZeroingInactiveLanes(SDValue Op) { case Intrinsic::aarch64_sve_cmphi_wide: case Intrinsic::aarch64_sve_cmplo_wide: case Intrinsic::aarch64_sve_cmpls_wide: + case Intrinsic::aarch64_sve_fcmpeq: + case Intrinsic::aarch64_sve_fcmpne: + case Intrinsic::aarch64_sve_fcmpge: + case Intrinsic::aarch64_sve_fcmpgt: + case Intrinsic::aarch64_sve_fcmpuo: return true; } } @@ -879,6 +890,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); @@ -974,6 +987,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, VT, Custom); if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom); @@ -1619,6 +1634,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::ANY_EXTEND, VT, Custom); setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::BSWAP, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); @@ -11126,6 +11142,20 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + if (useSVEForFixedLengthVectorVT(VT)) { + if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) { + SDLoc DL(Op); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); + SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); + SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); + return convertFromScalableVector(DAG, Op.getValueType(), Seq); + } + + // Revert to common legalisation 
for all other variants. + return SDValue(); + } + // Try to build a simple constant vector. Op = NormalizeBuildVector(Op, DAG); if (VT.isInteger()) { @@ -12772,6 +12802,12 @@ bool AArch64TargetLowering::shouldSinkOperands( if (isSplatShuffle(II->getOperand(1))) Ops.push_back(&II->getOperandUse(1)); return !Ops.empty(); + case Intrinsic::aarch64_sve_ptest_first: + case Intrinsic::aarch64_sve_ptest_last: + if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0))) + if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) + Ops.push_back(&II->getOperandUse(0)); + return !Ops.empty(); case Intrinsic::aarch64_sme_write_horiz: case Intrinsic::aarch64_sme_write_vert: case Intrinsic::aarch64_sme_writeq_horiz: @@ -17142,7 +17178,8 @@ static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { assert((N->getOpcode() == AArch64ISD::UUNPKHI || N->getOpcode() == AArch64ISD::UUNPKLO) && "Unexpected Opcode!"); @@ -17151,6 +17188,42 @@ static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG) { if (N->getOperand(0).isUndef()) return DAG.getUNDEF(N->getValueType(0)); + // If this is a masked load followed by an UUNPKLO, fold this into a masked + // extending load. We can do this even if this is already a masked + // {z,}extload. + if (N->getOperand(0).getOpcode() == ISD::MLOAD && + N->getOpcode() == AArch64ISD::UUNPKLO) { + MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0)); + SDValue Mask = MLD->getMask(); + SDLoc DL(N); + + if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD && + SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE && + (MLD->getPassThru()->isUndef() || + isZerosVector(MLD->getPassThru().getNode()))) { + unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); + unsigned PgPattern = Mask->getConstantOperandVal(0); + EVT VT = N->getValueType(0); + + // Ensure we can double the size of the predicate pattern + unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); + if (NumElts && + NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) { + Mask = + getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern); + SDValue PassThru = DAG.getConstant(0, DL, VT); + SDValue NewLoad = DAG.getMaskedLoad( + VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask, + PassThru, MLD->getMemoryVT(), MLD->getMemOperand(), + MLD->getAddressingMode(), ISD::ZEXTLOAD); + + DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1)); + + return NewLoad; + } + } + } + return SDValue(); } @@ -17484,6 +17557,50 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +static SDValue performMSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); + SDValue Value = MST->getValue(); + SDValue Mask = MST->getMask(); + SDLoc DL(N); + + // If this is a UZP1 followed by a masked store, fold this into a masked + // truncating store. We can do this even if this is already a masked + // truncstore. 
+ if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() && + MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE && + Value.getValueType().isInteger()) { + Value = Value.getOperand(0); + if (Value.getOpcode() == ISD::BITCAST) { + EVT HalfVT = + Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + EVT InVT = Value.getOperand(0).getValueType(); + + if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) { + unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); + unsigned PgPattern = Mask->getConstantOperandVal(0); + + // Ensure we can double the size of the predicate pattern + unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern); + if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <= + MinSVESize) { + Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1), + PgPattern); + return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0), + MST->getBasePtr(), MST->getOffset(), Mask, + MST->getMemoryVT(), MST->getMemOperand(), + MST->getAddressingMode(), + /*IsTruncating=*/true); + } + } + } + } + + return SDValue(); +} + /// \return true if part of the index was folded into the Base. static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG) { @@ -18191,7 +18308,9 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { Op0ExtV, Op1ExtV, Op->getOperand(2)); } -static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -18234,6 +18353,21 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { } } + // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne) + // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne) + if (DCI.isBeforeLegalize() && VT.isScalarInteger() && + (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) && + LHS->getOpcode() == ISD::BITCAST) { + EVT ToVT = LHS->getValueType(0); + EVT FromVT = LHS->getOperand(0).getValueType(); + if (FromVT.isFixedLengthVector() && + FromVT.getVectorElementType() == MVT::i1) { + LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0)); + LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS); + return DAG.getSetCC(DL, VT, LHS, RHS, Cond); + } + } + return SDValue(); } @@ -19376,13 +19510,15 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::SETCC: - return performSETCCCombine(N, DAG); + return performSETCCCombine(N, DCI, DAG); case ISD::LOAD: if (performTBISimplification(N->getOperand(1), DCI, DAG)) return SDValue(N, 0); break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); + case ISD::MSTORE: + return performMSTORECombine(N, DCI, DAG, Subtarget); case ISD::MGATHER: case ISD::MSCATTER: return performMaskedGatherScatterCombine(N, DCI, DAG); @@ -19407,7 +19543,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performSpliceCombine(N, DAG); case AArch64ISD::UUNPKLO: case AArch64ISD::UUNPKHI: - return performUnpackCombine(N, DAG); + return performUnpackCombine(N, DAG, Subtarget); case AArch64ISD::UZP1: return performUzpCombine(N, DAG); case AArch64ISD::SETCC_MERGE_ZERO: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 
a7b7e5270888..926e7305bab9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4052,6 +4052,12 @@ def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>, def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>; def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>; +// Pattern for FP16 immediates +let Predicates = [HasFullFP16] in { + def : Pat<(f16 fpimm:$in), + (FMOVWHr (MOVi32imm (bitcast_fpimm_to_i32 f16:$in)))>; +} + //===----------------------------------------------------------------------===// // Floating point conversion instruction. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 4032c4667bc7..9b040860cc3c 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -287,6 +287,8 @@ def AArch64fadda_p_node : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWith def AArch64fadda_p : PatFrags<(ops node:$op1, node:$op2, node:$op3), [(AArch64fadda_p_node node:$op1, node:$op2, node:$op3), (AArch64fadda_p_node (SVEAllActive), node:$op2, + (vselect node:$op1, node:$op3, (splat_vector (f16 fpimm_minus0)))), + (AArch64fadda_p_node (SVEAllActive), node:$op2, (vselect node:$op1, node:$op3, (splat_vector (f32 fpimm_minus0)))), (AArch64fadda_p_node (SVEAllActive), node:$op2, (vselect node:$op1, node:$op3, (splat_vector (f64 fpimm_minus0))))]>; @@ -337,6 +339,22 @@ def AArch64bic : PatFrags<(ops node:$op1, node:$op2), def AArch64subr : PatFrag<(ops node:$op1, node:$op2), (sub node:$op2, node:$op1)>; +def AArch64add_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2), + [(int_aarch64_sve_add node:$pred, node:$op1, node:$op2), + (add node:$op1, (vselect node:$pred, node:$op2, (SVEDup0)))]>; +def AArch64sub_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2), + [(int_aarch64_sve_sub node:$pred, node:$op1, node:$op2), + (sub node:$op1, (vselect node:$pred, node:$op2, (SVEDup0)))]>; +def AArch64mla_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_mla node:$pred, node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), + // add(a, select(mask, mul(b, c), splat(0))) -> mla(a, mask, b, c) + (add node:$op1, (vselect node:$pred, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3), (SVEDup0)))]>; +def AArch64mls_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_mls node:$pred, node:$op1, node:$op2, node:$op3), + (sub node:$op1, (AArch64mul_p_oneuse node:$pred, node:$op2, node:$op3)), + // sub(a, select(mask, mul(b, c), splat(0))) -> mls(a, mask, b, c) + (sub node:$op1, (vselect node:$pred, (AArch64mul_p_oneuse (SVEAllActive), node:$op2, node:$op3), (SVEDup0)))]>; let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; @@ -359,8 +377,8 @@ let Predicates = [HasSVEorSME] in { defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", AArch64bic>; - defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>; - defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; + defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", AArch64add_m1, DestructiveBinaryComm>; + defm 
SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", AArch64sub_m1, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", int_aarch64_sve_orr, DestructiveBinaryComm>; @@ -391,8 +409,8 @@ let Predicates = [HasSVEorSME] in { defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>; + defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", AArch64mla_m1>; + defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", AArch64mls_m1>; // SVE predicated integer reductions. defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>; @@ -712,6 +730,12 @@ let Predicates = [HasSVEorSME] in { (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. + def : Pat<(nxv2f16 (splat_vector (f16 fpimm:$val))), + (DUP_ZR_H (MOVi32imm (bitcast_fpimm_to_i32 f16:$val)))>; + def : Pat<(nxv4f16 (splat_vector (f16 fpimm:$val))), + (DUP_ZR_H (MOVi32imm (bitcast_fpimm_to_i32 f16:$val)))>; + def : Pat<(nxv8f16 (splat_vector (f16 fpimm:$val))), + (DUP_ZR_H (MOVi32imm (bitcast_fpimm_to_i32 f16:$val)))>; def : Pat<(nxv2f32 (splat_vector (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; def : Pat<(nxv4f32 (splat_vector (f32 fpimm:$val))), diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 1b65589416c3..2f20232e452d 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -350,6 +350,14 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } +uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // AArch64 instructions are always 4 bytes wide, so there's no point + // in skipping any smaller number of bytes if an instruction can't + // be decoded. 
+ return 4; +} + static MCSymbolizer * createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, LLVMSymbolLookupCallback SymbolLookUp, diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 6761d449a7f4..b9f78546b89b 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -30,6 +30,9 @@ public: MCDisassembler::DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &CStream) const override; + + uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 04bc91318da8..d655caa80ba8 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -66,8 +66,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AArch64O0PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index ba206bac68d1..dfb531cda7e9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -355,8 +355,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index d7959a82c484..eab1de94e9c8 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -997,8 +997,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AArch64PostLegalizerLoweringInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 275949c5ee64..50bae68b4d33 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -370,8 +370,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git 
a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 7cdd4c4af95e..36daecf634d7 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2958,8 +2958,7 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op, - SDPatternOperator outerop, SDPatternOperator mulop> { +multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> { def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>; def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>; def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>; @@ -2969,15 +2968,6 @@ multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op, def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; - - def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)), - (!cast<Instruction>(NAME # _B) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)), - (!cast<Instruction>(NAME # _H) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)), - (!cast<Instruction>(NAME # _S) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)), - (!cast<Instruction>(NAME # _D) $pred, $Op1, $Op2, $Op3)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index b4a8766d682e..56a9a30bc59a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -29,6 +29,8 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" @@ -43,6 +45,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <tuple> #include <vector> #define DEBUG_TYPE "amdgpu-lower-module-lds" @@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { + // The verifier rejects used lists containing an inttoptr of a constant + // so remove the variables from these lists before replaceAllUsesWith + SmallPtrSet<Constant *, 32> LocalVarsSet; for (GlobalVariable *LocalVar : LocalVars) if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) @@ -146,12 +152,59 @@ public: } bool runOnModule(Module &M) override { + LLVMContext &Ctx = M.getContext(); CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); + + // Move variables used by functions into amdgcn.module.lds std::vector<GlobalVariable *> ModuleScopeVariables = AMDGPU::findVariablesToLower(M, nullptr); - Changed |= processUsedLDS(CG, M, ModuleScopeVariables); + if (!ModuleScopeVariables.empty()) { + std::string VarName = "llvm.amdgcn.module.lds"; + + GlobalVariable 
*SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, ModuleScopeVariables); + + appendToCompilerUsed( + M, {static_cast<GlobalValue *>( + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + removeFromUsedLists(M, ModuleScopeVariables); + replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV, + LDSVarsToConstantGEP, + [](Use &) { return true; }); + + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. + + IRBuilder<> Builder(Ctx); + for (Function &Func : M.functions()) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. + Func.addFnAttr("amdgpu-elide-module-lds"); + } + } + } + + Changed = true; + } + // Move variables used by kernels into per-kernel instances for (Function &F : M.functions()) { if (F.isDeclaration()) continue; @@ -159,9 +212,37 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; + std::vector<GlobalVariable *> KernelUsedVariables = AMDGPU::findVariablesToLower(M, &F); - Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F); + + // Replace all constant uses with instructions if they belong to the + // current kernel. Unnecessary, removing will cause test churn. + for (size_t I = 0; I < KernelUsedVariables.size(); I++) { + GlobalVariable *GV = KernelUsedVariables[I]; + for (User *U : make_early_inc_range(GV->users())) { + if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) + AMDGPU::replaceConstantUsesInFunction(C, &F); + } + GV->removeDeadConstantUsers(); + } + + if (!KernelUsedVariables.empty()) { + std::string VarName = + (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str(); + GlobalVariable *SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, KernelUsedVariables); + + removeFromUsedLists(M, KernelUsedVariables); + replaceLDSVariablesWithStruct( + M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + return I && I->getFunction() == &F; + }); + Changed = true; + } } return Changed; @@ -212,16 +293,18 @@ private: return Changed; } - bool processUsedLDS(CallGraph const &CG, Module &M, - std::vector<GlobalVariable *> const &LDSVarsToTransform, - Function *F = nullptr) { + std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>> + createLDSVariableReplacement( + Module &M, std::string VarName, + std::vector<GlobalVariable *> const &LDSVarsToTransform) { + // Create a struct instance containing LDSVarsToTransform and map from those + // variables to ConstantExprGEP + // Variables may be introduced to meet alignment requirements. 
No aliasing + // metadata is useful for these as they have no uses. Erased before return. + LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); - - if (LDSVarsToTransform.empty()) { - // No variables to rewrite, no changes made. - return false; - } + assert(!LDSVarsToTransform.empty()); SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(LDSVarsToTransform.size()); @@ -234,9 +317,10 @@ private: performOptimizedStructLayout(LayoutFields); std::vector<GlobalVariable *> LocalVars; + BitVector IsPaddingField; LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large + IsPaddingField.reserve(LDSVarsToTransform.size()); { - // This usually won't need to insert any padding, perhaps avoid the alloc uint64_t CurrentOffset = 0; for (size_t I = 0; I < LayoutFields.size(); I++) { GlobalVariable *FGV = static_cast<GlobalVariable *>( @@ -256,10 +340,12 @@ private: M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false)); + IsPaddingField.push_back(true); CurrentOffset += Padding; } LocalVars.push_back(FGV); + IsPaddingField.push_back(false); CurrentOffset += LayoutFields[I].Size; } } @@ -270,9 +356,6 @@ private: LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes), [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); - std::string VarName( - F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str() - : "llvm.amdgcn.module.lds"); StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); Align StructAlign = @@ -283,62 +366,65 @@ private: VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); SGV->setAlignment(StructAlign); - if (!F) { - appendToCompilerUsed( - M, {static_cast<GlobalValue *>( - ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + DenseMap<GlobalVariable *, Constant *> Map; + Type *I32 = Type::getInt32Ty(Ctx); + for (size_t I = 0; I < LocalVars.size(); I++) { + GlobalVariable *GV = LocalVars[I]; + Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; + Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true); + if (IsPaddingField[I]) { + assert(GV->use_empty()); + GV->eraseFromParent(); + } else { + Map[GV] = GEP; + } } + assert(Map.size() == LDSVarsToTransform.size()); + return {SGV, std::move(Map)}; + } - // The verifier rejects used lists containing an inttoptr of a constant - // so remove the variables from these lists before replaceAllUsesWith - removeFromUsedLists(M, LocalVars); + template <typename PredicateTy> + void replaceLDSVariablesWithStruct( + Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform, + GlobalVariable *SGV, + DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP, + PredicateTy Predicate) { + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); // Create alias.scope and their lists. Each field in the new structure // does not alias with all other fields. 
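The comment above introduces the alias metadata scheme used after the merge: each struct field gets its own anonymous scope, and every other field's scope lands on its noalias list. Here is a hedged, standalone sketch of that pairing in plain C++ with illustrative names only (the pass itself builds real !alias.scope/!noalias nodes via MDBuilder, as the code below shows):

```cpp
// Toy model of per-field alias scopes: field I aliases only scope sI; all
// other fields' scopes form its noalias set.
#include <iostream>

int main() {
  const int NumFields = 3; // a merged LDS struct with three fields
  for (int I = 0; I < NumFields; ++I) {
    std::cout << "field " << I << ": alias.scope = { s" << I
              << " }, noalias = {";
    for (int J = 0; J < NumFields; ++J)
      if (J != I)
        std::cout << " s" << J;
    std::cout << " }\n";
  }
}
```

With scopes assigned this way, accesses to different fields of the merged struct can still be reordered even though they now share one underlying object.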
SmallVector<MDNode *> AliasScopes; SmallVector<Metadata *> NoAliasList; - if (LocalVars.size() > 1) { + const size_t NumberVars = LDSVarsToTransform.size(); + if (NumberVars > 1) { MDBuilder MDB(Ctx); - AliasScopes.reserve(LocalVars.size()); + AliasScopes.reserve(NumberVars); MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); - for (size_t I = 0; I < LocalVars.size(); I++) { + for (size_t I = 0; I < NumberVars; I++) { MDNode *Scope = MDB.createAnonymousAliasScope(Domain); AliasScopes.push_back(Scope); } NoAliasList.append(&AliasScopes[1], AliasScopes.end()); } - // Replace uses of ith variable with a constantexpr to the ith field of the - // instance that will be allocated by AMDGPUMachineFunction - Type *I32 = Type::getInt32Ty(Ctx); - for (size_t I = 0; I < LocalVars.size(); I++) { - GlobalVariable *GV = LocalVars[I]; - Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; - Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx); - if (F) { - // Replace all constant uses with instructions if they belong to the - // current kernel. - for (User *U : make_early_inc_range(GV->users())) { - if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) - AMDGPU::replaceConstantUsesInFunction(C, F); - } - - GV->removeDeadConstantUsers(); + // Replace uses of ith variable with a constantexpr to the corresponding + // field of the instance that will be allocated by AMDGPUMachineFunction + for (size_t I = 0; I < NumberVars; I++) { + GlobalVariable *GV = LDSVarsToTransform[I]; + Constant *GEP = LDSVarsToConstantGEP[GV]; - GV->replaceUsesWithIf(GEP, [F](Use &U) { - Instruction *I = dyn_cast<Instruction>(U.getUser()); - return I && I->getFunction() == F; - }); - } else { - GV->replaceAllUsesWith(GEP); - } + GV->replaceUsesWithIf(GEP, Predicate); if (GV->use_empty()) { GV->eraseFromParent(); } - uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I); - Align A = commonAlignment(StructAlign, Off); + APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0); + GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff); + uint64_t Offset = APOff.getZExtValue(); + + Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset); if (I) NoAliasList[I - 1] = AliasScopes[I - 1]; @@ -349,32 +435,6 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - - // This ensures the variable is allocated when called functions access it. - // It also lets other passes, specifically PromoteAlloca, accurately - // calculate how much LDS will be used by the kernel after lowering. - if (!F) { - IRBuilder<> Builder(Ctx); - for (Function &Func : M.functions()) { - if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - const CallGraphNode *N = CG[&Func]; - const bool CalleesRequireModuleLDS = N->size() > 0; - - if (CalleesRequireModuleLDS) { - // If a function this kernel might call requires module LDS, - // annotate the kernel to let later passes know it will allocate - // this structure, even if not apparent from the IR. - markUsedByKernel(Builder, &Func, SGV); - } else { - // However if we are certain this kernel cannot call a function that - // requires module LDS, annotate the kernel so the backend can elide - // the allocation without repeating callgraph walks. 
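The `Align A = commonAlignment(...)` call above recomputes what alignment still holds at each field: the struct's alignment only survives at offsets that are themselves suitably aligned. A small standalone model of that arithmetic, assuming a power-of-two struct alignment (LLVM's helper reduces to the lowest set bit of `Align | Offset`):

```cpp
#include <cstdint>
#include <iostream>

// Largest power of two guaranteed at a StructAlign-aligned base plus Offset.
uint64_t commonAlign(uint64_t StructAlign, uint64_t Offset) {
  if (Offset == 0)
    return StructAlign; // the base itself keeps the full alignment
  uint64_t OffsetAlign = Offset & -Offset; // lowest set bit of the offset
  return StructAlign < OffsetAlign ? StructAlign : OffsetAlign;
}

int main() {
  // A 16-byte-aligned struct: offset 0 keeps 16, offset 4 drops to 4,
  // offset 24 keeps 8.
  std::cout << commonAlign(16, 0) << ' ' << commonAlign(16, 4) << ' '
            << commonAlign(16, 24) << '\n'; // prints: 16 4 8
}
```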
- Func.addFnAttr("amdgpu-elide-module-lds"); - } - } - } - } - return true; } void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 753f7edc9385..98b5031071cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -29,7 +29,7 @@ public: virtual ~AMDGPUMIRFormatter() = default; /// Implement target specific parsing of target custom pseudo source value. - virtual bool + bool parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index bfe2e9b66ed4..98e9907068f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -191,8 +191,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 04da14cc4916..859deae86f35 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -9,6 +9,18 @@ /// \file /// This contains a MachineSchedStrategy implementation for maximizing wave /// occupancy on GCN hardware. +/// +/// This pass will apply multiple scheduling stages to the same function. +/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual +/// entry point for the scheduling of those regions is +/// GCNScheduleDAGMILive::runSchedStages. + +/// Generally, the reason for having multiple scheduling stages is to account +/// for the kernel-wide effect of register usage on occupancy. Usually, only a +/// few scheduling regions will have register pressure high enough to limit +/// occupancy for the kernel, so constraints can be relaxed to improve ILP in +/// other regions. 
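To make the staging described in this header comment concrete, here is a minimal, hypothetical model of the stage-over-regions driver; all types and names are illustrative, and the real loop appears further down as GCNScheduleDAGMILive::runSchedStages:

```cpp
#include <iostream>
#include <vector>

struct Stage {
  virtual ~Stage() = default;
  virtual const char *name() const = 0;
  virtual bool initStage() { return true; }     // false: skip the whole stage
  virtual bool initRegion(int) { return true; } // false: skip this region
};

struct Initial : Stage {
  const char *name() const override { return "initial"; }
};

struct Unclustered : Stage {
  const char *name() const override { return "unclustered"; }
  bool initRegion(int R) override { return R == 2; } // only flagged regions
};

int main() {
  std::vector<int> Regions = {0, 1, 2}; // recorded on the first pass
  Initial S0;
  Unclustered S1;
  std::vector<Stage *> Stages = {&S0, &S1};
  for (Stage *S : Stages) {
    if (!S->initStage())
      continue; // e.g. nothing was marked for rescheduling
    for (int R : Regions) {
      if (!S->initRegion(R))
        continue; // region already satisfied its constraints
      std::cout << S->name() << " schedules region " << R << '\n';
    }
  }
}
```

Each later stage only revisits regions that the earlier, stricter rounds left pressure-limited, which is what keeps the repeated scheduling affordable.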
+/// //===----------------------------------------------------------------------===// #include "GCNSchedStrategy.h" @@ -20,9 +32,9 @@ using namespace llvm; GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( - const MachineSchedContext *C) : - GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false), - HasExcessPressure(false), MF(nullptr) { } + const MachineSchedContext *C) + : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), + HasClusteredNodes(false), HasExcessPressure(false) {} void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { return SU; } -GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S) : - ScheduleDAGMILive(C, std::move(S)), - ST(MF.getSubtarget<GCNSubtarget>()), - MFI(*MF.getInfo<SIMachineFunctionInfo>()), - StartingOccupancy(MFI.getOccupancy()), - MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) { +GCNScheduleDAGMILive::GCNScheduleDAGMILive( + MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) + : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), + MFI(*MF.getInfo<SIMachineFunctionInfo>()), + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { - if (Stage == Collect) { - // Just record regions at the first pass. - Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); - return; - } - - std::vector<MachineInstr*> Unsched; - Unsched.reserve(NumRegionInstrs); - for (auto &I : *this) { - Unsched.push_back(&I); - } - - GCNRegPressure PressureBefore; - if (LIS) { - PressureBefore = Pressure[RegionIdx]; - - LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; - GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); - dbgs() << "Region live-in pressure: "; - llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); - dbgs() << "Region register pressure: "; - PressureBefore.print(dbgs())); - } - - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - // Set HasClusteredNodes to true for late stages where we have already - // collected it. That way pickNode() will not scan SDep's when not needed. - S.HasClusteredNodes = Stage > InitialSchedule; - S.HasExcessPressure = false; - ScheduleDAGMILive::schedule(); - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); - RescheduleRegions[RegionIdx] = false; - if (Stage == InitialSchedule && S.HasClusteredNodes) - RegionsWithClusters[RegionIdx] = true; - if (S.HasExcessPressure) - RegionsWithHighRP[RegionIdx] = true; - - if (!LIS) - return; - - // Check the results of scheduling. 
- auto PressureAfter = getRealRegPressure(); - - LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; - PressureAfter.print(dbgs())); - - if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { - Pressure[RegionIdx] = PressureAfter; - RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST) == MinOccupancy; - - LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); - return; - } - - unsigned WavesAfter = - std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST)); - unsigned WavesBefore = - std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST)); - LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore - << ", after " << WavesAfter << ".\n"); - - // We may not be able to keep the current target occupancy because of the just - // scheduled region. We might still be able to revert scheduling if the - // occupancy before was higher, or if the current schedule has register - // pressure higher than the excess limits which could lead to more spilling. - unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); - - // Allow memory bound functions to drop to 4 waves if not limited by an - // attribute. - if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && - WavesAfter >= MFI.getMinAllowedOccupancy()) { - LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " - << MFI.getMinAllowedOccupancy() << " waves\n"); - NewOccupancy = WavesAfter; - } - - if (NewOccupancy < MinOccupancy) { - MinOccupancy = NewOccupancy; - MFI.limitOccupancy(MinOccupancy); - RegionsWithMinOcc.reset(); - LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " - << MinOccupancy << ".\n"); - } - - unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum(false) > MaxVGPRs || - PressureAfter.getAGPRNum() > MaxVGPRs || - PressureAfter.getSGPRNum() > MaxSGPRs) { - RescheduleRegions[RegionIdx] = true; - RegionsWithHighRP[RegionIdx] = true; - } - - // If this condition is true, then either the occupancy before and after - // scheduling is the same, or we are allowing the occupancy to drop because - // the function is memory bound. Even if we are OK with the current occupancy, - // we still need to verify that we will not introduce any extra chance of - // spilling. 
- if (WavesAfter >= MinOccupancy) { - if (Stage == UnclusteredReschedule && - !PressureAfter.less(ST, PressureBefore)) { - LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); - } else if (WavesAfter > MFI.getMinWavesPerEU() || - PressureAfter.less(ST, PressureBefore) || - !RescheduleRegions[RegionIdx]) { - Pressure[RegionIdx] = PressureAfter; - RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST) == MinOccupancy; - if (!RegionsWithClusters[RegionIdx] && - (Stage + 1) == UnclusteredReschedule) - RescheduleRegions[RegionIdx] = false; - return; - } else { - LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); - } - } - - RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST) == MinOccupancy; - LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] || - (Stage + 1) != UnclusteredReschedule; - RegionEnd = RegionBegin; - int SkippedDebugInstr = 0; - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) { - ++SkippedDebugInstr; - continue; - } - - if (MI->getIterator() != RegionEnd) { - BB->remove(MI); - BB->insert(RegionEnd, MI); - if (!MI->isDebugInstr()) - LIS->handleMove(*MI, true); - } - // Reset read-undef flags and update them later. - for (auto &Op : MI->operands()) - if (Op.isReg() && Op.isDef()) - Op.setIsUndef(false); - RegisterOperands RegOpers; - RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); - if (!MI->isDebugInstr()) { - if (ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *LIS); - } - } - RegionEnd = MI->getIterator(); - ++RegionEnd; - LLVM_DEBUG(dbgs() << "Scheduling " << *MI); - } - - // After reverting schedule, debug instrs will now be at the end of the block - // and RegionEnd will point to the first debug instr. Increment RegionEnd - // pass debug instrs to the actual end of the scheduling region. - while (SkippedDebugInstr-- > 0) - ++RegionEnd; - - // If Unsched.front() instruction is a debug instruction, this will actually - // shrink the region since we moved all debug instructions to the end of the - // block. Find the first instruction that is not a debug instruction. - RegionBegin = Unsched.front()->getIterator(); - if (RegionBegin->isDebugInstr()) { - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) - continue; - RegionBegin = MI->getIterator(); - break; - } - } - - // Then move the debug instructions back into their correct place and set - // RegionBegin and RegionEnd if needed. - placeDebugValues(); - - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + // Collect all scheduling regions. The actual scheduling is performed in + // GCNScheduleDAGMILive::finalizeSchedule. 
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); } -GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { +GCNRegPressure +GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } -void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { +void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, + const MachineBasicBlock *MBB) { GCNDownwardRPTracker RPTracker(*LIS); // If the block has the only successor then live-ins of that successor are @@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { RPTracker.reset(*I, &LRS); } - for ( ; ; ) { + for (;;) { I = RPTracker.getNext(); if (Regions[CurRegion].first == I || NonDbgMI == I) { @@ -588,8 +420,9 @@ GCNScheduleDAGMILive::getBBLiveInMap() const { } void GCNScheduleDAGMILive::finalizeSchedule() { - LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - + // Start actual scheduling here. This function is called by the base + // MachineScheduler after all regions have been recorded by + // GCNScheduleDAGMILive::schedule(). LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); RescheduleRegions.resize(Regions.size()); @@ -601,142 +434,470 @@ void GCNScheduleDAGMILive::finalizeSchedule() { RegionsWithHighRP.reset(); RegionsWithMinOcc.reset(); + runSchedStages(); +} + +void GCNScheduleDAGMILive::runSchedStages() { + LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); + InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this); + UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this); + ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule, + *this); + PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this); + GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3}; + if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); - std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + for (auto *Stage : SchedStages) { + if (!Stage->initGCNSchedStage()) + continue; - do { - Stage++; - RegionIdx = 0; - MachineBasicBlock *MBB = nullptr; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; + // Setup for scheduling the region and check whether it should be skipped. + if (!Stage->initGCNRegion()) { + Stage->advanceRegion(); + exitRegion(); + continue; + } - if (Stage > InitialSchedule) { - if (!LIS) - break; + ScheduleDAGMILive::schedule(); + Stage->finalizeGCNRegion(); + } - // Retry function scheduling if we found resulting occupancy and it is - // lower than used for first pass scheduling. This will give more freedom - // to schedule low register pressure blocks. - // Code is partially copied from MachineSchedulerBase::scheduleRegions(). 
+ Stage->finalizeGCNSchedStage(); + } +} - if (Stage == UnclusteredReschedule) { - if (RescheduleRegions.none()) - continue; - LLVM_DEBUG(dbgs() << - "Retrying function scheduling without clustering.\n"); - } +#ifndef NDEBUG +raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { + switch (StageID) { + case GCNSchedStageID::InitialSchedule: + OS << "Initial Schedule"; + break; + case GCNSchedStageID::UnclusteredReschedule: + OS << "Unclustered Reschedule"; + break; + case GCNSchedStageID::ClusteredLowOccupancyReschedule: + OS << "Clustered Low Occupancy Reschedule"; + break; + case GCNSchedStageID::PreRARematerialize: + OS << "Pre-RA Rematerialize"; + break; + } + return OS; +} +#endif - if (Stage == ClusteredLowOccupancyReschedule) { - if (StartingOccupancy <= MinOccupancy) - break; +GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)), + MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} - LLVM_DEBUG( - dbgs() - << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); - } +bool GCNSchedStage::initGCNSchedStage() { + if (!DAG.LIS) + return false; - if (Stage == PreRARematerialize) { - if (RegionsWithMinOcc.none() || Regions.size() == 1) - break; + LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); + return true; +} - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // Check maximum occupancy - if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == - MinOccupancy) - break; +bool UnclusteredRescheduleStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - // FIXME: This pass will invalidate cached MBBLiveIns for regions - // inbetween the defs and region we sinked the def to. Cached pressure - // for regions where a def is sinked from will also be invalidated. Will - // need to be fixed if there is another pass after this pass. - static_assert(LastStage == PreRARematerialize, - "Passes after PreRARematerialize are not supported"); + if (DAG.RescheduleRegions.none()) + return false; - collectRematerializableInstructions(); - if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) - break; + SavedMutations.swap(DAG.Mutations); - LLVM_DEBUG( - dbgs() << "Retrying function scheduling with improved occupancy of " - << MinOccupancy << " from rematerializing\n"); - } - } + LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n"); + return true; +} - if (Stage == UnclusteredReschedule) - SavedMutations.swap(Mutations); +bool ClusteredLowOccStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - for (auto Region : Regions) { - if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) && - !RescheduleRegions[RegionIdx]) || - (Stage == ClusteredLowOccupancyReschedule && - !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) { + // Don't bother trying to improve ILP in lower RP regions if occupancy has not + // been dropped. All regions will have already been scheduled with the ideal + // occupancy targets. 
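Earlier in this hunk, the #ifndef NDEBUG block defines operator<< for GCNSchedStageID, the standard pattern for making an enum class printable in debug output. A self-contained sketch of the same pattern, with std::ostream standing in for raw_ostream:

```cpp
#include <cassert>
#include <iostream>

enum class StageID : unsigned {
  InitialSchedule,
  UnclusteredReschedule,
  ClusteredLowOccupancyReschedule,
  PreRARematerialize
};

std::ostream &operator<<(std::ostream &OS, StageID S) {
  switch (S) {
  case StageID::InitialSchedule:
    return OS << "Initial Schedule";
  case StageID::UnclusteredReschedule:
    return OS << "Unclustered Reschedule";
  case StageID::ClusteredLowOccupancyReschedule:
    return OS << "Clustered Low Occupancy Reschedule";
  case StageID::PreRARematerialize:
    return OS << "Pre-RA Rematerialize";
  }
  return OS;
}

StageID nextStage(StageID S) {
  assert(S != StageID::PreRARematerialize && "no stage after the last one");
  return static_cast<StageID>(static_cast<unsigned>(S) + 1);
}

int main() {
  for (StageID S = StageID::InitialSchedule;; S = nextStage(S)) {
    std::cout << S << '\n';
    if (S == StageID::PreRARematerialize)
      break;
  }
}
```

The check that follows skips the clustered-low-occupancy stage entirely when the earlier rounds never had to lower occupancy.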
+ if (DAG.StartingOccupancy <= DAG.MinOccupancy) + return false; - ++RegionIdx; - continue; - } + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with lowest recorded occupancy " + << DAG.MinOccupancy << ".\n"); + return true; +} - RegionBegin = Region.first; - RegionEnd = Region.second; +bool PreRARematStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - if (RegionBegin->getParent() != MBB) { - if (MBB) finishBlock(); - MBB = RegionBegin->getParent(); - startBlock(MBB); - if (Stage == InitialSchedule) - computeBlockPressure(MBB); - } + if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1) + return false; - unsigned NumRegionInstrs = std::distance(begin(), end()); - enterRegion(MBB, begin(), end(), NumRegionInstrs); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // Check maximum occupancy + if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == + DAG.MinOccupancy) + return false; - // Skip empty scheduling regions (0 or 1 schedulable instructions). - if (begin() == end() || begin() == std::prev(end())) { - exitRegion(); - ++RegionIdx; - continue; - } + // FIXME: This pass will invalidate cached MBBLiveIns for regions + // inbetween the defs and region we sinked the def to. Cached pressure + // for regions where a def is sinked from will also be invalidated. Will + // need to be fixed if there is another pass after this pass. - LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); - LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " - << MBB->getName() << "\n From: " << *begin() - << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + collectRematerializableInstructions(); + if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) + return false; - schedule(); + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with improved occupancy of " + << DAG.MinOccupancy << " from rematerializing\n"); + return true; +} + +void GCNSchedStage::finalizeGCNSchedStage() { + DAG.finishBlock(); + LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); +} + +void UnclusteredRescheduleStage::finalizeGCNSchedStage() { + SavedMutations.swap(DAG.Mutations); + + GCNSchedStage::finalizeGCNSchedStage(); +} + +bool GCNSchedStage::initGCNRegion() { + // Check whether this new region is also a new block. + if (DAG.RegionBegin->getParent() != CurrentMBB) + setupNewBlock(); + + unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); + DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); + + // Skip empty scheduling regions (0 or 1 schedulable instructions). + if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) + return false; + + LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); + LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) + << " " << CurrentMBB->getName() + << "\n From: " << *DAG.begin() << " To: "; + if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + + // Save original instruction order before scheduling for possible revert. 
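Unsched, filled in just below, is the undo log for the region: if the new schedule turns out worse, the original order can be replayed. A toy model of that save-then-maybe-revert flow (stand-in data, not the scheduler itself):

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> Region = {3, 1, 2}; // stand-in for the region's MIs
  std::vector<int> Unsched(Region);    // snapshot before scheduling

  std::sort(Region.begin(), Region.end()); // stand-in for scheduling

  bool Regressed = true; // pretend checkScheduling flagged the result
  if (Regressed)
    Region = Unsched; // replay the recorded order

  for (int I : Region)
    std::cout << I << ' ';
  std::cout << '\n'; // prints 3 1 2 again
}
```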
+ Unsched.clear(); + Unsched.reserve(DAG.NumRegionInstrs); + for (auto &I : DAG) + Unsched.push_back(&I); + + PressureBefore = DAG.Pressure[RegionIdx]; + + LLVM_DEBUG( + dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; PressureBefore.print(dbgs())); + + // Set HasClusteredNodes to true for late stages where we have already + // collected it. That way pickNode() will not scan SDep's when not needed. + S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule; + S.HasExcessPressure = false; + + return true; +} + +bool UnclusteredRescheduleStage::initGCNRegion() { + if (!DAG.RescheduleRegions[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +bool ClusteredLowOccStage::initGCNRegion() { + // We may need to reschedule this region if it doesn't have clusters so it + // wasn't rescheduled in the last stage, or if we found it was testing + // critical register pressure limits in the unclustered reschedule stage. The + // latter is because we may not have been able to raise the min occupancy in + // the previous stage so the region may be overly constrained even if it was + // already rescheduled. + if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +bool PreRARematStage::initGCNRegion() { + if (!DAG.RescheduleRegions[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +void GCNSchedStage::setupNewBlock() { + if (CurrentMBB) + DAG.finishBlock(); + + CurrentMBB = DAG.RegionBegin->getParent(); + DAG.startBlock(CurrentMBB); + // Get real RP for the region if it hasn't been calculated before. After the + // initial schedule stage real RP will be collected after scheduling. + if (StageID == GCNSchedStageID::InitialSchedule) + DAG.computeBlockPressure(RegionIdx, CurrentMBB); +} + +void GCNSchedStage::finalizeGCNRegion() { + DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); + DAG.RescheduleRegions[RegionIdx] = false; + if (S.HasExcessPressure) + DAG.RegionsWithHighRP[RegionIdx] = true; + + // Revert scheduling if we have dropped occupancy or there is some other + // reason that the original schedule is better. + checkScheduling(); + + DAG.exitRegion(); + RegionIdx++; +} + +void InitialScheduleStage::finalizeGCNRegion() { + // Record which regions have clustered nodes for the next unclustered + // reschedule stage. + assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); + if (S.HasClusteredNodes) + DAG.RegionsWithClusters[RegionIdx] = true; + + GCNSchedStage::finalizeGCNRegion(); +}
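checkScheduling, which opens below, reduces to comparing wave counts clamped to the stage's target. A standalone model of that arithmetic with made-up numbers (the real values come from GCNRegPressure::getOccupancy and the machine function info):

```cpp
#include <algorithm>
#include <iostream>

int main() {
  unsigned TargetOccupancy = 8, MinAllowedOccupancy = 4, MinOccupancy = 7;
  unsigned OccBefore = 8, OccAfter = 6; // derived from register pressure

  unsigned WavesBefore = std::min(TargetOccupancy, OccBefore);
  unsigned WavesAfter = std::min(TargetOccupancy, OccAfter);

  // Keep the better of the two, unless a memory-bound function is allowed
  // to drop as far as its minimum allowed occupancy.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
      WavesAfter >= MinAllowedOccupancy)
    NewOccupancy = WavesAfter;

  if (NewOccupancy < MinOccupancy)
    MinOccupancy = NewOccupancy; // lowers the target for the whole function

  std::cout << "waves " << WavesBefore << " -> " << WavesAfter
            << ", function occupancy now " << MinOccupancy << '\n';
}
```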
+ +void GCNSchedStage::checkScheduling() { + // Check the results of scheduling. + PressureAfter = DAG.getRealRegPressure(RegionIdx); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; + PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { + DAG.Pressure[RegionIdx] = PressureAfter; + DAG.RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; + + // Early out if we have achieved the occupancy target. + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + return; + } + + unsigned WavesAfter = + std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST)); + unsigned WavesBefore = + std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); + LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore + << ", after " << WavesAfter << ".\n"); + + // We may not be able to keep the current target occupancy because of the just + // scheduled region. We might still be able to revert scheduling if the + // occupancy before was higher, or if the current schedule has register + // pressure higher than the excess limits which could lead to more spilling. + unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + + // Allow memory bound functions to drop to 4 waves if not limited by an + // attribute. + if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && + WavesAfter >= MFI.getMinAllowedOccupancy()) { + LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " + << MFI.getMinAllowedOccupancy() << " waves\n"); + NewOccupancy = WavesAfter; + } + + if (NewOccupancy < DAG.MinOccupancy) { + DAG.MinOccupancy = NewOccupancy; + MFI.limitOccupancy(DAG.MinOccupancy); + DAG.RegionsWithMinOcc.reset(); + LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " + << DAG.MinOccupancy << ".\n"); + } - exitRegion(); - ++RegionIdx + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + if (PressureAfter.getVGPRNum(false) > MaxVGPRs || + PressureAfter.getAGPRNum() > MaxVGPRs || + PressureAfter.getSGPRNum() > MaxSGPRs) { + DAG.RescheduleRegions[RegionIdx] = true; + DAG.RegionsWithHighRP[RegionIdx] = true; + } + + // Revert if this region's schedule would cause a drop in occupancy or + // spilling. + if (shouldRevertScheduling(WavesAfter)) { + revertScheduling(); + } else { + DAG.Pressure[RegionIdx] = PressureAfter; + DAG.RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; + } +} + +bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { + if (WavesAfter < DAG.MinOccupancy) + return true; + + return false; +} + +bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); + // Don't reschedule the region in the next stage if it doesn't have clusters. + if (!DAG.RegionsWithClusters[RegionIdx]) + DAG.RescheduleRegions[RegionIdx] = false; + + return false; +}
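The shouldRevertScheduling overrides on either side of this point all follow one shape: consult the shared occupancy check first, then add a stage-specific reason. A minimal sketch of that layered virtual predicate (hypothetical types, not the pass itself):

```cpp
#include <iostream>

struct StageCheck {
  unsigned MinOccupancy = 7;
  virtual ~StageCheck() = default;
  virtual bool shouldRevert(unsigned WavesAfter) const {
    return WavesAfter < MinOccupancy; // shared rule: never lose occupancy
  }
};

struct UnclusteredCheck : StageCheck {
  bool PressureReduced = false; // did dropping clustering actually help?
  bool shouldRevert(unsigned WavesAfter) const override {
    if (StageCheck::shouldRevert(WavesAfter)) // base check first
      return true;
    return !PressureReduced; // stage-specific reason to revert
  }
};

int main() {
  UnclusteredCheck C;
  std::cout << std::boolalpha << C.shouldRevert(8) << '\n'; // true: no RP win
}
```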
+ +bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + // If RP is not reduced in the unclustered reschedule stage, revert to the old + // schedule. + if (!PressureAfter.less(ST, PressureBefore)) { + LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); + return true; + } + + return false; +} + +bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + return false; +} + +bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + return false; +} + +bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { + if (WavesAfter <= MFI.getMinWavesPerEU() && + !PressureAfter.less(ST, PressureBefore) && + DAG.RescheduleRegions[RegionIdx]) { + LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); + return true; + } + + return false; +} + +void GCNSchedStage::revertScheduling() { + DAG.RegionsWithMinOcc[RegionIdx] = + PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; + LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + DAG.RescheduleRegions[RegionIdx] = + DAG.RegionsWithClusters[RegionIdx] || + (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule; + DAG.RegionEnd = DAG.RegionBegin; + int SkippedDebugInstr = 0; + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) { + ++SkippedDebugInstr; + continue; + } + + if (MI->getIterator() != DAG.RegionEnd) { + DAG.BB->remove(MI); + DAG.BB->insert(DAG.RegionEnd, MI); + if (!MI->isDebugInstr()) + DAG.LIS->handleMove(*MI, true); + } + + // Reset read-undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); + if (!MI->isDebugInstr()) { + if (DAG.ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *DAG.LIS); + } } - finishBlock(); + DAG.RegionEnd = MI->getIterator(); + ++DAG.RegionEnd; + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); + } + + // After reverting schedule, debug instrs will now be at the end of the block + // and RegionEnd will point to the first debug instr. Increment RegionEnd + // past debug instrs to the actual end of the scheduling region. + while (SkippedDebugInstr-- > 0) + ++DAG.RegionEnd; + + // If Unsched.front() instruction is a debug instruction, this will actually + // shrink the region since we moved all debug instructions to the end of the + // block. Find the first instruction that is not a debug instruction. + DAG.RegionBegin = Unsched.front()->getIterator(); + if (DAG.RegionBegin->isDebugInstr()) { + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) + continue; + DAG.RegionBegin = MI->getIterator(); + break; + } + } + + // Then move the debug instructions back into their correct place and set + // RegionBegin and RegionEnd if needed.
+ DAG.placeDebugValues(); - if (Stage == UnclusteredReschedule) - SavedMutations.swap(Mutations); - } while (Stage != LastStage); + DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); } -void GCNScheduleDAGMILive::collectRematerializableInstructions() { - const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { +void PreRARematStage::collectRematerializableInstructions() { + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); + for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); - if (!LIS->hasInterval(Reg)) + if (!DAG.LIS->hasInterval(Reg)) continue; // TODO: Handle AGPR and SGPR rematerialization - if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) || - !MRI.hasOneNonDBGUse(Reg)) + if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || + !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg)) continue; - MachineOperand *Op = MRI.getOneDef(Reg); + MachineOperand *Op = DAG.MRI.getOneDef(Reg); MachineInstr *Def = Op->getParent(); if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) continue; - MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg); + MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg); if (Def->getParent() == UseI->getParent()) continue; @@ -744,10 +905,10 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() { // live-through or used inside regions at MinOccupancy. This means that the // register must be in the live-in set for the region. bool AddedToRematList = false; - for (unsigned I = 0, E = Regions.size(); I != E; ++I) { - auto It = LiveIns[I].find(Reg); - if (It != LiveIns[I].end() && !It->second.none()) { - if (RegionsWithMinOcc[I]) { + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + auto It = DAG.LiveIns[I].find(Reg); + if (It != DAG.LiveIns[I].end() && !It->second.none()) { + if (DAG.RegionsWithMinOcc[I]) { RematerializableInsts[I][Def] = UseI; AddedToRematList = true; } @@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() { } } -bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, - const TargetInstrInfo *TII) { +bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII) { // Temporary copies of cached variables we will be modifying and replacing if // sinking succeeds. SmallVector< @@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; DenseMap<unsigned, GCNRegPressure> NewPressure; BitVector NewRescheduleRegions; + LiveIntervals *LIS = DAG.LIS; - NewRegions.resize(Regions.size()); - NewRescheduleRegions.resize(Regions.size()); + NewRegions.resize(DAG.Regions.size()); + NewRescheduleRegions.resize(DAG.Regions.size()); // Collect only regions that has a rematerializable def as a live-in. SmallSet<unsigned, 16> ImpactedRegions; @@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Make copies of register pressure and live-ins cache that will be updated // as we rematerialize. 
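collectRematerializableInstructions above keeps only virtual registers with a single def and a single non-debug use in some other block, since those can be sunk without cloning. An illustrative filter over a toy register summary (this is not the MachineRegisterInfo API):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct VRegSummary {
  int DefBlock = -1;
  std::vector<int> UseBlocks; // non-debug uses only
};

bool isRematCandidate(const VRegSummary &R) {
  return R.UseBlocks.size() == 1 &&    // single use...
         R.DefBlock != R.UseBlocks[0]; // ...outside the defining block
}

int main() {
  std::map<std::string, VRegSummary> Regs = {
      {"%0", {0, {2}}},    // def in bb0, used in bb2 -> candidate
      {"%1", {1, {1}}},    // def and use both in bb1 -> skip
      {"%2", {0, {2, 3}}}, // two uses -> skip
  };
  for (const auto &KV : Regs)
    std::cout << KV.first << ": "
              << (isRematCandidate(KV.second) ? "candidate" : "skip") << '\n';
}
```

The loop that follows then copies the cached pressures and live-in sets for just the regions such candidates touch.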
for (auto Idx : ImpactedRegions) { - NewPressure[Idx] = Pressure[Idx]; - NewLiveIns[Idx] = LiveIns[Idx]; + NewPressure[Idx] = DAG.Pressure[Idx]; + NewLiveIns[Idx] = DAG.LiveIns[Idx]; } - NewRegions = Regions; + NewRegions = DAG.Regions; NewRescheduleRegions.reset(); DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; bool Improved = false; for (auto I : ImpactedRegions) { - if (!RegionsWithMinOcc[I]) + if (!DAG.RegionsWithMinOcc[I]) continue; Improved = false; @@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // TODO: Handle occupancy drop due to AGPR and SGPR. // Check if cause of occupancy drop is due to VGPR usage and not SGPR. - if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) + if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy) break; // The occupancy of this region could have been improved by a previous // iteration's sinking of defs. - if (NewPressure[I].getOccupancy(ST) > MinOccupancy) { + if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { NewRescheduleRegions[I] = true; Improved = true; continue; @@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); // If in the most optimistic scenario, we cannot improve occupancy, then do // not attempt to sink any instructions. - if (OptimisticOccupancy <= MinOccupancy) + if (OptimisticOccupancy <= DAG.MinOccupancy) break; unsigned ImproveOccupancy = 0; @@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // call LiveRangeEdit::allUsesAvailableAt() and // LiveRangeEdit::canRematerializeAt(). TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - Def->getOperand(0).getSubReg(), *Def, *TRI); + Def->getOperand(0).getSubReg(), *Def, *DAG.TRI); MachineInstr *NewMI = &*(--InsertPos); LIS->InsertMachineInstrInMaps(*NewMI); LIS->removeInterval(Reg); @@ -851,11 +1013,11 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Update region boundaries in scheduling region we sinked from since we // may sink an instruction that was at the beginning or end of its region - updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, - /*Removing =*/true); + DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, + /*Removing =*/true); // Update region boundaries in region we sinked to. - updateRegionBoundaries(NewRegions, InsertPos, NewMI); + DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI); LaneBitmask PrevMask = NewLiveIns[I][Reg]; // FIXME: Also update cached pressure for where the def was sinked from. @@ -863,9 +1025,9 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // the reg from all regions as a live-in. for (auto Idx : RematDefToLiveInRegions[Def]) { NewLiveIns[Idx].erase(Reg); - if (InsertPos->getParent() != Regions[Idx].first->getParent()) { + if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { // Def is live-through and not used in this block. - NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI); + NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); } else { // Def is used and rematerialized into this block. 
GCNDownwardRPTracker RPT(*LIS); @@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, SinkedDefs.push_back(Def); ImproveOccupancy = NewPressure[I].getOccupancy(ST); - if (ImproveOccupancy > MinOccupancy) + if (ImproveOccupancy > DAG.MinOccupancy) break; } @@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, for (auto TrackedIdx : RematDefToLiveInRegions[Def]) RematerializableInsts[TrackedIdx].erase(Def); - if (ImproveOccupancy <= MinOccupancy) + if (ImproveOccupancy <= DAG.MinOccupancy) break; NewRescheduleRegions[I] = true; @@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, MachineInstr *OldMI = Entry.second; // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. - BBLiveInMap.erase(OldMI); + DAG.BBLiveInMap.erase(OldMI); // Remove OldMI and update LIS Register Reg = MI->getOperand(0).getReg(); @@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Update live-ins, register pressure, and regions caches. for (auto Idx : ImpactedRegions) { - LiveIns[Idx] = NewLiveIns[Idx]; - Pressure[Idx] = NewPressure[Idx]; - MBBLiveIns.erase(Regions[Idx].first->getParent()); + DAG.LiveIns[Idx] = NewLiveIns[Idx]; + DAG.Pressure[Idx] = NewPressure[Idx]; + DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent()); } - Regions = NewRegions; - RescheduleRegions = NewRescheduleRegions; + DAG.Regions = NewRegions; + DAG.RescheduleRegions = NewRescheduleRegions; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - MFI.increaseOccupancy(MF, ++MinOccupancy); + MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); return true; } // Copied from MachineLICM -bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) { - if (!TII->isTriviallyReMaterializable(MI)) +bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { + if (!DAG.TII->isTriviallyReMaterializable(MI)) return false; for (const MachineOperand &MO : MI.operands()) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index c3db849cf81a..7aadf89e0bf7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -28,8 +28,6 @@ class GCNSubtarget; /// heuristics to determine excess/critical pressure sets. Its goal is to /// maximize kernel occupancy (i.e. maximum number of waves per simd). class GCNMaxOccupancySchedStrategy final : public GenericScheduler { - friend class GCNScheduleDAGMILive; - SUnit *pickNodeBidirectional(bool &IsTopNode); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, @@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { unsigned SGPRPressure, unsigned VGPRPressure); std::vector<unsigned> Pressure; + std::vector<unsigned> MaxPressure; unsigned SGPRExcessLimit; + unsigned VGPRExcessLimit; - unsigned SGPRCriticalLimit; - unsigned VGPRCriticalLimit; unsigned TargetOccupancy; + MachineFunction *MF; + +public: // schedule() have seen a clustered memory operation. Set it to false // before a region scheduling to know if the region had such clusters. bool HasClusteredNodes; @@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { // register pressure for actual scheduling heuristics. 
bool HasExcessPressure; - MachineFunction *MF; + unsigned SGPRCriticalLimit; + + unsigned VGPRCriticalLimit; -public: GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); SUnit *pickNode(bool &IsTopNode) override; void initialize(ScheduleDAGMI *DAG) override; + unsigned getTargetOccupancy() { return TargetOccupancy; } + void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } }; -class GCNScheduleDAGMILive final : public ScheduleDAGMILive { +enum class GCNSchedStageID : unsigned { + InitialSchedule = 0, + UnclusteredReschedule = 1, + ClusteredLowOccupancyReschedule = 2, + PreRARematerialize = 3, + LastStage = PreRARematerialize +}; + +#ifndef NDEBUG +raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); +#endif + +inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) { + assert(Stage != GCNSchedStageID::PreRARematerialize); + Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1); + return Stage; +} + +inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) { + return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1); +} - enum : unsigned { - Collect, - InitialSchedule, - UnclusteredReschedule, - ClusteredLowOccupancyReschedule, - PreRARematerialize, - LastStage = PreRARematerialize - }; +inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) { + return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS); +} + +class GCNScheduleDAGMILive final : public ScheduleDAGMILive { + friend class GCNSchedStage; + friend class InitialScheduleStage; + friend class UnclusteredRescheduleStage; + friend class ClusteredLowOccStage; + friend class PreRARematStage; const GCNSubtarget &ST; @@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Minimal real occupancy recorder for the function. unsigned MinOccupancy; - // Scheduling stage number. - unsigned Stage; - - // Current region index. - size_t RegionIdx; - // Vector of regions recorder for later rescheduling SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> Regions; @@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Region pressure cache. SmallVector<GCNRegPressure, 32> Pressure; + // Temporary basic block live-in cache. + DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns; + + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; + + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; + + // Return current region pressure. + GCNRegPressure getRealRegPressure(unsigned RegionIdx) const; + + // Compute and cache live-ins and pressure for all regions in block. + void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB); + + // Update region boundaries when removing MI or inserting NewMI before MI. + void updateRegionBoundaries( + SmallVectorImpl<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, + bool Removing = false); + + void runSchedStages(); + +public: + GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S); + + void schedule() override; + + void finalizeSchedule() override; +}; + +// GCNSchedStrategy applies multiple scheduling stages to a function. 
+class GCNSchedStage { +protected: + GCNScheduleDAGMILive &DAG; + + GCNMaxOccupancySchedStrategy &S; + + MachineFunction &MF; + + SIMachineFunctionInfo &MFI; + + const GCNSubtarget &ST; + + const GCNSchedStageID StageID; + + // The current block being scheduled. + MachineBasicBlock *CurrentMBB = nullptr; + + // Current region index. + unsigned RegionIdx = 0; + + // Record the original order of instructions before scheduling. + std::vector<MachineInstr *> Unsched; + + // RP before scheduling the current region. + GCNRegPressure PressureBefore; + + // RP after scheduling the current region. + GCNRegPressure PressureAfter; + + GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG); + +public: + // Initialize state for a scheduling stage. Returns false if the current stage + // should be skipped. + virtual bool initGCNSchedStage(); + + // Finalize state after finishing a scheduling pass on the function. + virtual void finalizeGCNSchedStage(); + + // Setup for scheduling a region. Returns false if the current region should + // be skipped. + virtual bool initGCNRegion(); + + // Track whether a new region is also a new MBB. + void setupNewBlock(); + + // Finalize state after scheduling a region. + virtual void finalizeGCNRegion(); + + // Check result of scheduling. + void checkScheduling(); + + // Returns true if scheduling should be reverted. + virtual bool shouldRevertScheduling(unsigned WavesAfter); + + // Returns true if the new schedule may result in more spilling. + bool mayCauseSpilling(unsigned WavesAfter); + + // Attempt to revert scheduling for this region. + void revertScheduling(); + + void advanceRegion() { RegionIdx++; } + + virtual ~GCNSchedStage() = default; +}; + +class InitialScheduleStage : public GCNSchedStage { +public: + void finalizeGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +class UnclusteredRescheduleStage : public GCNSchedStage { +private: + std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + +public: + bool initGCNSchedStage() override; + + void finalizeGCNSchedStage() override; + + bool initGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +// Retry function scheduling if we found resulting occupancy and it is +// lower than used for other scheduling passes. This will give more freedom +// to schedule low register pressure blocks. +class ClusteredLowOccStage : public GCNSchedStage { +public: + bool initGCNSchedStage() override; + + bool initGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +class PreRARematStage : public GCNSchedStage { +private: // Each region at MinOccupancy will have their own list of trivially // rematerializable instructions we can remat to reduce RP. The list maps an // instruction to the position we should remat before, usually the MI using @@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // that has the defined reg as a live-in. DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions; - // Temporary basic block live-in cache.
- DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns; - - DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; - DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; - // Collect all trivially rematerializable VGPR instructions with a single def // and single use outside the defining block into RematerializableInsts. void collectRematerializableInstructions(); @@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { bool sinkTriviallyRematInsts(const GCNSubtarget &ST, const TargetInstrInfo *TII); - // Return current region pressure. - GCNRegPressure getRealRegPressure() const; - - // Compute and cache live-ins and pressure for all regions in block. - void computeBlockPressure(const MachineBasicBlock *MBB); - - // Update region boundaries when removing MI or inserting NewMI before MI. - void updateRegionBoundaries( - SmallVectorImpl<std::pair<MachineBasicBlock::iterator, - MachineBasicBlock::iterator>> &RegionBoundaries, - MachineBasicBlock::iterator MI, MachineInstr *NewMI, - bool Removing = false); - public: - GCNScheduleDAGMILive(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S); + bool initGCNSchedStage() override; - void schedule() override; + bool initGCNRegion() override; - void finalizeSchedule() override; + bool shouldRevertScheduling(unsigned WavesAfter) override; + + PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e093d78b2cc6..d9d7d4efa8c3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -309,6 +309,11 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; } +static bool isVCMPX64(const MCInstrDesc &Desc) { + return (Desc.TSFlags & SIInstrFlags::VOP3) && + Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); +} + void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -326,6 +331,17 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Encoding |= getImplicitOpSelHiEncoding(Opcode); } + // GFX11 v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. + // Documentation requires dst to be encoded as EXEC (0x7E), + // but it looks like the actual value encoded for dst operand + // is ignored by HW. It was decided to define dst as "do not care" + // in td files to allow disassembler accept any dst value. + // However, dst is encoded as EXEC for compatibility with SP3. 
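Per the comment above, the GFX11 fixup just asserts that the dst field is empty and ORs in the EXEC encoding. A standalone model of the bit manipulation, using 0x7E from the documented EXEC encoding and a made-up instruction word:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

uint64_t fixupVCmpxDst(uint64_t Encoding) {
  assert((Encoding & 0xFF) == 0 && "dst field expected to be encoded as 0");
  return Encoding | 0x7E; // write the EXEC encoding into the dst byte
}

int main() {
  uint64_t Enc = 0xD4000000; // hypothetical promoted v_cmpx word, dst = 0
  std::cout << std::hex << fixupVCmpxDst(Enc) << '\n'; // prints d400007e
}
```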
+ if (AMDGPU::isGFX11Plus(STI) && isVCMPX64(Desc)) { + assert((Encoding & 0xFF) == 0); + Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO); + } + for (unsigned i = 0; i < bytes; i++) { OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index e7706fa0ef5c..1ed79add64c9 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -54,8 +54,8 @@ public: MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *IsFast = nullptr) const override; - virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT, - bool LegalOperations) const override { + bool canCombineTruncStore(EVT ValVT, EVT MemVT, + bool LegalOperations) const override { // R600 has "custom" lowering for truncating stores despite not supporting // those instructions. If we allow that custom lowering in the DAG combiner // then all truncates are merged into truncating stores, giving worse code diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 438e8b200ecc..f7d139adc63b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2132,7 +2132,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const { - if (Subtarget->hasUserSGPRInit16Bug()) { + if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { + // Note: user SGPRs are handled by the front-end for graphics shaders // Pad up the used user SGPRs with dead inputs. unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); @@ -2195,7 +2196,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } - assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16); + assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || + Info.getNumPreloadedSGPRs() >= 16); } static void reservePrivateMemoryRegs(const TargetMachine &TM, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d1fecc1afc7f..e0101f53880f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -487,10 +487,10 @@ public: AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - virtual const TargetRegisterClass * - getRegClassFor(MVT VT, bool isDivergent) const override; - virtual bool requiresUniformRegister(MachineFunction &MF, - const Value *V) const override; + const TargetRegisterClass *getRegClassFor(MVT VT, + bool isDivergent) const override; + bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; Align getPrefLoopAlignment(MachineLoop *ML) const override; void allocateHSAUserSGPRs(CCState &CCInfo, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index ffe8dce79816..fccb08f86e6d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -349,7 +349,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { - let AllocationPriority = 9; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; let HasSGPR = 1; @@ -368,7 +368,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence 
"SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. - let AllocationPriority = 9; + let AllocationPriority = 0; let GeneratePressureSet = 0; let HasSGPR = 1; } @@ -528,14 +528,14 @@ def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; let HasVGPR = 1 in { def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_LO16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_HI16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } @@ -544,7 +544,7 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -588,7 +588,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -653,7 +653,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2 SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, @@ -663,42 +663,42 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; let HasSGPR = 1; } @@ -712,7 
+712,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; - let AllocationPriority = 11; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -725,14 +725,14 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -750,7 +750,7 @@ def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, let HasSGPR = 1; } -multiclass SRegClass<int numRegs, int priority, +multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, @@ -760,7 +760,7 @@ multiclass SRegClass<int numRegs, int priority, defvar sgprName = !strconcat("SGPR_", suffix); defvar ttmpName = !strconcat("TTMP_", suffix); - let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in { + let AllocationPriority = !sub(numRegs, 1), CopyCost = copyCost, HasSGPR = 1 in { def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> { } @@ -781,14 +781,14 @@ multiclass SRegClass<int numRegs, int priority, } } -defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; -defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; -defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; -defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; -defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; -defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; +defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; +defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; +defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { @@ -803,7 +803,7 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = numRegs; + let AllocationPriority = !sub(numRegs, 1); let Weight = numRegs; } diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index d489a089ac78..5973d32c91d6 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ 
-718,7 +718,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); + dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), (ins))); let Asm64 = "$sdst, $src0_modifiers, $src1"; diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 9acd49292268..f81495985405 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -139,6 +139,9 @@ public: ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &CStream) const override; + uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const override; + private: DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -739,6 +742,33 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, } } +uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes, + uint64_t Address) const { + // In Arm state, instructions are always 4 bytes wide, so there's no + // point in skipping any smaller number of bytes if an instruction + // can't be decoded. + if (!STI.getFeatureBits()[ARM::ModeThumb]) + return 4; + + // In a Thumb instruction stream, a halfword is a standalone 2-byte + // instruction if and only if its value is less than 0xE800. + // Otherwise, it's the first halfword of a 4-byte instruction. + // + // So, if we can see the upcoming halfword, we can judge on that + // basis, and maybe skip a whole 4-byte instruction that we don't + // know how to decode, without accidentally trying to interpret its + // second half as something else. + // + // If we don't have the instruction data available, we just have to + // recommend skipping the minimum sensible distance, which is 2 + // bytes. + if (Bytes.size() < 2) + return 2; + + uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0]; + return Insn16 < 0xE800 ? 2 : 4; +} + DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp new file mode 100644 index 000000000000..1985bee8e0ae --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -0,0 +1,324 @@ +//===- DXILOpBuilder.cpp - Helper class for building DXIL op functions ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains a class to help build DXIL op functions.
+//===----------------------------------------------------------------------===// + +#include "DXILOpBuilder.h" +#include "DXILConstants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/DXILOperationCommon.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; +using namespace llvm::DXIL; + +constexpr StringLiteral DXILOpNamePrefix = "dx.op."; + +namespace { + +enum OverloadKind : uint16_t { + VOID = 1, + HALF = 1 << 1, + FLOAT = 1 << 2, + DOUBLE = 1 << 3, + I1 = 1 << 4, + I8 = 1 << 5, + I16 = 1 << 6, + I32 = 1 << 7, + I64 = 1 << 8, + UserDefineType = 1 << 9, + ObjectType = 1 << 10, +}; + +} // namespace + +static const char *getOverloadTypeName(OverloadKind Kind) { + switch (Kind) { + case OverloadKind::HALF: + return "f16"; + case OverloadKind::FLOAT: + return "f32"; + case OverloadKind::DOUBLE: + return "f64"; + case OverloadKind::I1: + return "i1"; + case OverloadKind::I8: + return "i8"; + case OverloadKind::I16: + return "i16"; + case OverloadKind::I32: + return "i32"; + case OverloadKind::I64: + return "i64"; + case OverloadKind::VOID: + case OverloadKind::ObjectType: + case OverloadKind::UserDefineType: + break; + } + llvm_unreachable("invalid overload type for name"); + return "void"; +} + +static OverloadKind getOverloadKind(Type *Ty) { + Type::TypeID T = Ty->getTypeID(); + switch (T) { + case Type::VoidTyID: + return OverloadKind::VOID; + case Type::HalfTyID: + return OverloadKind::HALF; + case Type::FloatTyID: + return OverloadKind::FLOAT; + case Type::DoubleTyID: + return OverloadKind::DOUBLE; + case Type::IntegerTyID: { + IntegerType *ITy = cast<IntegerType>(Ty); + unsigned Bits = ITy->getBitWidth(); + switch (Bits) { + case 1: + return OverloadKind::I1; + case 8: + return OverloadKind::I8; + case 16: + return OverloadKind::I16; + case 32: + return OverloadKind::I32; + case 64: + return OverloadKind::I64; + default: + llvm_unreachable("invalid overload type"); + return OverloadKind::VOID; + } + } + case Type::PointerTyID: + return OverloadKind::UserDefineType; + case Type::StructTyID: + return OverloadKind::ObjectType; + default: + llvm_unreachable("invalid overload type"); + return OverloadKind::VOID; + } +} + +static std::string getTypeName(OverloadKind Kind, Type *Ty) { + if (Kind < OverloadKind::UserDefineType) { + return getOverloadTypeName(Kind); + } else if (Kind == OverloadKind::UserDefineType) { + StructType *ST = cast<StructType>(Ty); + return ST->getStructName().str(); + } else if (Kind == OverloadKind::ObjectType) { + StructType *ST = cast<StructType>(Ty); + return ST->getStructName().str(); + } else { + std::string Str; + raw_string_ostream OS(Str); + Ty->print(OS); + return OS.str(); + } +} + +// Static properties. +struct OpCodeProperty { + DXIL::OpCode OpCode; + // Offset in DXILOpCodeNameTable. + unsigned OpCodeNameOffset; + DXIL::OpCodeClass OpCodeClass; + // Offset in DXILOpCodeClassNameTable. + unsigned OpCodeClassNameOffset; + uint16_t OverloadTys; + llvm::Attribute::AttrKind FuncAttr; + int OverloadParamIndex; // parameter index which controls the overload. + // When < 0, there should be only 1 overload type. + unsigned NumOfParameters; // Number of parameters, including the return value. + unsigned ParameterTableOffset; // Offset in ParameterTable. +}; + +// Include getOpCodeClassName, getOpCodeProperty, getOpCodeName and +// getOpCodeParameterKind, which are generated by tableGen.
+#define DXIL_OP_OPERATION_TABLE +#include "DXILOperation.inc" +#undef DXIL_OP_OPERATION_TABLE + +static std::string constructOverloadName(OverloadKind Kind, Type *Ty, + const OpCodeProperty &Prop) { + if (Kind == OverloadKind::VOID) { + return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop)).str(); + } + return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop) + "." + + getTypeName(Kind, Ty)) + .str(); +} + +static std::string constructOverloadTypeName(OverloadKind Kind, + StringRef TypeName) { + if (Kind == OverloadKind::VOID) + return TypeName.str(); + + assert(Kind < OverloadKind::UserDefineType && "invalid overload kind"); + return (Twine(TypeName) + getOverloadTypeName(Kind)).str(); +} + +static StructType *getOrCreateStructType(StringRef Name, + ArrayRef<Type *> EltTys, + LLVMContext &Ctx) { + StructType *ST = StructType::getTypeByName(Ctx, Name); + if (ST) + return ST; + + return StructType::create(Ctx, EltTys, Name); +} + +static StructType *getResRetType(Type *OverloadTy, LLVMContext &Ctx) { + OverloadKind Kind = getOverloadKind(OverloadTy); + std::string TypeName = constructOverloadTypeName(Kind, "dx.types.ResRet."); + Type *FieldTypes[5] = {OverloadTy, OverloadTy, OverloadTy, OverloadTy, + Type::getInt32Ty(Ctx)}; + return getOrCreateStructType(TypeName, FieldTypes, Ctx); +} + +static StructType *getHandleType(LLVMContext &Ctx) { + return getOrCreateStructType("dx.types.Handle", Type::getInt8PtrTy(Ctx), Ctx); +} + +static Type *getTypeFromParameterKind(ParameterKind Kind, Type *OverloadTy) { + auto &Ctx = OverloadTy->getContext(); + switch (Kind) { + case ParameterKind::VOID: + return Type::getVoidTy(Ctx); + case ParameterKind::HALF: + return Type::getHalfTy(Ctx); + case ParameterKind::FLOAT: + return Type::getFloatTy(Ctx); + case ParameterKind::DOUBLE: + return Type::getDoubleTy(Ctx); + case ParameterKind::I1: + return Type::getInt1Ty(Ctx); + case ParameterKind::I8: + return Type::getInt8Ty(Ctx); + case ParameterKind::I16: + return Type::getInt16Ty(Ctx); + case ParameterKind::I32: + return Type::getInt32Ty(Ctx); + case ParameterKind::I64: + return Type::getInt64Ty(Ctx); + case ParameterKind::OVERLOAD: + return OverloadTy; + case ParameterKind::RESOURCE_RET: + return getResRetType(OverloadTy, Ctx); + case ParameterKind::DXIL_HANDLE: + return getHandleType(Ctx); + default: + break; + } + llvm_unreachable("Invalid parameter kind"); + return nullptr; +} + +static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop, + Type *OverloadTy) { + SmallVector<Type *> ArgTys; + + auto ParamKinds = getOpCodeParameterKind(*Prop); + + for (unsigned I = 0; I < Prop->NumOfParameters; ++I) { + ParameterKind Kind = ParamKinds[I]; + ArgTys.emplace_back(getTypeFromParameterKind(Kind, OverloadTy)); + } + return FunctionType::get( + ArgTys[0], ArrayRef<Type *>(&ArgTys[1], ArgTys.size() - 1), false); +} + +static FunctionCallee getOrCreateDXILOpFunction(DXIL::OpCode DXILOp, + Type *OverloadTy, Module &M) { + const OpCodeProperty *Prop = getOpCodeProperty(DXILOp); + + OverloadKind Kind = getOverloadKind(OverloadTy); + // FIXME: find the issue and report the error in clang instead of checking + // it in the backend. + if ((Prop->OverloadTys & (uint16_t)Kind) == 0) { + llvm_unreachable("invalid overload"); + } + + std::string FnName = constructOverloadName(Kind, OverloadTy, *Prop); + // Dependent on name to dedup.
+ if (auto *Fn = M.getFunction(FnName)) + return FunctionCallee(Fn); + + FunctionType *DXILOpFT = getDXILOpFunctionType(Prop, OverloadTy); + return M.getOrInsertFunction(FnName, DXILOpFT); +} + +namespace llvm { +namespace DXIL { + +CallInst *DXILOpBuilder::createDXILOpCall(DXIL::OpCode OpCode, Type *OverloadTy, + llvm::iterator_range<Use *> Args) { + auto Fn = getOrCreateDXILOpFunction(OpCode, OverloadTy, M); + SmallVector<Value *> FullArgs; + FullArgs.emplace_back(B.getInt32((int32_t)OpCode)); + FullArgs.append(Args.begin(), Args.end()); + return B.CreateCall(Fn, FullArgs); +} + +Type *DXILOpBuilder::getOverloadTy(DXIL::OpCode OpCode, FunctionType *FT, + bool NoOpCodeParam) { + + const OpCodeProperty *Prop = getOpCodeProperty(OpCode); + if (Prop->OverloadParamIndex < 0) { + auto &Ctx = FT->getContext(); + // When there is only 1 overload type, just return it. + switch (Prop->OverloadTys) { + case OverloadKind::VOID: + return Type::getVoidTy(Ctx); + case OverloadKind::HALF: + return Type::getHalfTy(Ctx); + case OverloadKind::FLOAT: + return Type::getFloatTy(Ctx); + case OverloadKind::DOUBLE: + return Type::getDoubleTy(Ctx); + case OverloadKind::I1: + return Type::getInt1Ty(Ctx); + case OverloadKind::I8: + return Type::getInt8Ty(Ctx); + case OverloadKind::I16: + return Type::getInt16Ty(Ctx); + case OverloadKind::I32: + return Type::getInt32Ty(Ctx); + case OverloadKind::I64: + return Type::getInt64Ty(Ctx); + default: + llvm_unreachable("invalid overload type"); + return nullptr; + } + } + + // Prop->OverloadParamIndex is 0, overload type is FT->getReturnType(). + Type *OverloadType = FT->getReturnType(); + if (Prop->OverloadParamIndex != 0) { + // Skip Return Type and Type for DXIL opcode. + const unsigned SkippedParam = NoOpCodeParam ? 2 : 1; + OverloadType = FT->getParamType(Prop->OverloadParamIndex - SkippedParam); + } + + auto ParamKinds = getOpCodeParameterKind(*Prop); + auto Kind = ParamKinds[Prop->OverloadParamIndex]; + // For ResRet and CBufferRet, OverloadTy is a field of the StructType. + if (Kind == ParameterKind::CBUFFER_RET || + Kind == ParameterKind::RESOURCE_RET) { + auto *ST = cast<StructType>(OverloadType); + OverloadType = ST->getElementType(0); + } + return OverloadType; +} + +const char *DXILOpBuilder::getOpCodeName(DXIL::OpCode DXILOp) { + return ::getOpCodeName(DXILOp); +} +} // namespace DXIL +} // namespace llvm diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h new file mode 100644 index 000000000000..0cc39e845b71 --- /dev/null +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h @@ -0,0 +1,46 @@ +//===- DXILOpBuilder.h - Helper class for building DXIL op functions ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains a class to help build DXIL op functions.
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_DIRECTX_DXILOPBUILDER_H +#define LLVM_LIB_TARGET_DIRECTX_DXILOPBUILDER_H + +#include "DXILConstants.h" +#include "llvm/ADT/iterator_range.h" + +namespace llvm { +class Module; +class IRBuilderBase; +class CallInst; +class Value; +class Type; +class FunctionType; +class Use; + +namespace DXIL { + +class DXILOpBuilder { +public: + DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {} + CallInst *createDXILOpCall(DXIL::OpCode OpCode, Type *OverloadTy, + llvm::iterator_range<Use *> Args); + Type *getOverloadTy(DXIL::OpCode OpCode, FunctionType *FT, + bool NoOpCodeParam); + static const char *getOpCodeName(DXIL::OpCode DXILOp); + +private: + Module &M; + IRBuilderBase &B; +}; + +} // namespace DXIL +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 11b89e4ec890..20c08f47745d 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "DXILConstants.h" +#include "DXILOpBuilder.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Passes.h" @@ -28,168 +29,12 @@ using namespace llvm; using namespace llvm::DXIL; -constexpr StringLiteral DXILOpNamePrefix = "dx.op."; - -enum OverloadKind : uint16_t { - VOID = 1, - HALF = 1 << 1, - FLOAT = 1 << 2, - DOUBLE = 1 << 3, - I1 = 1 << 4, - I8 = 1 << 5, - I16 = 1 << 6, - I32 = 1 << 7, - I64 = 1 << 8, - UserDefineType = 1 << 9, - ObjectType = 1 << 10, -}; - -static const char *getOverloadTypeName(OverloadKind Kind) { - switch (Kind) { - case OverloadKind::HALF: - return "f16"; - case OverloadKind::FLOAT: - return "f32"; - case OverloadKind::DOUBLE: - return "f64"; - case OverloadKind::I1: - return "i1"; - case OverloadKind::I8: - return "i8"; - case OverloadKind::I16: - return "i16"; - case OverloadKind::I32: - return "i32"; - case OverloadKind::I64: - return "i64"; - case OverloadKind::VOID: - case OverloadKind::ObjectType: - case OverloadKind::UserDefineType: - break; - } - llvm_unreachable("invalid overload type for name"); - return "void"; -} - -static OverloadKind getOverloadKind(Type *Ty) { - Type::TypeID T = Ty->getTypeID(); - switch (T) { - case Type::VoidTyID: - return OverloadKind::VOID; - case Type::HalfTyID: - return OverloadKind::HALF; - case Type::FloatTyID: - return OverloadKind::FLOAT; - case Type::DoubleTyID: - return OverloadKind::DOUBLE; - case Type::IntegerTyID: { - IntegerType *ITy = cast<IntegerType>(Ty); - unsigned Bits = ITy->getBitWidth(); - switch (Bits) { - case 1: - return OverloadKind::I1; - case 8: - return OverloadKind::I8; - case 16: - return OverloadKind::I16; - case 32: - return OverloadKind::I32; - case 64: - return OverloadKind::I64; - default: - llvm_unreachable("invalid overload type"); - return OverloadKind::VOID; - } - } - case Type::PointerTyID: - return OverloadKind::UserDefineType; - case Type::StructTyID: - return OverloadKind::ObjectType; - default: - llvm_unreachable("invalid overload type"); - return OverloadKind::VOID; - } -} - -static std::string getTypeName(OverloadKind Kind, Type *Ty) { - if (Kind < OverloadKind::UserDefineType) { - return getOverloadTypeName(Kind); - } else if (Kind == OverloadKind::UserDefineType) { - StructType *ST = cast<StructType>(Ty); - return ST->getStructName().str(); - } else if (Kind == 
OverloadKind::ObjectType) { - StructType *ST = cast<StructType>(Ty); - return ST->getStructName().str(); - } else { - std::string Str; - raw_string_ostream OS(Str); - Ty->print(OS); - return OS.str(); - } -} - -// Static properties. -struct OpCodeProperty { - DXIL::OpCode OpCode; - // Offset in DXILOpCodeNameTable. - unsigned OpCodeNameOffset; - DXIL::OpCodeClass OpCodeClass; - // Offset in DXILOpCodeClassNameTable. - unsigned OpCodeClassNameOffset; - uint16_t OverloadTys; - llvm::Attribute::AttrKind FuncAttr; -}; - -// Include getOpCodeClassName getOpCodeProperty and getOpCodeName which -// generated by tableGen. -#define DXIL_OP_OPERATION_TABLE -#include "DXILOperation.inc" -#undef DXIL_OP_OPERATION_TABLE - -static std::string constructOverloadName(OverloadKind Kind, Type *Ty, - const OpCodeProperty &Prop) { - if (Kind == OverloadKind::VOID) { - return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop)).str(); - } - return (Twine(DXILOpNamePrefix) + getOpCodeClassName(Prop) + "." + - getTypeName(Kind, Ty)) - .str(); -} - -static FunctionCallee createDXILOpFunction(DXIL::OpCode DXILOp, Function &F, - Module &M) { - const OpCodeProperty *Prop = getOpCodeProperty(DXILOp); - - // Get return type as overload type for DXILOp. - // Only simple mapping case here, so return type is good enough. - Type *OverloadTy = F.getReturnType(); - - OverloadKind Kind = getOverloadKind(OverloadTy); - // FIXME: find the issue and report error in clang instead of check it in - // backend. - if ((Prop->OverloadTys & (uint16_t)Kind) == 0) { - llvm_unreachable("invalid overload"); - } - - std::string FnName = constructOverloadName(Kind, OverloadTy, *Prop); - assert(!M.getFunction(FnName) && "Function already exists"); - - auto &Ctx = M.getContext(); - Type *OpCodeTy = Type::getInt32Ty(Ctx); - - SmallVector<Type *> ArgTypes; - // DXIL has i32 opcode as first arg. 
- ArgTypes.emplace_back(OpCodeTy); - FunctionType *FT = F.getFunctionType(); - ArgTypes.append(FT->param_begin(), FT->param_end()); - FunctionType *DXILOpFT = FunctionType::get(OverloadTy, ArgTypes, false); - return M.getOrInsertFunction(FnName, DXILOpFT); -} - static void lowerIntrinsic(DXIL::OpCode DXILOp, Function &F, Module &M) { - auto DXILOpFn = createDXILOpFunction(DXILOp, F, M); IRBuilder<> B(M.getContext()); Value *DXILOpArg = B.getInt32(static_cast<unsigned>(DXILOp)); + DXILOpBuilder DXILB(M, B); + Type *OverloadTy = + DXILB.getOverloadTy(DXILOp, F.getFunctionType(), /*NoOpCodeParam*/ true); for (User *U : make_early_inc_range(F.users())) { CallInst *CI = dyn_cast<CallInst>(U); if (!CI) @@ -199,8 +44,8 @@ static void lowerIntrinsic(DXIL::OpCode DXILOp, Function &F, Module &M) { Args.emplace_back(DXILOpArg); Args.append(CI->arg_begin(), CI->arg_end()); B.SetInsertPoint(CI); - CallInst *DXILCI = B.CreateCall(DXILOpFn, Args); - LLVM_DEBUG(DXILCI->setName(getOpCodeName(DXILOp))); + CallInst *DXILCI = DXILB.createDXILOpCall(DXILOp, OverloadTy, CI->args()); + CI->replaceAllUsesWith(DXILCI); CI->eraseFromParent(); } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp index e2a41515de38..a873662f730d 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp @@ -260,9 +260,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F, return LU->getOperandNo() > RU->getOperandNo(); }); - if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) { - return L.second < R.second; - })) + if (llvm::is_sorted(List, llvm::less_second())) // Order is already correct. return; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h index 1e50385a7b4b..505c90f66f43 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h @@ -95,7 +95,6 @@ public: void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl); void SelectStore(SDNode *N); void SelectSHL(SDNode *N); - void SelectZeroExtend(SDNode *N); void SelectIntrinsicWChain(SDNode *N); void SelectIntrinsicWOChain(SDNode *N); void SelectConstant(SDNode *N); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 9561dfe8a35d..1dc6a4cb9c89 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -107,9 +107,6 @@ class HexagonTargetLowering : public TargetLowering { const HexagonTargetMachine &HTM; const HexagonSubtarget &Subtarget; - bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize) - const; - public: explicit HexagonTargetLowering(const TargetMachine &TM, const HexagonSubtarget &ST); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index c8e6276aa4de..b8671f26d124 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2253,15 +2253,6 @@ bool HexagonInstrInfo::isDuplexPair(const MachineInstr &MIa, return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG)); } -bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const { - if (MI.mayLoadOrStore() || MI.isCompare()) - return true; - - // Multiply - unsigned SchedClass = MI.getDesc().getSchedClass(); - return is_TC4x(SchedClass) || is_TC3x(SchedClass); -} - 
bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const { return (Opcode == Hexagon::ENDLOOP0 || Opcode == Hexagon::ENDLOOP1); @@ -2417,43 +2408,6 @@ bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr &MI, } } -bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI, - const MachineInstr &ESMI) const { - bool isLate = isLateResultInstr(LRMI); - bool isEarly = isEarlySourceInstr(ESMI); - - LLVM_DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- ")); - LLVM_DEBUG(LRMI.dump()); - LLVM_DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- ")); - LLVM_DEBUG(ESMI.dump()); - - if (isLate && isEarly) { - LLVM_DEBUG(dbgs() << "++Is Late Result feeding Early Source\n"); - return true; - } - - return false; -} - -bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case TargetOpcode::EXTRACT_SUBREG: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::COPY: - case TargetOpcode::INLINEASM: - case TargetOpcode::PHI: - return false; - default: - break; - } - - unsigned SchedClass = MI.getDesc().getSchedClass(); - return !is_TC1(SchedClass); -} - bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr &MI) const { // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply // resource, but all operands can be received late like an ALU instruction. diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 2af09c857d86..703a894132bb 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -363,7 +363,6 @@ public: bool isDotCurInst(const MachineInstr &MI) const; bool isDotNewInst(const MachineInstr &MI) const; bool isDuplexPair(const MachineInstr &MIa, const MachineInstr &MIb) const; - bool isEarlySourceInstr(const MachineInstr &MI) const; bool isEndLoopN(unsigned Opcode) const; bool isExpr(unsigned OpType) const; bool isExtendable(const MachineInstr &MI) const; @@ -375,9 +374,6 @@ public: bool isIndirectL4Return(const MachineInstr &MI) const; bool isJumpR(const MachineInstr &MI) const; bool isJumpWithinBranchRange(const MachineInstr &MI, unsigned offset) const; - bool isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI, - const MachineInstr &ESMI) const; - bool isLateResultInstr(const MachineInstr &MI) const; bool isLateSourceInstr(const MachineInstr &MI) const; bool isLoopN(const MachineInstr &MI) const; bool isMemOp(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index d11f5a9080a0..9793c7bc3532 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -248,7 +248,7 @@ public: addExpr(Inst, getImm()); } }; -} // end anonymous namespace +} // end namespace #define GET_REGISTER_MATCHER #define GET_SUBTARGET_FEATURE_NAME diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index 215d061f11f2..beb757c78596 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -39,7 +39,7 @@ public: ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &CStream) const override; }; -} // end anonymous namespace +} // end namespace static 
MCDisassembler *createLoongArchDisassembler(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h index caa7bd31e28b..e6c9c24dd1b2 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.h +++ b/llvm/lib/Target/LoongArch/LoongArch.h @@ -33,6 +33,6 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, const AsmPrinter &AP); FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM); -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCH_H diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index 014b666de711..72d8e006a0bb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -52,5 +52,5 @@ private: const DebugLoc &DL, Register DestReg, Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h index 7ad329a64424..8c9357d75979 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -55,6 +55,6 @@ public: #include "LoongArchGenDAGISel.inc" }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHISELDAGTODAG_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 279550482675..141f1fd3a55d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -45,7 +45,7 @@ enum NodeType : unsigned { BSTRPICK, }; -} // namespace LoongArchISD +} // end namespace LoongArchISD class LoongArchTargetLowering : public TargetLowering { const LoongArchSubtarget &Subtarget; diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h index 02c9156e2b87..cca130c3bc3a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -45,6 +45,6 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo { Register getFrameRegister(const MachineFunction &MF) const override; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHREGISTERINFO_H diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 95c2c676cc3c..fbe7a176b371 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -84,6 +84,6 @@ public: unsigned getGRLen() const { return GRLen; } LoongArchABI::ABI getTargetABI() const { return TargetABI; } }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHSUBTARGET_H diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 2d08d5c674bc..7ba5848e0997 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -103,7 +103,7 @@ public: void addIRPasses() override; bool addInstSelector() override; }; -} // namespace +} // end namespace TargetPassConfig * LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git 
a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 77bbfb095747..a5f0b816c972 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -58,6 +58,6 @@ public: std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHASMBACKEND_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp index f0c985883125..de2ba2833414 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp @@ -35,6 +35,6 @@ ABI getTargetABI(StringRef ABIName) { // FIXME: other register? MCRegister getBPReg() { return LoongArch::R31; } -} // namespace LoongArchABI +} // end namespace LoongArchABI -} // namespace llvm +} // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h index e26f22de0cbc..fee247a0c02c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h @@ -37,8 +37,8 @@ ABI getTargetABI(StringRef ABIName); // Returns the register used to hold the stack pointer after realignment. MCRegister getBPReg(); -} // namespace LoongArchABI +} // end namespace LoongArchABI -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHBASEINFO_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 95e1314f363a..1850b0d8a756 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -33,7 +33,7 @@ protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} // namespace +} // end namespace LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h index 727fc6a3e1f3..0cbb3d73cd03 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.h @@ -44,6 +44,6 @@ private: void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHINSTPRINTER_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h index 1cf8a2fdf8aa..ed1abbf46153 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h @@ -25,6 +25,6 @@ public: explicit LoongArchMCAsmInfo(const Triple &TargetTriple); }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp 
b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 9c6a4f39b9ea..01a370a90403 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -69,7 +69,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; }; -} // end anonymous namespace +} // end namespace unsigned LoongArchMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index e50761ab1e27..8d71235f6a81 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -95,7 +95,7 @@ public: } }; -} // end anonymous namespace +} // end namespace static MCInstrAnalysis *createLoongArchInstrAnalysis(const MCInstrInfo *Info) { return new LoongArchMCInstrAnalysis(Info); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h index a606ccdbc47c..ab35a0096c8a 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -38,7 +38,7 @@ MCAsmBackend *createLoongArchAsmBackend(const Target &T, std::unique_ptr<MCObjectTargetWriter> createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); -} // namespace llvm +} // end namespace llvm // Defines symbolic names for LoongArch registers. #define GET_REGINFO_ENUM diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h index 945aa91e40c0..be1b425894de 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h @@ -24,7 +24,7 @@ using InstSeq = SmallVector<Inst, 4>; // Helper to generate an instruction sequence that will materialise the given // immediate value into a register. 
InstSeq generateInstSeq(int64_t Val); -} // namespace LoongArchMatInt -} // namespace llvm +} // end namespace LoongArchMatInt +} // end namespace llvm #endif diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h index 6fc13d52c065..b24cf879512c 100644 --- a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h +++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.h @@ -16,6 +16,6 @@ class Target; Target &getTheLoongArch32Target(); Target &getTheLoongArch64Target(); -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_TARGETINFO_LOONGARCHTARGETINFO_H diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index cb6d53ec0a12..5dc2bf07ddd5 100644 --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -31,8 +31,8 @@ public: : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, /*EnableOptSize*/ false, /*EnableMinSize*/ false) {} - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool MipsPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index b700a9ede39b..a19253da440e 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -81,6 +82,20 @@ bool RISCVCodeGenPrepare::optimizeZExt(ZExtInst *ZExt) { return true; } + // Convert (zext (abs(i32 X, i1 1))) -> (sext (abs(i32 X, i1 1))). If abs of + // INT_MIN is poison, the sign bit is zero. + using namespace PatternMatch; + if (match(Src, m_Intrinsic<Intrinsic::abs>(m_Value(), m_One()))) { + auto *SExt = new SExtInst(Src, ZExt->getType(), "", ZExt); + SExt->takeName(ZExt); + SExt->setDebugLoc(ZExt->getDebugLoc()); + + ZExt->replaceAllUsesWith(SExt); + ZExt->eraseFromParent(); + ++NumZExtToSExt; + return true; + } + return false; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1702546b58a6..baa19e81e436 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1313,6 +1313,25 @@ bool RISCVTargetLowering::shouldSinkOperands( return true; } +bool RISCVTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { + unsigned Opc = VecOp.getOpcode(); + + // Assume target opcodes can't be scalarized. + // TODO - do we have any exceptions? + if (Opc >= ISD::BUILTIN_OP_END) + return false; + + // If the vector op is not supported, try to convert to scalar. + EVT VecVT = VecOp.getValueType(); + if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) + return true; + + // If the vector op is supported, but the scalar op is not, the transform may + // not be worthwhile. 
+ EVT ScalarVT = VecVT.getScalarType(); + return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); +} + bool RISCVTargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // In order to maximise the opportunity for common subexpression elimination, @@ -1387,18 +1406,28 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, } } - // Convert X > -1 to X >= 0. - if (CC == ISD::SETGT && isAllOnesConstant(RHS)) { - RHS = DAG.getConstant(0, DL, RHS.getValueType()); - CC = ISD::SETGE; - return; - } - // Convert X < 1 to 0 >= X. - if (CC == ISD::SETLT && isOneConstant(RHS)) { - RHS = LHS; - LHS = DAG.getConstant(0, DL, RHS.getValueType()); - CC = ISD::SETGE; - return; + if (auto *RHSC = dyn_cast<ConstantSDNode>(RHS)) { + int64_t C = RHSC->getSExtValue(); + switch (CC) { + default: break; + case ISD::SETGT: + // Convert X > -1 to X >= 0. + if (C == -1) { + RHS = DAG.getConstant(0, DL, RHS.getValueType()); + CC = ISD::SETGE; + return; + } + break; + case ISD::SETLT: + // Convert X < 1 to 0 >= X. + if (C == 1) { + RHS = LHS; + LHS = DAG.getConstant(0, DL, RHS.getValueType()); + CC = ISD::SETGE; + return; + } + break; + } } switch (CC) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5e15176de59c..6ecf8b8324d4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -376,6 +376,7 @@ public: SelectionDAG &DAG) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; + bool shouldScalarizeBinop(SDValue VecOp) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 4aa9ded5b3a2..beb49f5f6249 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -134,14 +134,13 @@ public: getSerializableDirectMachineOperandTargetFlags() const override; // Return true if the function can safely be outlined from. - virtual bool - isFunctionSafeToOutlineFrom(MachineFunction &MF, - bool OutlineFromLinkOnceODRs) const override; + bool isFunctionSafeToOutlineFrom(MachineFunction &MF, + bool OutlineFromLinkOnceODRs) const override; // Return true if MBB is safe to outline from, and return any target-specific // information in Flags. - virtual bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, - unsigned &Flags) const override; + bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; @@ -150,17 +149,15 @@ public: std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; // Return if/how a given MachineInstr should be outlined. - virtual outliner::InstrType - getOutliningType(MachineBasicBlock::iterator &MBBI, - unsigned Flags) const override; + outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MBBI, + unsigned Flags) const override; // Insert a custom frame for outlined functions. - virtual void - buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, - const outliner::OutlinedFunction &OF) const override; + void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, + const outliner::OutlinedFunction &OF) const override; // Insert a call to an outlined function into a given basic block.
- virtual MachineBasicBlock::iterator + MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index d204c85d6179..cd1da4360002 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -696,52 +696,36 @@ def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), //===----------------------------------------------------------------------===// let EmitPriority = 0 in { -let Predicates = [HasStdExtC, HasStdExtD] in -def : InstAlias<"c.fld $rd, (${rs1})", (C_FLD FPR64C:$rd, GPRC:$rs1, 0)>; - +let Predicates = [HasStdExtC] in { def : InstAlias<"c.lw $rd, (${rs1})", (C_LW GPRC:$rd, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in -def : InstAlias<"c.flw $rd, (${rs1})", (C_FLW FPR32C:$rd, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, IsRV64] in -def : InstAlias<"c.ld $rd, (${rs1})", (C_LD GPRC:$rd, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, HasStdExtD] in -def : InstAlias<"c.fsd $rs2, (${rs1})", (C_FSD FPR64C:$rs2, GPRC:$rs1, 0)>; - def : InstAlias<"c.sw $rs2, (${rs1})", (C_SW GPRC:$rs2, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in -def : InstAlias<"c.fsw $rs2, (${rs1})", (C_FSW FPR32C:$rs2, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, IsRV64] in -def : InstAlias<"c.sd $rs2, (${rs1})", (C_SD GPRC:$rs2, GPRC:$rs1, 0)>; - -let Predicates = [HasStdExtC, HasStdExtD] in -def : InstAlias<"c.fldsp $rd, (${rs1})", (C_FLDSP FPR64C:$rd, SP:$rs1, 0)>; - def : InstAlias<"c.lwsp $rd, (${rs1})", (C_LWSP GPRC:$rd, SP:$rs1, 0)>; +def : InstAlias<"c.swsp $rs2, (${rs1})", (C_SWSP GPRC:$rs2, SP:$rs1, 0)>; +} -let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in -def : InstAlias<"c.flwsp $rd, (${rs1})", (C_FLWSP FPR32C:$rd, SP:$rs1, 0)>; - -let Predicates = [HasStdExtC, IsRV64] in +let Predicates = [HasStdExtC, IsRV64] in { +def : InstAlias<"c.ld $rd, (${rs1})", (C_LD GPRC:$rd, GPRC:$rs1, 0)>; +def : InstAlias<"c.sd $rs2, (${rs1})", (C_SD GPRC:$rs2, GPRC:$rs1, 0)>; def : InstAlias<"c.ldsp $rd, (${rs1})", (C_LDSP GPRC:$rd, SP:$rs1, 0)>; +def : InstAlias<"c.sdsp $rs2, (${rs1})", (C_SDSP GPRC:$rs2, SP:$rs1, 0)>; +} -let Predicates = [HasStdExtC, HasStdExtD] in -def : InstAlias<"c.fsdsp $rs2, (${rs1})", (C_FSDSP FPR64C:$rs2, SP:$rs1, 0)>; - -def : InstAlias<"c.swsp $rs2, (${rs1})", (C_SWSP GPRC:$rs2, SP:$rs1, 0)>; - -let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in +let Predicates = [HasStdExtC, HasStdExtF, IsRV32] in { +def : InstAlias<"c.flw $rd, (${rs1})", (C_FLW FPR32C:$rd, GPRC:$rs1, 0)>; +def : InstAlias<"c.fsw $rs2, (${rs1})", (C_FSW FPR32C:$rs2, GPRC:$rs1, 0)>; +def : InstAlias<"c.flwsp $rd, (${rs1})", (C_FLWSP FPR32C:$rd, SP:$rs1, 0)>; def : InstAlias<"c.fswsp $rs2, (${rs1})", (C_FSWSP FPR32C:$rs2, SP:$rs1, 0)>; +} -let Predicates = [HasStdExtC, IsRV64] in -def : InstAlias<"c.sdsp $rs2, (${rs1})", (C_SDSP GPRC:$rs2, SP:$rs1, 0)>; +let Predicates = [HasStdExtC, HasStdExtD] in { +def : InstAlias<"c.fld $rd, (${rs1})", (C_FLD FPR64C:$rd, GPRC:$rs1, 0)>; +def : InstAlias<"c.fsd $rs2, (${rs1})", (C_FSD FPR64C:$rs2, GPRC:$rs1, 0)>; +def : InstAlias<"c.fldsp $rd, (${rs1})", (C_FLDSP FPR64C:$rd, SP:$rs1, 0)>; +def : InstAlias<"c.fsdsp $rs2, (${rs1})", (C_FSDSP FPR64C:$rs2, SP:$rs1, 0)>; } +} // EmitPriority = 0 
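For reference, the compare canonicalization in the RISCVISelLowering.cpp hunk above can be sketched in isolation (plain strings and a local enum stand in for SDValue and ISD condition codes; this is an illustration, not the patch code):

#include <string>

enum CondCode { SETGT, SETLT, SETGE };

struct Cmp {
  std::string LHS, RHS; // symbolic operands, e.g. {"X", "-1", SETGT}
  CondCode CC;
};

// Compares against the constants -1 and 1 become >= compares against 0,
// so the zero register can be used instead of materializing an immediate.
Cmp canonicalizeForBranch(Cmp C) {
  if (C.CC == SETGT && C.RHS == "-1")
    return {C.LHS, "0", SETGE}; // X > -1  ->  X >= 0
  if (C.CC == SETLT && C.RHS == "1")
    return {"0", C.LHS, SETGE}; // X < 1   ->  0 >= X
  return C;
}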
-//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// // Compress Instruction tablegen backend. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h index 087646fb5ed9..4b2a403c5c5b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h @@ -44,8 +44,7 @@ public: TargetTransformInfo getTargetTransformInfo(const Function &F) const override; - virtual bool isNoopAddrSpaceCast(unsigned SrcAS, - unsigned DstAS) const override; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DstAS) const override; yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; yaml::MachineFunctionInfo * diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h index 93ffa9847f06..db0936f3f56b 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -81,25 +81,22 @@ public: /// Particular to z/OS when in 64 bit mode class SystemZXPLINK64Registers : public SystemZCallingConventionRegisters { public: - int getReturnFunctionAddressRegister() override final { - return SystemZ::R7D; - }; + int getReturnFunctionAddressRegister() final { return SystemZ::R7D; }; - int getStackPointerRegister() override final { return SystemZ::R4D; }; + int getStackPointerRegister() final { return SystemZ::R4D; }; - int getFramePointerRegister() override final { return SystemZ::R8D; }; + int getFramePointerRegister() final { return SystemZ::R8D; }; int getAddressOfCalleeRegister() { return SystemZ::R6D; }; - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF) const override final; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const final; const uint32_t *getCallPreservedMask(const MachineFunction &MF, - CallingConv::ID CC) const override final; + CallingConv::ID CC) const final; - int getCallFrameSize() override final { return 128; } + int getCallFrameSize() final { return 128; } - int getStackPointerBias() override final { return 2048; } + int getStackPointerBias() final { return 2048; } /// Destroys the object.
Bogus destructor overriding base class destructor ~SystemZXPLINK64Registers() = default; @@ -109,23 +106,20 @@ public: /// Particular when on zLinux in 64 bit mode class SystemZELFRegisters : public SystemZCallingConventionRegisters { public: - int getReturnFunctionAddressRegister() override final { - return SystemZ::R14D; - }; + int getReturnFunctionAddressRegister() final { return SystemZ::R14D; }; - int getStackPointerRegister() override final { return SystemZ::R15D; }; + int getStackPointerRegister() final { return SystemZ::R15D; }; - int getFramePointerRegister() override final { return SystemZ::R11D; }; + int getFramePointerRegister() final { return SystemZ::R11D; }; - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF) const override final; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const final; const uint32_t *getCallPreservedMask(const MachineFunction &MF, - CallingConv::ID CC) const override final; + CallingConv::ID CC) const final; - int getCallFrameSize() override final { return SystemZMC::ELFCallFrameSize; } + int getCallFrameSize() final { return SystemZMC::ELFCallFrameSize; } - int getStackPointerBias() override final { return 0; } + int getStackPointerBias() final { return 0; } /// Destroys the object. Bogus destructor overriding base class destructor ~SystemZELFRegisters() = default; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 2636acaf1604..ab6d6b4f7ef1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -577,8 +577,9 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, CallParams.removeOperand(0); // For funcrefs, call_indirect is done through __funcref_call_table and the - // funcref is always installed in slot 0 of the table, therefore instead of having - // the function pointer added at the end of the params list, a zero (the index in + // funcref is always installed in slot 0 of the table, therefore instead of + // having the function pointer added at the end of the params list, a zero + // (the index in // __funcref_call_table is added). if (IsFuncrefCall) { Register RegZero = @@ -1156,7 +1157,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that LowerGlobalAddress // doesn't at MO_GOT which is not needed for direct calls. - GlobalAddressSDNode* GA = cast<GlobalAddressSDNode>(Callee); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Callee); Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, getPointerTy(DAG.getDataLayout()), GA->getOffset()); @@ -1719,20 +1720,12 @@ WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op, const GlobalValue *GV = GA->getGlobal(); - // Currently Emscripten does not support dynamic linking with threads. - // Therefore, if we have thread-local storage, only the local-exec model - // is possible. - // TODO: remove this and implement proper TLS models once Emscripten - // supports dynamic linking with threads. 
- if (GV->getThreadLocalMode() != GlobalValue::LocalExecTLSModel && - !Subtarget->getTargetTriple().isOSEmscripten()) { - report_fatal_error("only -ftls-model=local-exec is supported for now on " - "non-Emscripten OSes: variable " + - GV->getName(), - false); - } - - auto model = GV->getThreadLocalMode(); + // Currently only Emscripten supports dynamic linking with threads. Therefore, + // on other targets, if we have thread-local storage, only the local-exec + // model is possible. + auto model = Subtarget->getTargetTriple().isOSEmscripten() + ? GV->getThreadLocalMode() + : GlobalValue::LocalExecTLSModel; // Unsupported TLS modes assert(model != GlobalValue::NotThreadLocal); @@ -1791,8 +1784,7 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, if (GV->getValueType()->isFunctionTy()) { BaseName = MF.createExternalSymbolName("__table_base"); OperandFlags = WebAssemblyII::MO_TABLE_BASE_REL; - } - else { + } else { BaseName = MF.createExternalSymbolName("__memory_base"); OperandFlags = WebAssemblyII::MO_MEMORY_BASE_REL; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5a4533c4bac4..b080ab7e138c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1041,6 +1041,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); + setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); @@ -1255,6 +1256,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); + setOperationAction(ISD::SMULO, MVT::v2i32, Custom); // We directly match byte blends in the backend as they match the VSELECT // condition form. @@ -19302,6 +19304,44 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { return false; } +static bool canCombineAsMaskOperation(SDValue V1, SDValue V2, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + + MVT VT = V1.getSimpleValueType().getScalarType(); + if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI()) + return false; + + // i8 is better to be widen to i16, because there is PBLENDW for vXi16 + // when the vector bit size is 128 or 256. + if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512) + return false; + + auto HasMaskOperation = [&](SDValue V) { + // TODO: Currently we only check limited opcode. We probably extend + // it to all binary operation by checking TLI.isBinOp(). + switch (V->getOpcode()) { + default: + return false; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::XOR: + break; + } + if (!V->hasOneUse()) + return false; + + return true; + }; + + if (HasMaskOperation(V1) || HasMaskOperation(V2)) + return true; + + return false; +} + // Forward declaration. static SDValue canonicalizeShuffleMaskWithHorizOp( MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask, @@ -19377,6 +19417,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, // integers to handle flipping the low and high halves of AVX 256-bit vectors. 
SmallVector<int, 16> WidenedMask;
 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+ !canCombineAsMaskOperation(V1, V2, Subtarget) &&
 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
 // Shuffle mask widening should not interfere with a broadcast opportunity
 // by obfuscating the operands with bitcasts.
@@ -32379,6 +32420,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 Results.push_back(Res);
 return;
 }
+ case ISD::SMULO:
+ case ISD::UMULO: {
+ EVT VT = N->getValueType(0);
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ VT == MVT::v2i32 && "Unexpected VT!");
+ bool IsSigned = N->getOpcode() == ISD::SMULO;
+ unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
+ // Extract the high 32 bits from each result using PSHUFD.
+ // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
+ SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
+ Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
+ DAG.getIntPtrConstant(0, dl));
+
+ // Truncate the low bits of the result. This will become PSHUFD.
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+
+ SDValue HiCmp;
+ if (IsSigned) {
+ // SMULO overflows if the high bits don't match the sign of the low.
+ HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
+ } else {
+ // UMULO overflows if the high bits are non-zero.
+ HiCmp = DAG.getConstant(0, dl, VT);
+ }
+ SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
+
+ // Widen the result by padding with undef.
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
+ DAG.getUNDEF(VT));
+ Results.push_back(Res);
+ Results.push_back(Ovf);
+ return;
+ }
 case X86ISD::VPMADDWD: {
 // Legalize types for X86ISD::VPMADDWD by widening.
 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
@@ -37522,8 +37600,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 break;
 }
 if (IsBlend) {
- if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
- DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
+ DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
 Shuffle = ISD::OR;
 SrcVT = DstVT = MaskVT.changeTypeToInteger();
 return true;
@@ -41191,7 +41269,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 SDValue Src = Op.getOperand(0);
 APInt DemandedUpperElts = DemandedElts;
 DemandedUpperElts.clearLowBits(1);
- if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
+ if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
 return TLO.CombineTo(Op, Src);
 break;
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index af110884049b..85e5d0ba4c34 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1409,7 +1409,7 @@ namespace llvm {
 Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
- virtual bool needsFixedCatchObjects() const override;
+ bool needsFixedCatchObjects() const override;
 /// This method returns a target specific FastISel object,
 /// or null if the target does not support "fast" ISel.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 98da00c39bdb..81729e3618d8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -544,7 +544,7 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; - virtual outliner::OutlinedFunction getOutliningCandidateInfo( + outliner::OutlinedFunction getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; bool isFunctionSafeToOutlineFrom(MachineFunction &MF, diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 1fd8b88dd776..35adaa3bde65 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -427,27 +428,73 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return true; } +/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids +/// pessimistic codegen that has to account for setting errno and can enable +/// vectorization. +static bool +foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) { + // Match a call to sqrt mathlib function. + auto *Call = dyn_cast<CallInst>(&I); + if (!Call) + return false; + + Module *M = Call->getModule(); + LibFunc Func; + if (!TLI.getLibFunc(*Call, Func) || !isLibFuncEmittable(M, &TLI, Func)) + return false; + + if (Func != LibFunc_sqrt && Func != LibFunc_sqrtf && Func != LibFunc_sqrtl) + return false; + + // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created, + // and (3) we would not end up lowering to a libcall anyway (which could + // change the value of errno), then: + // (1) the operand arg must not be less than -0.0. + // (2) errno won't be set. + // (3) it is safe to convert this to an intrinsic call. + // TODO: Check if the arg is known non-negative. + Type *Ty = Call->getType(); + if (TTI.haveFastSqrt(Ty) && Call->hasNoNaNs()) { + IRBuilder<> Builder(&I); + IRBuilderBase::FastMathFlagGuard Guard(Builder); + Builder.setFastMathFlags(Call->getFastMathFlags()); + + Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Value *NewSqrt = Builder.CreateCall(Sqrt, Call->getArgOperand(0), "sqrt"); + I.replaceAllUsesWith(NewSqrt); + + // Explicitly erase the old call because a call with side effects is not + // trivially dead. + I.eraseFromParent(); + return true; + } + + return false; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. static bool foldUnusualPatterns(Function &F, DominatorTree &DT, - TargetTransformInfo &TTI) { + TargetTransformInfo &TTI, + TargetLibraryInfo &TLI) { bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) continue; - // Do not delete instructions under here and invalidate the iterator. + // Walk the block backwards for efficiency. We're matching a chain of // use->defs, so we're more likely to succeed by starting from the bottom. 
// Also, we want to avoid matching partial patterns. // TODO: It would be more efficient if we removed dead instructions // iteratively in this loop rather than waiting until the end. - for (Instruction &I : llvm::reverse(BB)) { + for (Instruction &I : make_early_inc_range(llvm::reverse(BB))) { MadeChange |= foldAnyOrAllBitsSet(I); MadeChange |= foldGuardedFunnelShift(I, DT); MadeChange |= tryToRecognizePopCount(I); MadeChange |= tryToFPToSat(I, TTI); + MadeChange |= foldSqrt(I, TTI, TLI); } } @@ -467,7 +514,7 @@ static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI, const DataLayout &DL = F.getParent()->getDataLayout(); TruncInstCombine TIC(AC, TLI, DL, DT); MadeChange |= TIC.run(F); - MadeChange |= foldUnusualPatterns(F, DT, TTI); + MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI); return MadeChange; } diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 62cfc3294968..8c77b6937737 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -249,7 +249,8 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM, {LLVMContext::MD_range, LLVMContext::MD_nonnull, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_align, LLVMContext::MD_noundef}); + LLVMContext::MD_align, LLVMContext::MD_noundef, + LLVMContext::MD_nontemporal}); } Args.push_back(LI); ArgAttrVec.push_back(AttributeSet()); @@ -631,8 +632,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR, // Sort parts by offset. append_range(ArgPartsVec, ArgParts); - sort(ArgPartsVec, - [](const auto &A, const auto &B) { return A.first < B.first; }); + sort(ArgPartsVec, llvm::less_first()); // Make sure the parts are non-overlapping. int64_t Offset = ArgPartsVec[0].first; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 660ff3ee9563..83252fec3ea8 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3328,7 +3328,7 @@ struct AANoAliasReturned final : AANoAliasImpl { } /// See AbstractAttribute::updateImpl(...). - virtual ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus updateImpl(Attributor &A) override { auto CheckReturnValue = [&](Value &RV) -> bool { if (Constant *C = dyn_cast<Constant>(&RV)) @@ -3427,7 +3427,7 @@ struct AAIsDeadValueImpl : public AAIsDead { } /// See AbstractAttribute::getAsStr(). - virtual const std::string getAsStr() const override { + const std::string getAsStr() const override { return isAssumedDead() ? "assumed-dead" : "assumed-live"; } @@ -4500,9 +4500,8 @@ struct AAAlignImpl : AAAlign { // to avoid making the alignment explicit if it did not improve. /// See AbstractAttribute::getDeducedAttributes - virtual void - getDeducedAttributes(LLVMContext &Ctx, - SmallVectorImpl<Attribute> &Attrs) const override { + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { if (getAssumedAlign() > 1) Attrs.emplace_back( Attribute::getWithAlignment(Ctx, Align(getAssumedAlign()))); @@ -4709,7 +4708,7 @@ struct AANoReturnImpl : public AANoReturn { } /// See AbstractAttribute::updateImpl(Attributor &A). 
- virtual ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus updateImpl(Attributor &A) override { auto CheckForNoReturn = [](Instruction &) { return false; }; bool UsedAssumedInformation = false; if (!A.checkForAllInstructions(CheckForNoReturn, *this, @@ -4972,9 +4971,8 @@ struct AANoCaptureImpl : public AANoCapture { ChangeStatus updateImpl(Attributor &A) override; /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...). - virtual void - getDeducedAttributes(LLVMContext &Ctx, - SmallVectorImpl<Attribute> &Attrs) const override { + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { if (!isAssumedNoCaptureMaybeReturned()) return; @@ -6848,7 +6846,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl { : AAPrivatizablePtrImpl(IRP, A) {} /// See AbstractAttribute::initialize(...). - virtual void initialize(Attributor &A) override { + void initialize(Attributor &A) override { // TODO: We can privatize more than arguments. indicatePessimisticFixpoint(); } @@ -7222,7 +7220,7 @@ struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl { : AAMemoryBehaviorImpl(IRP, A) {} /// See AbstractAttribute::updateImpl(Attributor &A). - virtual ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override; /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { @@ -7934,7 +7932,7 @@ struct AAMemoryLocationFunction final : public AAMemoryLocationImpl { : AAMemoryLocationImpl(IRP, A) {} /// See AbstractAttribute::updateImpl(Attributor &A). - virtual ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus updateImpl(Attributor &A) override { const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(), DepClassTy::NONE); @@ -9332,13 +9330,13 @@ struct AANoUndefCallSiteReturned final struct AACallEdgesImpl : public AACallEdges { AACallEdgesImpl(const IRPosition &IRP, Attributor &A) : AACallEdges(IRP, A) {} - virtual const SetVector<Function *> &getOptimisticEdges() const override { + const SetVector<Function *> &getOptimisticEdges() const override { return CalledFunctions; } - virtual bool hasUnknownCallee() const override { return HasUnknownCallee; } + bool hasUnknownCallee() const override { return HasUnknownCallee; } - virtual bool hasNonAsmUnknownCallee() const override { + bool hasNonAsmUnknownCallee() const override { return HasUnknownCalleeNonAsm; } diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 49077f92884f..50710eaa1b57 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -931,10 +931,9 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // a value can't capture arguments. Don't analyze them. 
if (F->onlyReadsMemory() && F->doesNotThrow() && F->getReturnType()->isVoidTy()) { - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; - ++A) { - if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { - A->addAttr(Attribute::NoCapture); + for (Argument &A : F->args()) { + if (A.getType()->isPointerTy() && !A.hasNoCaptureAttr()) { + A.addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(F); } @@ -942,44 +941,43 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, continue; } - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; - ++A) { - if (!A->getType()->isPointerTy()) + for (Argument &A : F->args()) { + if (!A.getType()->isPointerTy()) continue; bool HasNonLocalUses = false; - if (!A->hasNoCaptureAttr()) { + if (!A.hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); - PointerMayBeCaptured(&*A, &Tracker); + PointerMayBeCaptured(&A, &Tracker); if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr(Attribute::NoCapture); + A.addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. - ArgumentGraphNode *Node = AG[&*A]; + ArgumentGraphNode *Node = AG[&A]; for (Argument *Use : Tracker.Uses) { Node->Uses.push_back(AG[Use]); - if (Use != &*A) + if (Use != &A) HasNonLocalUses = true; } } } // Otherwise, it's captured. Don't bother doing SCC analysis on it. } - if (!HasNonLocalUses && !A->onlyReadsMemory()) { + if (!HasNonLocalUses && !A.onlyReadsMemory()) { // Can we determine that it's readonly/readnone/writeonly without doing // an SCC? Note that we don't allow any calls at all here, or else our // result will be dependent on the iteration order through the // functions in the SCC. 
SmallPtrSet<Argument *, 8> Self; - Self.insert(&*A); - Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + Self.insert(&A); + Attribute::AttrKind R = determinePointerAccessAttrs(&A, Self); if (R != Attribute::None) - if (addAccessAttr(A, R)) + if (addAccessAttr(&A, R)) Changed.insert(F); } } @@ -1017,12 +1015,10 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, } bool SCCCaptured = false; - for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); - I != E && !SCCCaptured; ++I) { - ArgumentGraphNode *Node = *I; - if (Node->Uses.empty()) { - if (!Node->Definition->hasNoCaptureAttr()) - SCCCaptured = true; + for (ArgumentGraphNode *Node : ArgumentSCC) { + if (Node->Uses.empty() && !Node->Definition->hasNoCaptureAttr()) { + SCCCaptured = true; + break; } } if (SCCCaptured) @@ -1035,9 +1031,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } - for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); - I != E && !SCCCaptured; ++I) { - ArgumentGraphNode *N = *I; + for (ArgumentGraphNode *N : ArgumentSCC) { for (ArgumentGraphNode *Use : N->Uses) { Argument *A = Use->Definition; if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) @@ -1045,12 +1039,14 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, SCCCaptured = true; break; } + if (SCCCaptured) + break; } if (SCCCaptured) continue; - for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { - Argument *A = ArgumentSCC[i]->Definition; + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(A->getParent()); @@ -1078,16 +1074,17 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, }; Attribute::AttrKind AccessAttr = Attribute::ReadNone; - for (unsigned i = 0, e = ArgumentSCC.size(); - i != e && AccessAttr != Attribute::None; ++i) { - Argument *A = ArgumentSCC[i]->Definition; + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; Attribute::AttrKind K = determinePointerAccessAttrs(A, ArgumentSCCNodes); AccessAttr = meetAccessAttr(AccessAttr, K); + if (AccessAttr == Attribute::None) + break; } if (AccessAttr != Attribute::None) { - for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { - Argument *A = ArgumentSCC[i]->Definition; + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; if (addAccessAttr(A, AccessAttr)) Changed.insert(A->getParent()); } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ec26db8bfc0b..6df0409256bb 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -470,8 +470,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Sort by offset. SmallVector<std::pair<uint64_t, Type *>, 16> TypesVector; append_range(TypesVector, Types); - sort(TypesVector, - [](const auto &A, const auto &B) { return A.first < B.first; }); + sort(TypesVector, llvm::less_first()); // Check that the types are non-overlapping. 
uint64_t Offset = 0; diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 6bf25df101fa..e3e4908f085b 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1778,35 +1778,48 @@ void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { Old->replaceUsesWithIf(New, isDirectCall); } +static void dropTypeTests(Module &M, Function &TypeTestFunc) { + for (Use &U : llvm::make_early_inc_range(TypeTestFunc.uses())) { + auto *CI = cast<CallInst>(U.getUser()); + // Find and erase llvm.assume intrinsics for this llvm.type.test call. + for (Use &CIU : llvm::make_early_inc_range(CI->uses())) + if (auto *Assume = dyn_cast<AssumeInst>(CIU.getUser())) + Assume->eraseFromParent(); + // If the assume was merged with another assume, we might have a use on a + // phi (which will feed the assume). Simply replace the use on the phi + // with "true" and leave the merged assume. + if (!CI->use_empty()) { + assert( + all_of(CI->users(), [](User *U) -> bool { return isa<PHINode>(U); })); + CI->replaceAllUsesWith(ConstantInt::getTrue(M.getContext())); + } + CI->eraseFromParent(); + } +} + bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); - if (DropTypeTests && TypeTestFunc) { - for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) { - auto *CI = cast<CallInst>(U.getUser()); - // Find and erase llvm.assume intrinsics for this llvm.type.test call. - for (Use &CIU : llvm::make_early_inc_range(CI->uses())) - if (auto *Assume = dyn_cast<AssumeInst>(CIU.getUser())) - Assume->eraseFromParent(); - // If the assume was merged with another assume, we might have a use on a - // phi (which will feed the assume). Simply replace the use on the phi - // with "true" and leave the merged assume. - if (!CI->use_empty()) { - assert(all_of(CI->users(), - [](User *U) -> bool { return isa<PHINode>(U); })); - CI->replaceAllUsesWith(ConstantInt::getTrue(M.getContext())); - } - CI->eraseFromParent(); + if (DropTypeTests) { + if (TypeTestFunc) + dropTypeTests(M, *TypeTestFunc); + // Normally we'd have already removed all @llvm.public.type.test calls, + // except for in the case where we originally were performing ThinLTO but + // decided not to in the backend. + Function *PublicTypeTestFunc = + M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + if (PublicTypeTestFunc) + dropTypeTests(M, *PublicTypeTestFunc); + if (TypeTestFunc || PublicTypeTestFunc) { + // We have deleted the type intrinsics, so we no longer have enough + // information to reason about the liveness of virtual function pointers + // in GlobalDCE. + for (GlobalVariable &GV : M.globals()) + GV.eraseMetadata(LLVMContext::MD_vcall_visibility); + return true; } - - // We have deleted the type intrinsics, so we no longer have enough - // information to reason about the liveness of virtual function pointers - // in GlobalDCE. 
- for (GlobalVariable &GV : M.globals()) - GV.eraseMetadata(LLVMContext::MD_vcall_visibility); - - return true; + return false; } // If only some of the modules were split, we cannot correctly perform diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 0b42fc151991..ef2384faa273 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -499,18 +499,6 @@ struct OMPInformationCache : public InformationCache { } #include "llvm/Frontend/OpenMP/OMPKinds.def" - // Remove the `noinline` attribute from `__kmpc`, `_OMP::` and `omp_` - // functions, except if `optnone` is present. - if (isOpenMPDevice(M)) { - for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"}) - if (F.hasFnAttribute(Attribute::NoInline) && - F.getName().startswith(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); - } - } - // TODO: We should attach the attributes defined in OMPKinds.def. } diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index 26fb7d676429..0453af184a72 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -148,7 +148,7 @@ struct FunctionSpecializationLegacyPass : public ModulePass { AU.addRequired<TargetTransformInfoWrapperPass>(); } - virtual bool runOnModule(Module &M) override { + bool runOnModule(Module &M) override { if (skipModule(M)) return false; diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index a360a768a2bc..ef7af551a328 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -132,6 +132,14 @@ void promoteTypeIds(Module &M, StringRef ModuleId) { } } + if (Function *PublicTypeTestFunc = + M.getFunction(Intrinsic::getName(Intrinsic::public_type_test))) { + for (const Use &U : PublicTypeTestFunc->uses()) { + auto CI = cast<CallInst>(U.getUser()); + ExternalizeTypeId(CI, 1); + } + } + if (Function *TypeCheckedLoadFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) { for (const Use &U : TypeCheckedLoadFunc->uses()) { diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index ad00c116ce0a..18efe99f7cb4 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -773,15 +773,14 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M, return PreservedAnalyses::none(); } +namespace llvm { // Enable whole program visibility if enabled by client (e.g. linker) or // internal option, and not force disabled. -static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { +bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) && !DisableWholeProgramVisibility; } -namespace llvm { - /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). 
@@ -790,7 +789,7 @@ void updateVCallVisibilityInModule( const DenseSet<GlobalValue::GUID> &DynamicExportSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; - for (GlobalVariable &GV : M.globals()) + for (GlobalVariable &GV : M.globals()) { // Add linkage unit visibility to any variable with type metadata, which are // the vtable definitions. We won't have an existing vcall_visibility // metadata on vtable definitions with public visibility. @@ -800,6 +799,34 @@ void updateVCallVisibilityInModule( // linker, as we have no information on their eventual use. !DynamicExportSymbols.count(GV.getGUID())) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); + } +} + +void updatePublicTypeTestCalls(Module &M, + bool WholeProgramVisibilityEnabledInLTO) { + Function *PublicTypeTestFunc = + M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + if (!PublicTypeTestFunc) + return; + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { + Function *TypeTestFunc = + Intrinsic::getDeclaration(&M, Intrinsic::type_test); + for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { + auto *CI = cast<CallInst>(U.getUser()); + auto *NewCI = CallInst::Create( + TypeTestFunc, {CI->getArgOperand(0), CI->getArgOperand(1)}, None, "", + CI); + CI->replaceAllUsesWith(NewCI); + CI->eraseFromParent(); + } + } else { + auto *True = ConstantInt::getTrue(M.getContext()); + for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { + auto *CI = cast<CallInst>(U.getUser()); + CI->replaceAllUsesWith(True); + CI->eraseFromParent(); + } + } } /// If whole program visibility asserted, then upgrade all public vcall diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 827b25533513..664226ec187b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -597,10 +597,9 @@ public: /// demanded bits. bool SimplifyDemandedInstructionBits(Instruction &Inst); - virtual Value * - SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, - unsigned Depth = 0, - bool AllowMultipleUsers = false) override; + Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, + APInt &UndefElts, unsigned Depth = 0, + bool AllowMultipleUsers = false) override; /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index cf2754b1dd60..3274e36ab71a 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1232,7 +1232,9 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // dynamic alloca instrumentation for them as well. 
!AI.isUsedWithInAlloca() && // swifterror allocas are register promoted by ISel - !AI.isSwiftError()); + !AI.isSwiftError() && + // safe allocas are not interesting + !(SSGI && SSGI->isSafe(AI))); ProcessedAllocas[&AI] = IsInteresting; return IsInteresting; diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index fd2eaee8b47d..013a119c5096 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -213,10 +213,12 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only - // (whose TTI sets PrefetchDistance). - if (getPrefetchDistance() == 0) + // (whose TTI sets PrefetchDistance and CacheLineSize). + if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { + LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " + "for loop data prefetch.\n"); return false; - assert(TTI->getCacheLineSize() && "Cache line size is not set for target"); + } bool MadeChange = false; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index c05906649f16..f1e1359255bd 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -338,6 +338,9 @@ class LowerMatrixIntrinsics { Value *extractVector(unsigned I, unsigned J, unsigned NumElts, IRBuilder<> &Builder) const { Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I); + assert(cast<FixedVectorType>(Vec->getType())->getNumElements() >= + NumElts && + "Extracted vector will contain poison values"); return Builder.CreateShuffleVector( Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0), "block"); @@ -1423,13 +1426,13 @@ public: FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize); MatrixTy TileResult; // Insert in the inner loop header. - Builder.SetInsertPoint(TI.InnerLoopHeader->getTerminator()); + Builder.SetInsertPoint(TI.KLoop.Header->getTerminator()); // Create PHI nodes for the result columns to accumulate across iterations. SmallVector<PHINode *, 4> ColumnPhis; for (unsigned I = 0; I < TileSize; I++) { auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I)); Phi->addIncoming(ConstantAggregateZero::get(TileVecTy), - TI.RowLoopHeader->getSingleSuccessor()); + TI.RowLoop.Header->getSingleSuccessor()); TileResult.addVector(Phi); ColumnPhis.push_back(Phi); } @@ -1438,27 +1441,29 @@ public: // Res += Load(CurrentRow, K) * Load(K, CurrentColumn) Builder.SetInsertPoint(InnerBody->getTerminator()); // Load tiles of the operands. - MatrixTy A = loadMatrix(LPtr, {}, false, LShape, TI.CurrentRow, TI.CurrentK, - {TileSize, TileSize}, EltType, Builder); - MatrixTy B = loadMatrix(RPtr, {}, false, RShape, TI.CurrentK, TI.CurrentCol, - {TileSize, TileSize}, EltType, Builder); + MatrixTy A = + loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index, + {TileSize, TileSize}, EltType, Builder); + MatrixTy B = + loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index, TI.ColumnLoop.Index, + {TileSize, TileSize}, EltType, Builder); emitMatrixMultiply(TileResult, A, B, Builder, true, false, getFastMathFlags(MatMul)); // Store result after the inner loop is done. 
- Builder.SetInsertPoint(TI.RowLoopLatch->getTerminator()); + Builder.SetInsertPoint(TI.RowLoop.Latch->getTerminator()); storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(), Store->isVolatile(), {LShape.NumRows, RShape.NumColumns}, - TI.CurrentRow, TI.CurrentCol, EltType, Builder); + TI.RowLoop.Index, TI.ColumnLoop.Index, EltType, Builder); for (unsigned I = 0; I < TileResult.getNumVectors(); I++) - ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.InnerLoopLatch); + ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.KLoop.Latch); // Force unrolling of a few iterations of the inner loop, to make sure there // is enough work per iteration. // FIXME: The unroller should make this decision directly instead, but // currently the cost-model is not up to the task. unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize); - addStringMetadataToLoop(LI->getLoopFor(TI.InnerLoopHeader), + addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header), "llvm.loop.unroll.count", InnerLoopUnrollCount); } diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 240fb5e60687..cd2ce8ce336e 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -147,27 +147,27 @@ XorOpnd::XorOpnd(Value *V) { /// Instruction::isAssociative() because it includes operations like fsub. /// (This routine is only intended to be called for floating-point operations.) static bool hasFPAssociativeFlags(Instruction *I) { - assert(I && I->getType()->isFPOrFPVectorTy() && "Should only check FP ops"); + assert(I && isa<FPMathOperator>(I) && "Should only check FP ops"); return I->hasAllowReassoc() && I->hasNoSignedZeros(); } /// Return true if V is an instruction of the specified opcode and if it /// only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - auto *I = dyn_cast<Instruction>(V); - if (I && I->hasOneUse() && I->getOpcode() == Opcode) - if (!isa<FPMathOperator>(I) || hasFPAssociativeFlags(I)) - return cast<BinaryOperator>(I); + auto *BO = dyn_cast<BinaryOperator>(V); + if (BO && BO->hasOneUse() && BO->getOpcode() == Opcode) + if (!isa<FPMathOperator>(BO) || hasFPAssociativeFlags(BO)) + return BO; return nullptr; } static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, unsigned Opcode2) { - auto *I = dyn_cast<Instruction>(V); - if (I && I->hasOneUse() && - (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2)) - if (!isa<FPMathOperator>(I) || hasFPAssociativeFlags(I)) - return cast<BinaryOperator>(I); + auto *BO = dyn_cast<BinaryOperator>(V); + if (BO && BO->hasOneUse() && + (BO->getOpcode() == Opcode1 || BO->getOpcode() == Opcode2)) + if (!isa<FPMathOperator>(BO) || hasFPAssociativeFlags(BO)) + return BO; return nullptr; } @@ -778,7 +778,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, Constant *Undef = UndefValue::get(I->getType()); NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), Undef, Undef, "", I); - if (NewOp->getType()->isFPOrFPVectorTy()) + if (isa<FPMathOperator>(NewOp)) NewOp->setFastMathFlags(I->getFastMathFlags()); } else { NewOp = NodesToRewrite.pop_back_val(); @@ -2227,7 +2227,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) { // Don't optimize floating-point instructions unless they have the // appropriate FastMathFlags for reassociation enabled. 
- if (I->getType()->isFPOrFPVectorTy() && !hasFPAssociativeFlags(I)) + if (isa<FPMathOperator>(I) && !hasFPAssociativeFlags(I)) return; // Do not reassociate boolean (i1) expressions. We want to preserve the diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 00387ec426bf..878f9477a29d 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -825,6 +825,35 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart, } } +/// Bundle operands of the inlined function must be added to inlined call sites. +static void PropagateOperandBundles(Function::iterator InlinedBB, + Instruction *CallSiteEHPad) { + for (Instruction &II : llvm::make_early_inc_range(*InlinedBB)) { + CallBase *I = dyn_cast<CallBase>(&II); + if (!I) + continue; + // Skip call sites which already have a "funclet" bundle. + if (I->getOperandBundle(LLVMContext::OB_funclet)) + continue; + // Skip call sites which are nounwind intrinsics (as long as they don't + // lower into regular function calls in the course of IR transformations). + auto *CalledFn = + dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow() && + !IntrinsicInst::mayLowerToFunctionCall(CalledFn->getIntrinsicID())) + continue; + + SmallVector<OperandBundleDef, 1> OpBundles; + I->getOperandBundlesAsDefs(OpBundles); + OpBundles.emplace_back("funclet", CallSiteEHPad); + + Instruction *NewInst = CallBase::Create(I, OpBundles, I); + NewInst->takeName(I); + I->replaceAllUsesWith(NewInst); + I->eraseFromParent(); + } +} + namespace { /// Utility for cloning !noalias and !alias.scope metadata. When a code region /// using scoped alias metadata is inlined, the aliasing relationships may not @@ -2304,38 +2333,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Update the lexical scopes of the new funclets and callsites. // Anything that had 'none' as its parent is now nested inside the callsite's // EHPad. - if (CallSiteEHPad) { for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); BB != E; ++BB) { - // Add bundle operands to any top-level call sites. - SmallVector<OperandBundleDef, 1> OpBundles; - for (Instruction &II : llvm::make_early_inc_range(*BB)) { - CallBase *I = dyn_cast<CallBase>(&II); - if (!I) - continue; - - // Skip call sites which are nounwind intrinsics. - auto *CalledFn = - dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts()); - if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow()) - continue; - - // Skip call sites which already have a "funclet" bundle. - if (I->getOperandBundle(LLVMContext::OB_funclet)) - continue; - - I->getOperandBundlesAsDefs(OpBundles); - OpBundles.emplace_back("funclet", CallSiteEHPad); - - Instruction *NewInst = CallBase::Create(I, OpBundles, I); - NewInst->takeName(I); - I->replaceAllUsesWith(NewInst); - I->eraseFromParent(); - - OpBundles.clear(); - } + // Add bundle operands to inlined call sites. 
+ PropagateOperandBundles(BB, CallSiteEHPad); // It is problematic if the inlinee has a cleanupret which unwinds to // caller and we inline it into a call site which doesn't unwind but into diff --git a/llvm/lib/Transforms/Utils/MatrixUtils.cpp b/llvm/lib/Transforms/Utils/MatrixUtils.cpp index 6a137630deeb..e218773cf5da 100644 --- a/llvm/lib/Transforms/Utils/MatrixUtils.cpp +++ b/llvm/lib/Transforms/Utils/MatrixUtils.cpp @@ -70,35 +70,35 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit, BasicBlock *TileInfo::CreateTiledLoops(BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, DomTreeUpdater &DTU, LoopInfo &LI) { - Loop *ColLoop = LI.AllocateLoop(); - Loop *RowLoop = LI.AllocateLoop(); - Loop *InnerLoop = LI.AllocateLoop(); - RowLoop->addChildLoop(InnerLoop); - ColLoop->addChildLoop(RowLoop); + Loop *ColumnLoopInfo = LI.AllocateLoop(); + Loop *RowLoopInfo = LI.AllocateLoop(); + Loop *KLoopInfo = LI.AllocateLoop(); + RowLoopInfo->addChildLoop(KLoopInfo); + ColumnLoopInfo->addChildLoop(RowLoopInfo); if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(ColLoop); + ParentL->addChildLoop(ColumnLoopInfo); else - LI.addTopLevelLoop(ColLoop); + LI.addTopLevelLoop(ColumnLoopInfo); BasicBlock *ColBody = CreateLoop(Start, End, B.getInt64(NumColumns), B.getInt64(TileSize), - "cols", B, DTU, ColLoop, LI); - BasicBlock *ColLatch = ColBody->getSingleSuccessor(); + "cols", B, DTU, ColumnLoopInfo, LI); + ColumnLoop.Latch = ColBody->getSingleSuccessor(); BasicBlock *RowBody = - CreateLoop(ColBody, ColLatch, B.getInt64(NumRows), B.getInt64(TileSize), - "rows", B, DTU, RowLoop, LI); - RowLoopLatch = RowBody->getSingleSuccessor(); + CreateLoop(ColBody, ColumnLoop.Latch, B.getInt64(NumRows), + B.getInt64(TileSize), "rows", B, DTU, RowLoopInfo, LI); + RowLoop.Latch = RowBody->getSingleSuccessor(); BasicBlock *InnerBody = - CreateLoop(RowBody, RowLoopLatch, B.getInt64(NumInner), - B.getInt64(TileSize), "inner", B, DTU, InnerLoop, LI); - InnerLoopLatch = InnerBody->getSingleSuccessor(); - ColumnLoopHeader = ColBody->getSinglePredecessor(); - RowLoopHeader = RowBody->getSinglePredecessor(); - InnerLoopHeader = InnerBody->getSinglePredecessor(); - CurrentRow = &*RowLoopHeader->begin(); - CurrentCol = &*ColumnLoopHeader->begin(); - CurrentK = &*InnerLoopHeader->begin(); + CreateLoop(RowBody, RowLoop.Latch, B.getInt64(NumInner), + B.getInt64(TileSize), "inner", B, DTU, KLoopInfo, LI); + KLoop.Latch = InnerBody->getSingleSuccessor(); + ColumnLoop.Header = ColBody->getSinglePredecessor(); + RowLoop.Header = RowBody->getSinglePredecessor(); + KLoop.Header = InnerBody->getSinglePredecessor(); + RowLoop.Index = &*RowLoop.Header->begin(); + ColumnLoop.Index = &*ColumnLoop.Header->begin(); + KLoop.Index = &*KLoop.Header->begin(); return InnerBody; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index bca3b0538c5d..03087d8370d5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -75,39 +75,109 @@ static bool callHasFP128Argument(const CallInst *CI) { }); } -static Value *convertStrToNumber(CallInst *CI, StringRef &Str, Value *EndPtr, - int64_t Base, IRBuilderBase &B) { +// Convert the entire string Str representing an integer in Base, up to +// the terminating nul if present, to a constant according to the rules +// of strtoul[l] or, when AsSigned is set, of strtol[l]. On success +// return the result, otherwise null. 
+// The function assumes the string is encoded in ASCII and carefully +// avoids converting sequences (including "") that the corresponding +// library call might fail and set errno for. +static Value *convertStrToInt(CallInst *CI, StringRef &Str, Value *EndPtr, + uint64_t Base, bool AsSigned, IRBuilderBase &B) { if (Base < 2 || Base > 36) - // handle special zero base if (Base != 0) + // Fail for an invalid base (required by POSIX). return nullptr; - char *End; - std::string nptr = Str.str(); - errno = 0; - long long int Result = strtoll(nptr.c_str(), &End, Base); - if (errno) - return nullptr; + // Strip leading whitespace. + for (unsigned i = 0; i != Str.size(); ++i) + if (!isSpace((unsigned char)Str[i])) { + Str = Str.substr(i); + break; + } - // if we assume all possible target locales are ASCII supersets, - // then if strtoll successfully parses a number on the host, - // it will also successfully parse the same way on the target - if (*End != '\0') + if (Str.empty()) + // Fail for empty subject sequences (POSIX allows but doesn't require + // strtol[l]/strtoul[l] to fail with EINVAL). return nullptr; - if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result)) - return nullptr; + // Strip but remember the sign. + bool Negate = Str[0] == '-'; + if (Str[0] == '-' || Str[0] == '+') { + Str = Str.drop_front(); + if (Str.empty()) + // Fail for a sign with nothing after it. + return nullptr; + } + + // Set Max to the absolute value of the minimum (for signed), or + // to the maximum (for unsigned) value representable in the type. + Type *RetTy = CI->getType(); + unsigned NBits = RetTy->getPrimitiveSizeInBits(); + uint64_t Max = AsSigned && Negate ? 1 : 0; + Max += AsSigned ? maxIntN(NBits) : maxUIntN(NBits); + + // Autodetect Base if it's zero and consume the "0x" prefix. + if (Str.size() > 1) { + if (Str[0] == '0') { + if (toUpper((unsigned char)Str[1]) == 'X') { + if (Str.size() == 2 || (Base && Base != 16)) + // Fail if Base doesn't allow the "0x" prefix or for the prefix + // alone that implementations like BSD set errno to EINVAL for. + return nullptr; + + Str = Str.drop_front(2); + Base = 16; + } + else if (Base == 0) + Base = 8; + } else if (Base == 0) + Base = 10; + } + else if (Base == 0) + Base = 10; + + // Convert the rest of the subject sequence, not including the sign, + // to its uint64_t representation (this assumes the source character + // set is ASCII). + uint64_t Result = 0; + for (unsigned i = 0; i != Str.size(); ++i) { + unsigned char DigVal = Str[i]; + if (isDigit(DigVal)) + DigVal = DigVal - '0'; + else { + DigVal = toUpper(DigVal); + if (isAlpha(DigVal)) + DigVal = DigVal - 'A' + 10; + else + return nullptr; + } + + if (DigVal >= Base) + // Fail if the digit is not valid in the Base. + return nullptr; + + // Add the digit and fail if the result is not representable in + // the (unsigned form of the) destination type. + bool VFlow; + Result = SaturatingMultiplyAdd(Result, Base, (uint64_t)DigVal, &VFlow); + if (VFlow || Result > Max) + return nullptr; + } if (EndPtr) { // Store the pointer to the end. - uint64_t ILen = End - nptr.c_str(); - Value *Off = B.getInt64(ILen); + Value *Off = B.getInt64(Str.size()); Value *StrBeg = CI->getArgOperand(0); Value *StrEnd = B.CreateInBoundsGEP(B.getInt8Ty(), StrBeg, Off, "endptr"); B.CreateStore(StrEnd, EndPtr); } - return ConstantInt::get(CI->getType(), Result); + if (Negate) + // Unsigned negation doesn't overflow. 
+ Result = -Result; + + return ConstantInt::get(RetTy, Result); } static bool isOnlyUsedInComparisonWithZero(Value *V) { @@ -2531,27 +2601,35 @@ Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(CI->getType(), 0x7F)); } +// Fold calls to atoi, atol, and atoll. Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) { + CI->addParamAttr(0, Attribute::NoCapture); + StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; - return convertStrToNumber(CI, Str, nullptr, 10, B); + return convertStrToInt(CI, Str, nullptr, 10, /*AsSigned=*/true, B); } -Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { - StringRef Str; - if (!getConstantStringInfo(CI->getArgOperand(0), Str)) - return nullptr; - +// Fold calls to strtol, strtoll, strtoul, and strtoull. +Value *LibCallSimplifier::optimizeStrToInt(CallInst *CI, IRBuilderBase &B, + bool AsSigned) { Value *EndPtr = CI->getArgOperand(1); - if (isa<ConstantPointerNull>(EndPtr)) + if (isa<ConstantPointerNull>(EndPtr)) { + // With a null EndPtr, this function won't capture the main argument. + // It would be readonly too, except that it still may write to errno. + CI->addParamAttr(0, Attribute::NoCapture); EndPtr = nullptr; - else if (!isKnownNonZero(EndPtr, DL)) + } else if (!isKnownNonZero(EndPtr, DL)) + return nullptr; + + StringRef Str; + if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) { - return convertStrToNumber(CI, Str, EndPtr, CInt->getSExtValue(), B); + return convertStrToInt(CI, Str, EndPtr, CInt->getSExtValue(), AsSigned, B); } return nullptr; @@ -3390,7 +3468,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { return optimizeAtoi(CI, Builder); case LibFunc_strtol: case LibFunc_strtoll: - return optimizeStrtol(CI, Builder); + return optimizeStrToInt(CI, Builder, /*AsSigned=*/true); + case LibFunc_strtoul: + case LibFunc_strtoull: + return optimizeStrToInt(CI, Builder, /*AsSigned=*/false); case LibFunc_printf: return optimizePrintF(CI, Builder); case LibFunc_sprintf: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b887ea41676b..238b074089aa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -798,8 +798,7 @@ public: // Override this function to handle the more complex control flow around the // three loops. - std::pair<BasicBlock *, Value *> - createVectorizedLoopSkeleton() final override { + std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final { return createEpilogueVectorizedLoopSkeleton(); } @@ -835,8 +834,7 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair<BasicBlock *, Value *> - createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -866,8 +864,7 @@ public: } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). 
- std::pair<BasicBlock *, Value *>
- createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
 protected:
 /// Emits an iteration count bypass check after the main vector loop has
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cd044c78d900..d69d1e3d19f3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10972,9 +10972,7 @@ public:
 It != E; ++It) {
 PossibleRedValsVect.emplace_back();
 auto RedValsVect = It->second.takeVector();
- stable_sort(RedValsVect, [](const auto &P1, const auto &P2) {
- return P1.second < P2.second;
- });
+ stable_sort(RedValsVect, llvm::less_second());
 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
 PossibleRedValsVect.back().append(Data.second, Data.first);
 }
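
Several hunks in this import (LoopAccessAnalysis.cpp, ArgumentPromotion.cpp, GlobalOpt.cpp, and the SLPVectorizer.cpp hunk just above) replace hand-written pair comparators with the llvm::less_first and llvm::less_second functors from llvm/ADT/STLExtras.h. A minimal standalone sketch of the idiom; it assumes an LLVM development tree for the header, and the data below is illustrative only:

#include "llvm/ADT/STLExtras.h"

#include <cstdint>
#include <iostream>
#include <set>
#include <utility>
#include <vector>

int main() {
  // less_first orders pairs by their first element only, replacing lambdas
  // such as [](const auto &A, const auto &B) { return A.first < B.first; }.
  std::vector<std::pair<uint64_t, int>> Parts = {{16, 2}, {0, 0}, {8, 1}};
  llvm::sort(Parts, llvm::less_first());
  std::cout << Parts.front().first << '\n'; // prints: 0

  // less_second orders by the second element, as in the SLPVectorizer hunk;
  // stable_sort additionally preserves the input order of tied keys.
  std::vector<std::pair<char, unsigned>> Vals = {{'b', 2}, {'a', 1}};
  llvm::stable_sort(Vals, llvm::less_second());
  std::cout << Vals.front().first << '\n'; // prints: a

  // The functor type also works as the ordering of an ordered container,
  // matching the std::set in the LoopAccessAnalysis.cpp hunk.
  std::set<std::pair<int64_t, int>, llvm::less_first> Offsets;
  Offsets.emplace(0, 0);
  Offsets.emplace(-4, 1);
  std::cout << Offsets.begin()->first << '\n'; // prints: -4
  return 0;
}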
