diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-14 18:50:02 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-14 18:50:02 +0000 |
| commit | 1f917f69ff07f09b6dbb670971f57f8efe718b84 (patch) | |
| tree | 99293cbc1411737cd995dac10a99b2c40ef0944c /llvm/lib | |
| parent | 145449b1e420787bb99721a429341fa6be3adfb6 (diff) | |
Diffstat (limited to 'llvm/lib')
455 files changed, 12472 insertions, 4478 deletions
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 1d880424e55c..428ae8975c30 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -629,9 +630,10 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L, if (!CmpLHSConst || !llvm::is_contained(successors(BB), B)) continue; // First collapse InstChain + const DataLayout &DL = BB->getModule()->getDataLayout(); for (Instruction *I : llvm::reverse(InstChain)) { - CmpLHSConst = ConstantExpr::get(I->getOpcode(), CmpLHSConst, - cast<Constant>(I->getOperand(1)), true); + CmpLHSConst = ConstantFoldBinaryOpOperands( + I->getOpcode(), CmpLHSConst, cast<Constant>(I->getOperand(1)), DL); if (!CmpLHSConst) break; } @@ -826,9 +828,8 @@ void BranchProbabilityInfo::computeEestimateBlockWeight( if (auto BBWeight = getInitialEstimatedBlockWeight(BB)) // If we were able to find estimated weight for the block set it to this // block and propagate up the IR. - propagateEstimatedBlockWeight(getLoopBlock(BB), DT, PDT, - BBWeight.getValue(), BlockWorkList, - LoopWorkList); + propagateEstimatedBlockWeight(getLoopBlock(BB), DT, PDT, BBWeight.value(), + BlockWorkList, LoopWorkList); // BlockWorklist/LoopWorkList contains blocks/loops with at least one // successor/exit having estimated weight. 
Try to propagate weight to such diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index a81041845052..aa4da27be4e5 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/Config/config.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantFold.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -1142,8 +1143,12 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL, Ops.push_back(NewC); } - if (auto *CE = dyn_cast<ConstantExpr>(C)) - return ConstantFoldInstOperandsImpl(CE, CE->getOpcode(), Ops, DL, TLI); + if (auto *CE = dyn_cast<ConstantExpr>(C)) { + if (Constant *Res = + ConstantFoldInstOperandsImpl(CE, CE->getOpcode(), Ops, DL, TLI)) + return Res; + return const_cast<Constant *>(C); + } assert(isa<ConstantVector>(C)); return ConstantVector::get(Ops); @@ -1339,7 +1344,9 @@ Constant *llvm::ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, if (Constant *C = SymbolicallyEvaluateBinop(Opcode, LHS, RHS, DL)) return C; - return ConstantExpr::get(Opcode, LHS, RHS); + if (ConstantExpr::isDesirableBinOp(Opcode)) + return ConstantExpr::get(Opcode, LHS, RHS); + return ConstantFoldBinaryInstruction(Opcode, LHS, RHS); } Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *I, @@ -1390,6 +1397,8 @@ Constant *llvm::ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS, // Calculate constant result. Constant *C = ConstantFoldBinaryOpOperands(Opcode, Op0, Op1, DL); + if (!C) + return nullptr; // Flush denormal output if needed. 
return FlushFPConstant(C, I, /* IsOutput */ true); diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp index e82d2fae9356..db6eae0d962a 100644 --- a/llvm/lib/Analysis/GlobalsModRef.cpp +++ b/llvm/lib/Analysis/GlobalsModRef.cpp @@ -256,22 +256,6 @@ FunctionModRefBehavior GlobalsAAResult::getModRefBehavior(const Function *F) { return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min); } -FunctionModRefBehavior -GlobalsAAResult::getModRefBehavior(const CallBase *Call) { - FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; - - if (!Call->hasOperandBundles()) - if (const Function *F = Call->getCalledFunction()) - if (FunctionInfo *FI = getFunctionInfo(F)) { - if (!isModOrRefSet(FI->getModRefInfo())) - Min = FMRB_DoesNotAccessMemory; - else if (!isModSet(FI->getModRefInfo())) - Min = FMRB_OnlyReadsMemory; - } - - return FunctionModRefBehavior(AAResultBase::getModRefBehavior(Call) & Min); -} - /// Returns the function info for the function, or null if we don't have /// anything useful to say about it. 
GlobalsAAResult::FunctionInfo * diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index 3d51042f4da8..a681c528e690 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -184,8 +184,8 @@ CmpInst::Predicate IRInstructionData::getPredicate() const { "Can only get a predicate from a compare instruction"); if (RevisedPredicate) - return RevisedPredicate.getValue(); - + return RevisedPredicate.value(); + return cast<CmpInst>(Inst)->getPredicate(); } diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index cf8592c41eda..3fafc3057a13 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -56,10 +56,10 @@ static cl::opt<int> cl::desc("Scale to limit the cost of inline deferral"), cl::init(2), cl::Hidden); -static cl::opt<bool> AnnotateInlinePhase( - "annotate-inline-phase", cl::Hidden, cl::init(false), - cl::desc("If true, annotate inline advisor remarks " - "with LTO and pass information.")); +static cl::opt<bool> + AnnotateInlinePhase("annotate-inline-phase", cl::Hidden, cl::init(false), + cl::desc("If true, annotate inline advisor remarks " + "with LTO and pass information.")); extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats; @@ -514,8 +514,9 @@ void llvm::emitInlinedIntoBasedOnCost( InlineAdvisor::InlineAdvisor(Module &M, FunctionAnalysisManager &FAM, Optional<InlineContext> IC) : M(M), FAM(FAM), IC(IC), - AnnotatedInlinePassName((IC && AnnotateInlinePhase) ? llvm::AnnotateInlinePassName(*IC) - : DEBUG_TYPE) { + AnnotatedInlinePassName((IC && AnnotateInlinePhase) + ? 
llvm::AnnotateInlinePassName(*IC) + : DEBUG_TYPE) { if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { ImportedFunctionsStats = std::make_unique<ImportedFunctionsInliningStatistics>(); diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index e63497260e6e..9f8a5e472f01 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -131,6 +131,12 @@ static cl::opt<size_t> cl::desc("Do not inline functions with a stack size " "that exceeds the specified limit")); +static cl::opt<size_t> + RecurStackSizeThreshold("recursive-inline-max-stacksize", cl::Hidden, + cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller), + cl::desc("Do not inline recursive functions with a stack " + "size that exceeds the specified limit")); + static cl::opt<bool> OptComputeFullInlineCost( "inline-cost-full", cl::Hidden, cl::desc("Compute the full inline cost of a call site even when the cost " @@ -702,7 +708,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { assert(BFI && "BFI must be available"); auto ProfileCount = BFI->getBlockProfileCount(BB); assert(ProfileCount); - if (ProfileCount.getValue() == 0) + if (ProfileCount.value() == 0) ColdSize += Cost - CostAtBBStart; } @@ -827,7 +833,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB); assert(ProfileCount); - CurrentSavings *= ProfileCount.getValue(); + CurrentSavings *= ProfileCount.value(); CycleSavings += CurrentSavings; } @@ -1781,12 +1787,12 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // return min(A, B) if B is valid. auto MinIfValid = [](int A, Optional<int> B) { - return B ? std::min(A, B.getValue()) : A; + return B ? std::min(A, B.value()) : A; }; // return max(A, B) if B is valid. auto MaxIfValid = [](int A, Optional<int> B) { - return B ? std::max(A, B.getValue()) : A; + return B ? 
std::max(A, B.value()) : A; }; // Various bonus percentages. These are multiplied by Threshold to get the @@ -2444,8 +2450,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, // If the caller is a recursive function then we don't want to inline // functions which allocate a lot of stack space because it would increase // the caller stack usage dramatically. - if (IsCallerRecursive && - AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { + if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { auto IR = InlineResult::failure("recursive and allocates too much stack space"); if (ORE) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 013e4d6489fa..4691aebbdfe1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4849,12 +4849,6 @@ static Value *simplifyPHINode(PHINode *PN, ArrayRef<Value *> IncomingValues, return UndefValue::get(PN->getType()); if (HasUndefInput) { - // We cannot start executing a trapping constant expression on more control - // flow paths. - auto *C = dyn_cast<Constant>(CommonValue); - if (C && C->canTrap()) - return nullptr; - // If we have a PHI node like phi(X, undef, X), where X is defined by some // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. 
@@ -6117,8 +6111,8 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { Value *Op2 = Call->getArgOperand(2); auto *FPI = cast<ConstrainedFPIntrinsic>(Call); if (Value *V = simplifyFPOp({Op0, Op1, Op2}, {}, Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue())) + FPI->getExceptionBehavior().value(), + FPI->getRoundingMode().value())) return V; return nullptr; } @@ -6182,38 +6176,33 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } case Intrinsic::experimental_constrained_fadd: { auto *FPI = cast<ConstrainedFPIntrinsic>(Call); - return simplifyFAddInst(FPI->getArgOperand(0), FPI->getArgOperand(1), - FPI->getFastMathFlags(), Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue()); + return simplifyFAddInst( + FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), + Q, FPI->getExceptionBehavior().value(), FPI->getRoundingMode().value()); } case Intrinsic::experimental_constrained_fsub: { auto *FPI = cast<ConstrainedFPIntrinsic>(Call); - return simplifyFSubInst(FPI->getArgOperand(0), FPI->getArgOperand(1), - FPI->getFastMathFlags(), Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue()); + return simplifyFSubInst( + FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), + Q, FPI->getExceptionBehavior().value(), FPI->getRoundingMode().value()); } case Intrinsic::experimental_constrained_fmul: { auto *FPI = cast<ConstrainedFPIntrinsic>(Call); - return simplifyFMulInst(FPI->getArgOperand(0), FPI->getArgOperand(1), - FPI->getFastMathFlags(), Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue()); + return simplifyFMulInst( + FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), + Q, FPI->getExceptionBehavior().value(), FPI->getRoundingMode().value()); } case Intrinsic::experimental_constrained_fdiv: { auto *FPI = cast<ConstrainedFPIntrinsic>(Call); - return 
simplifyFDivInst(FPI->getArgOperand(0), FPI->getArgOperand(1), - FPI->getFastMathFlags(), Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue()); + return simplifyFDivInst( + FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), + Q, FPI->getExceptionBehavior().value(), FPI->getRoundingMode().value()); } case Intrinsic::experimental_constrained_frem: { auto *FPI = cast<ConstrainedFPIntrinsic>(Call); - return simplifyFRemInst(FPI->getArgOperand(0), FPI->getArgOperand(1), - FPI->getFastMathFlags(), Q, - FPI->getExceptionBehavior().getValue(), - FPI->getRoundingMode().getValue()); + return simplifyFRemInst( + FPI->getArgOperand(0), FPI->getArgOperand(1), FPI->getFastMathFlags(), + Q, FPI->getExceptionBehavior().value(), FPI->getRoundingMode().value()); } default: return nullptr; diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 8a8e9e923b7c..d49b20798c82 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -921,7 +921,7 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueCast( if (!LHSRes) // More work to do before applying this transfer rule. return None; - const ConstantRange &LHSRange = LHSRes.getValue(); + const ConstantRange &LHSRange = LHSRes.value(); const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth(); @@ -946,8 +946,8 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueBinaryOpImpl( // More work to do before applying this transfer rule. 
return None; - const ConstantRange &LHSRange = LHSRes.getValue(); - const ConstantRange &RHSRange = RHSRes.getValue(); + const ConstantRange &LHSRange = LHSRes.value(); + const ConstantRange &RHSRange = RHSRes.value(); return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange)); } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index bc1d82cf1480..938d950e6da7 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -405,7 +405,10 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Type *Ty, Align Alignment, Instruction *ScanFrom, const DominatorTree *DT, const TargetLibraryInfo *TLI) { - APInt Size(DL.getIndexTypeSizeInBits(V->getType()), DL.getTypeStoreSize(Ty)); + TypeSize TySize = DL.getTypeStoreSize(Ty); + if (TySize.isScalable()) + return false; + APInt Size(DL.getIndexTypeSizeInBits(V->getType()), TySize.getFixedValue()); return isSafeToLoadUnconditionally(V, Alignment, Size, DL, ScanFrom, DT, TLI); } diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 91501b04448e..f5b121c98ec4 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -501,10 +501,10 @@ Optional<StringRef> llvm::getAllocationFamily(const Value *I, return None; const auto AllocData = getAllocationDataForFunction(Callee, AnyAlloc, TLI); if (AllocData) - return mangledNameForMallocFamily(AllocData.getValue().Family); + return mangledNameForMallocFamily(AllocData.value().Family); const auto FreeData = getFreeFunctionDataForFunction(Callee, TLIFn); if (FreeData) - return mangledNameForMallocFamily(FreeData.getValue().Family); + return mangledNameForMallocFamily(FreeData.value().Family); return None; } diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 5cff986245b9..ad8322d7bd79 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -493,7 +493,7 @@ static V getOrCreateCachedOptional(K Key, 
DenseMap<K, Optional<V>> &Map, Optional<V> &OptVal = Map[Key]; if (!OptVal) OptVal = Fn(std::forward<ArgsTy>(args)...); - return OptVal.getValue(); + return OptVal.value(); } const BasicBlock * diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index 9d5fa6d0a41b..64844f534332 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -279,19 +279,19 @@ ProfileSummaryInfo::computeThreshold(int PercentileCutoff) const { } bool ProfileSummaryInfo::hasHugeWorkingSetSize() const { - return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue(); + return HasHugeWorkingSetSize && HasHugeWorkingSetSize.value(); } bool ProfileSummaryInfo::hasLargeWorkingSetSize() const { - return HasLargeWorkingSetSize && HasLargeWorkingSetSize.getValue(); + return HasLargeWorkingSetSize && HasLargeWorkingSetSize.value(); } bool ProfileSummaryInfo::isHotCount(uint64_t C) const { - return HotCountThreshold && C >= HotCountThreshold.getValue(); + return HotCountThreshold && C >= HotCountThreshold.value(); } bool ProfileSummaryInfo::isColdCount(uint64_t C) const { - return ColdCountThreshold && C <= ColdCountThreshold.getValue(); + return ColdCountThreshold && C <= ColdCountThreshold.value(); } template <bool isHot> @@ -299,9 +299,9 @@ bool ProfileSummaryInfo::isHotOrColdCountNthPercentile(int PercentileCutoff, uint64_t C) const { auto CountThreshold = computeThreshold(PercentileCutoff); if (isHot) - return CountThreshold && C >= CountThreshold.getValue(); + return CountThreshold && C >= CountThreshold.value(); else - return CountThreshold && C <= CountThreshold.getValue(); + return CountThreshold && C <= CountThreshold.value(); } bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 207f4df79e45..f61806bd1dad 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ 
b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2319,9 +2319,13 @@ bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, return A == B; } -std::pair<SCEV::NoWrapFlags, bool /*Deduced*/> +Optional<SCEV::NoWrapFlags> ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( const OverflowingBinaryOperator *OBO) { + // It cannot be done any better. + if (OBO->hasNoUnsignedWrap() && OBO->hasNoSignedWrap()) + return None; + SCEV::NoWrapFlags Flags = SCEV::NoWrapFlags::FlagAnyWrap; if (OBO->hasNoUnsignedWrap()) @@ -2331,13 +2335,10 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( bool Deduced = false; - if (OBO->hasNoUnsignedWrap() && OBO->hasNoSignedWrap()) - return {Flags, Deduced}; - if (OBO->getOpcode() != Instruction::Add && OBO->getOpcode() != Instruction::Sub && OBO->getOpcode() != Instruction::Mul) - return {Flags, Deduced}; + return None; const SCEV *LHS = getSCEV(OBO->getOperand(0)); const SCEV *RHS = getSCEV(OBO->getOperand(1)); @@ -2356,7 +2357,9 @@ ScalarEvolution::getStrengthenedNoWrapFlagsFromBinOp( Deduced = true; } - return {Flags, Deduced}; + if (Deduced) + return Flags; + return None; } // We're trying to construct a SCEV of type `Type' with `Ops' as operands and @@ -4835,7 +4838,7 @@ public: Optional<const SCEV *> Res = compareWithBackedgeCondition(SI->getCondition()); if (Res) { - bool IsOne = cast<SCEVConstant>(Res.getValue())->getValue()->isOne(); + bool IsOne = cast<SCEVConstant>(Res.value())->getValue()->isOne(); Result = SE.getSCEV(IsOne ? SI->getTrueValue() : SI->getFalseValue()); } break; @@ -4843,7 +4846,7 @@ public: default: { Optional<const SCEV *> Res = compareWithBackedgeCondition(I); if (Res) - Result = Res.getValue(); + Result = Res.value(); break; } } @@ -6583,8 +6586,8 @@ ScalarEvolution::getRangeRef(const SCEV *S, // Check if the IR explicitly contains !range metadata. 
Optional<ConstantRange> MDRange = GetRangeFromMetadata(U->getValue()); if (MDRange) - ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue(), - RangeType); + ConservativeResult = + ConservativeResult.intersectWith(MDRange.value(), RangeType); // Use facts about recurrences in the underlying IR. Note that add // recurrences are AddRecExprs and thus don't hit this path. This @@ -7365,6 +7368,8 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops) { Ops.push_back(II->getArgOperand(1)); return nullptr; case Intrinsic::start_loop_iterations: + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: Ops.push_back(II->getArgOperand(0)); return nullptr; default: @@ -7816,8 +7821,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { return getAddExpr(ClampedX, Y, SCEV::FlagNUW); } case Intrinsic::start_loop_iterations: - // A start_loop_iterations is just equivalent to the first operand for - // SCEV purposes. + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + // A start_loop_iterations or llvm.annotation or llvm.ptr.annotation is + // just equivalent to the first operand for SCEV purposes. 
return getSCEV(II->getArgOperand(0)); default: break; @@ -9517,14 +9524,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) { } return C; } - case scUDivExpr: { - const SCEVUDivExpr *SU = cast<SCEVUDivExpr>(V); - if (Constant *LHS = BuildConstantFromSCEV(SU->getLHS())) - if (Constant *RHS = BuildConstantFromSCEV(SU->getRHS())) - if (LHS->getType() == RHS->getType()) - return ConstantExpr::getUDiv(LHS, RHS); - return nullptr; - } + case scUDivExpr: case scSMaxExpr: case scUMaxExpr: case scSMinExpr: @@ -10632,7 +10632,7 @@ ScalarEvolution::getMonotonicPredicateType(const SCEVAddRecExpr *LHS, getMonotonicPredicateTypeImpl(LHS, ICmpInst::getSwappedPredicate(Pred)); assert(ResultSwapped && "should be able to analyze both!"); - assert(ResultSwapped.getValue() != Result.getValue() && + assert(ResultSwapped.value() != Result.value() && "monotonicity should flip as we flip the predicate"); } #endif @@ -11808,7 +11808,7 @@ bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred, const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB)); // Make sure L does not refer to a value from a potentially previous // iteration of a loop. 
- if (!properlyDominates(L, IncBB)) + if (!properlyDominates(L, LBB)) return false; if (!ProvedEasily(L, RHS)) return false; diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp index 203858c1cf06..682fc095b0e9 100644 --- a/llvm/lib/Analysis/TFUtils.cpp +++ b/llvm/lib/Analysis/TFUtils.cpp @@ -18,7 +18,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/JSON.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" @@ -49,19 +48,17 @@ using TFStatusPtr = std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)>; struct TFInitializer { TFInitializer() { - assert(!IsInitialized && "TFInitialized should be called only once"); int Argc = 1; const char *Name = ""; const char **NamePtr = &Name; TF_InitMain(Name, &Argc, const_cast<char ***>(&NamePtr)); - IsInitialized = true; } - bool IsInitialized = false; }; -llvm::ManagedStatic<TFInitializer> TFLibInitializer; - -bool ensureInitTF() { return TFLibInitializer->IsInitialized; } +bool ensureInitTF() { + static TFInitializer TFLibInitializer; + return true; +} TFGraphPtr createTFGraph() { return TFGraphPtr(TF_NewGraph(), &TF_DeleteGraph); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 66f61961d01b..6e34a8303c08 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -298,7 +298,7 @@ bool TargetTransformInfo::preferPredicateOverEpilogue( return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } -bool TargetTransformInfo::emitGetActiveLaneMask() const { +PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const { return TTIImpl->emitGetActiveLaneMask(); } diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index 9bcbe4a4cc1e..560f46d39d0d 100644 --- 
a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -303,24 +303,27 @@ public: /// given offset. Update the offset to be relative to the field type. TBAAStructTypeNode getField(uint64_t &Offset) const { bool NewFormat = isNewFormat(); + const ArrayRef<MDOperand> Operands(Node->op_begin(), Node->op_end()); + const unsigned NumOperands = Operands.size(); + if (NewFormat) { // New-format root and scalar type nodes have no fields. - if (Node->getNumOperands() < 6) + if (NumOperands < 6) return TBAAStructTypeNode(); } else { // Parent can be omitted for the root node. - if (Node->getNumOperands() < 2) + if (NumOperands < 2) return TBAAStructTypeNode(); // Fast path for a scalar type node and a struct type node with a single // field. - if (Node->getNumOperands() <= 3) { - uint64_t Cur = Node->getNumOperands() == 2 - ? 0 - : mdconst::extract<ConstantInt>(Node->getOperand(2)) - ->getZExtValue(); + if (NumOperands <= 3) { + uint64_t Cur = + NumOperands == 2 + ? 0 + : mdconst::extract<ConstantInt>(Operands[2])->getZExtValue(); Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); + MDNode *P = dyn_cast_or_null<MDNode>(Operands[1]); if (!P) return TBAAStructTypeNode(); return TBAAStructTypeNode(P); @@ -332,10 +335,11 @@ public: unsigned FirstFieldOpNo = NewFormat ? 3 : 1; unsigned NumOpsPerField = NewFormat ? 3 : 2; unsigned TheIdx = 0; - for (unsigned Idx = FirstFieldOpNo; Idx < Node->getNumOperands(); + + for (unsigned Idx = FirstFieldOpNo; Idx < NumOperands; Idx += NumOpsPerField) { - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1)) - ->getZExtValue(); + uint64_t Cur = + mdconst::extract<ConstantInt>(Operands[Idx + 1])->getZExtValue(); if (Cur > Offset) { assert(Idx >= FirstFieldOpNo + NumOpsPerField && "TBAAStructTypeNode::getField should have an offset match!"); @@ -345,11 +349,11 @@ public: } // Move along the last field. 
if (TheIdx == 0) - TheIdx = Node->getNumOperands() - NumOpsPerField; - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1)) - ->getZExtValue(); + TheIdx = NumOperands - NumOpsPerField; + uint64_t Cur = + mdconst::extract<ConstantInt>(Operands[TheIdx + 1])->getZExtValue(); Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx)); + MDNode *P = dyn_cast_or_null<MDNode>(Operands[TheIdx]); if (!P) return TBAAStructTypeNode(); return TBAAStructTypeNode(P); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 05d5e47bb8d7..add2d427e05b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4679,27 +4679,22 @@ bool llvm::mustSuppressSpeculation(const LoadInst &LI) { F.hasFnAttribute(Attribute::SanitizeHWAddress); } - -bool llvm::isSafeToSpeculativelyExecute(const Value *V, +bool llvm::isSafeToSpeculativelyExecute(const Instruction *Inst, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { - const Operator *Inst = dyn_cast<Operator>(V); - if (!Inst) - return false; - return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI, DT, TLI); + return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI, + DT, TLI); } -bool llvm::isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, - const Operator *Inst, - const Instruction *CtxI, - const DominatorTree *DT, - const TargetLibraryInfo *TLI) { +bool llvm::isSafeToSpeculativelyExecuteWithOpcode( + unsigned Opcode, const Instruction *Inst, const Instruction *CtxI, + const DominatorTree *DT, const TargetLibraryInfo *TLI) { #ifndef NDEBUG if (Inst->getOpcode() != Opcode) { // Check that the operands are actually compatible with the Opcode override. 
auto hasEqualReturnAndLeadingOperandTypes = - [](const Operator *Inst, unsigned NumLeadingOperands) { + [](const Instruction *Inst, unsigned NumLeadingOperands) { if (Inst->getNumOperands() < NumLeadingOperands) return false; const Type *ExpectedType = Inst->getType(); @@ -4715,11 +4710,6 @@ bool llvm::isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, } #endif - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i))) - if (C->canTrap()) - return false; - switch (Opcode) { default: return true; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index f863a1ffad3a..894680cda1fc 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1502,7 +1502,7 @@ void VFABI::getVectorVariantNames( LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << S << "'\n"); Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(S, *(CI.getModule())); assert(Info && "Invalid name for a VFABI variant."); - assert(CI.getModule()->getFunction(Info.getValue().VectorName) && + assert(CI.getModule()->getFunction(Info.value().VectorName) && "Vector function is missing."); #endif VariantMappings.push_back(std::string(S)); diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 30e6f8599208..c9a982693fa7 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -582,7 +582,6 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(no_sanitize_address); KEYWORD(no_sanitize_hwaddress); - KEYWORD(no_sanitize_memtag); KEYWORD(sanitize_address_dyninit); KEYWORD(ccc); @@ -661,7 +660,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une); KEYWORD(xchg); KEYWORD(nand); KEYWORD(max); KEYWORD(min); KEYWORD(umax); - KEYWORD(umin); + KEYWORD(umin); KEYWORD(fmax); KEYWORD(fmin); KEYWORD(vscale); KEYWORD(x); diff --git a/llvm/lib/AsmParser/LLParser.cpp 
b/llvm/lib/AsmParser/LLParser.cpp index a1cdeac2b47f..fd502eded0a0 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -456,10 +456,15 @@ bool LLParser::parseTargetDefinition() { return false; case lltok::kw_datalayout: Lex.Lex(); - if (parseToken(lltok::equal, "expected '=' after target datalayout") || - parseStringConstant(Str)) + if (parseToken(lltok::equal, "expected '=' after target datalayout")) + return true; + LocTy Loc = Lex.getLoc(); + if (parseStringConstant(Str)) return true; - M->setDataLayout(Str); + Expected<DataLayout> MaybeDL = DataLayout::parse(Str); + if (!MaybeDL) + return error(Loc, toString(MaybeDL.takeError())); + M->setDataLayout(MaybeDL.get()); return false; } } @@ -1107,7 +1112,7 @@ static bool isSanitizer(lltok::Kind Kind) { switch (Kind) { case lltok::kw_no_sanitize_address: case lltok::kw_no_sanitize_hwaddress: - case lltok::kw_no_sanitize_memtag: + case lltok::kw_sanitize_memtag: case lltok::kw_sanitize_address_dyninit: return true; default: @@ -1128,8 +1133,8 @@ bool LLParser::parseSanitizer(GlobalVariable *GV) { case lltok::kw_no_sanitize_hwaddress: Meta.NoHWAddress = true; break; - case lltok::kw_no_sanitize_memtag: - Meta.NoMemtag = true; + case lltok::kw_sanitize_memtag: + Meta.Memtag = true; break; case lltok::kw_sanitize_address_dyninit: Meta.IsDynInit = true; @@ -3474,32 +3479,26 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { } case lltok::kw_extractvalue: return error(ID.Loc, "extractvalue constexprs are no longer supported"); - case lltok::kw_insertvalue: { - Lex.Lex(); - Constant *Val0, *Val1; - SmallVector<unsigned, 4> Indices; - if (parseToken(lltok::lparen, "expected '(' in insertvalue constantexpr") || - parseGlobalTypeAndValue(Val0) || - parseToken(lltok::comma, - "expected comma in insertvalue constantexpr") || - parseGlobalTypeAndValue(Val1) || parseIndexList(Indices) || - parseToken(lltok::rparen, "expected ')' in insertvalue constantexpr")) - 
return true; - if (!Val0->getType()->isAggregateType()) - return error(ID.Loc, "insertvalue operand must be aggregate type"); - Type *IndexedType = - ExtractValueInst::getIndexedType(Val0->getType(), Indices); - if (!IndexedType) - return error(ID.Loc, "invalid indices for insertvalue"); - if (IndexedType != Val1->getType()) - return error(ID.Loc, "insertvalue operand and field disagree in type: '" + - getTypeString(Val1->getType()) + - "' instead of '" + getTypeString(IndexedType) + - "'"); - ID.ConstantVal = ConstantExpr::getInsertValue(Val0, Val1, Indices); - ID.Kind = ValID::t_Constant; - return false; - } + case lltok::kw_insertvalue: + return error(ID.Loc, "insertvalue constexprs are no longer supported"); + case lltok::kw_udiv: + return error(ID.Loc, "udiv constexprs are no longer supported"); + case lltok::kw_sdiv: + return error(ID.Loc, "sdiv constexprs are no longer supported"); + case lltok::kw_urem: + return error(ID.Loc, "urem constexprs are no longer supported"); + case lltok::kw_srem: + return error(ID.Loc, "srem constexprs are no longer supported"); + case lltok::kw_fadd: + return error(ID.Loc, "fadd constexprs are no longer supported"); + case lltok::kw_fsub: + return error(ID.Loc, "fsub constexprs are no longer supported"); + case lltok::kw_fmul: + return error(ID.Loc, "fmul constexprs are no longer supported"); + case lltok::kw_fdiv: + return error(ID.Loc, "fdiv constexprs are no longer supported"); + case lltok::kw_frem: + return error(ID.Loc, "frem constexprs are no longer supported"); case lltok::kw_icmp: case lltok::kw_fcmp: { unsigned PredVal, Opc = Lex.getUIntVal(); @@ -3559,17 +3558,8 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { } // Binary Operators. 
case lltok::kw_add: - case lltok::kw_fadd: case lltok::kw_sub: - case lltok::kw_fsub: case lltok::kw_mul: - case lltok::kw_fmul: - case lltok::kw_udiv: - case lltok::kw_sdiv: - case lltok::kw_fdiv: - case lltok::kw_urem: - case lltok::kw_srem: - case lltok::kw_frem: case lltok::kw_shl: case lltok::kw_lshr: case lltok::kw_ashr: { @@ -5398,8 +5388,10 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V, V = PFS->getVal(ID.StrVal, Ty, ID.Loc); return V == nullptr; case ValID::t_InlineAsm: { - if (!ID.FTy || !InlineAsm::Verify(ID.FTy, ID.StrVal2)) + if (!ID.FTy) return error(ID.Loc, "invalid type for inline asm constraint string"); + if (Error Err = InlineAsm::verify(ID.FTy, ID.StrVal2)) + return error(ID.Loc, toString(std::move(Err))); V = InlineAsm::get( ID.FTy, ID.StrVal, ID.StrVal2, ID.UIntVal & 1, (ID.UIntVal >> 1) & 1, InlineAsm::AsmDialect((ID.UIntVal >> 2) & 1), (ID.UIntVal >> 3) & 1); @@ -7483,6 +7475,14 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) { Operation = AtomicRMWInst::FSub; IsFP = true; break; + case lltok::kw_fmax: + Operation = AtomicRMWInst::FMax; + IsFP = true; + break; + case lltok::kw_fmin: + Operation = AtomicRMWInst::FMin; + IsFP = true; + break; } Lex.Lex(); // Eat the operation. 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 93b07fc0db30..8d5a2555f9af 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -69,7 +69,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" @@ -1243,6 +1242,8 @@ static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val) { case bitc::RMW_UMIN: return AtomicRMWInst::UMin; case bitc::RMW_FADD: return AtomicRMWInst::FAdd; case bitc::RMW_FSUB: return AtomicRMWInst::FSub; + case bitc::RMW_FMAX: return AtomicRMWInst::FMax; + case bitc::RMW_FMIN: return AtomicRMWInst::FMin; } } @@ -1384,6 +1385,9 @@ static bool isConstExprSupported(uint8_t Opcode) { if (Opcode >= BitcodeConstant::FirstSpecialOpcode) return true; + if (Instruction::isBinaryOp(Opcode)) + return ConstantExpr::isSupportedBinOp(Opcode); + return !ExpandConstantExprs; } @@ -1851,6 +1855,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::DisableSanitizerInstrumentation; case bitc::ATTR_KIND_ELEMENTTYPE: return Attribute::ElementType; + case bitc::ATTR_KIND_FNRETTHUNK_EXTERN: + return Attribute::FnRetThunkExtern; case bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY: return Attribute::InaccessibleMemOnly; case bitc::ATTR_KIND_INACCESSIBLEMEM_OR_ARGMEMONLY: @@ -3672,7 +3678,7 @@ GlobalValue::SanitizerMetadata deserializeSanitizerMetadata(unsigned V) { if (V & (1 << 1)) Meta.NoHWAddress = true; if (V & (1 << 2)) - Meta.NoMemtag = true; + Meta.Memtag = true; if (V & (1 << 3)) Meta.IsDynInit = true; return Meta; @@ -7441,10 +7447,9 @@ class BitcodeErrorCategoryType : public std::error_category { } // end anonymous namespace -static ManagedStatic<BitcodeErrorCategoryType> ErrorCategory; - const std::error_category 
&llvm::BitcodeErrorCategory() { - return *ErrorCategory; + static BitcodeErrorCategoryType ErrorCategory; + return ErrorCategory; } static Expected<StringRef> readBlobInRecord(BitstreamCursor &Stream, diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 941ed808bab1..590562ce2796 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -577,6 +577,8 @@ static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op) { case AtomicRMWInst::UMin: return bitc::RMW_UMIN; case AtomicRMWInst::FAdd: return bitc::RMW_FADD; case AtomicRMWInst::FSub: return bitc::RMW_FSUB; + case AtomicRMWInst::FMax: return bitc::RMW_FMAX; + case AtomicRMWInst::FMin: return bitc::RMW_FMIN; } } @@ -632,6 +634,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_COLD; case Attribute::DisableSanitizerInstrumentation: return bitc::ATTR_KIND_DISABLE_SANITIZER_INSTRUMENTATION; + case Attribute::FnRetThunkExtern: + return bitc::ATTR_KIND_FNRETTHUNK_EXTERN; case Attribute::Hot: return bitc::ATTR_KIND_HOT; case Attribute::ElementType: @@ -1230,7 +1234,7 @@ static_assert(sizeof(GlobalValue::SanitizerMetadata) <= sizeof(unsigned), static unsigned serializeSanitizerMetadata(const GlobalValue::SanitizerMetadata &Meta) { return Meta.NoAddress | (Meta.NoHWAddress << 1) | - (Meta.NoMemtag << 2) | (Meta.IsDynInit << 3); + (Meta.Memtag << 2) | (Meta.IsDynInit << 3); } /// Emit top-level description of module, including target triple, inline asm, @@ -2674,9 +2678,6 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, Record.push_back(VE.getValueID(C->getOperand(1))); Record.push_back(CE->getPredicate()); break; - case Instruction::InsertValue: - report_fatal_error("insertvalue constexprs not supported"); - break; } } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) { Code = bitc::CST_CODE_BLOCKADDRESS; diff --git 
a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 4a31bf85446b..94612a51d2e1 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1816,6 +1816,11 @@ void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) { if (TM.getTargetTriple().isOSBinFormatXCOFF()) { assert(MAI->hasVisibilityOnlyWithLinkage() && "Visibility should be handled with emitLinkage() on AIX."); + + // Linkage for alias of global variable has been emitted. + if (isa<GlobalVariable>(GA.getAliaseeObject())) + return; + emitLinkage(&GA, Name); // If it's a function, also emit linkage for aliases of function entry // point. @@ -2860,7 +2865,8 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C, AsmPrinter &AP, const Constant *BaseCV = nullptr, - uint64_t Offset = 0); + uint64_t Offset = 0, + AsmPrinter::AliasMapTy *AliasList = nullptr); static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP); static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP); @@ -2914,9 +2920,21 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { return -1; } -static void emitGlobalConstantDataSequential(const DataLayout &DL, - const ConstantDataSequential *CDS, - AsmPrinter &AP) { +static void emitGlobalAliasInline(AsmPrinter &AP, uint64_t Offset, + AsmPrinter::AliasMapTy *AliasList) { + if (AliasList) { + auto AliasIt = AliasList->find(Offset); + if (AliasIt != AliasList->end()) { + for (const GlobalAlias *GA : AliasIt->second) + AP.OutStreamer->emitLabel(AP.getSymbol(GA)); + AliasList->erase(Offset); + } + } +} + +static void emitGlobalConstantDataSequential( + const DataLayout &DL, const ConstantDataSequential *CDS, AsmPrinter &AP, + AsmPrinter::AliasMapTy *AliasList) { // See if we can aggregate this into a .fill, if so, emit it as such. 
int Value = isRepeatedByteSequence(CDS, DL); if (Value != -1) { @@ -2933,17 +2951,20 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, // Otherwise, emit the values in successive locations. unsigned ElementByteSize = CDS->getElementByteSize(); if (isa<IntegerType>(CDS->getElementType())) { - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) { + emitGlobalAliasInline(AP, ElementByteSize * I, AliasList); if (AP.isVerbose()) AP.OutStreamer->getCommentOS() - << format("0x%" PRIx64 "\n", CDS->getElementAsInteger(i)); - AP.OutStreamer->emitIntValue(CDS->getElementAsInteger(i), + << format("0x%" PRIx64 "\n", CDS->getElementAsInteger(I)); + AP.OutStreamer->emitIntValue(CDS->getElementAsInteger(I), ElementByteSize); } } else { Type *ET = CDS->getElementType(); - for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) { + emitGlobalAliasInline(AP, ElementByteSize * I, AliasList); emitGlobalConstantFP(CDS->getElementAsAPFloat(I), ET, AP); + } } unsigned Size = DL.getTypeAllocSize(CDS->getType()); @@ -2956,7 +2977,8 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, static void emitGlobalConstantArray(const DataLayout &DL, const ConstantArray *CA, AsmPrinter &AP, - const Constant *BaseCV, uint64_t Offset) { + const Constant *BaseCV, uint64_t Offset, + AsmPrinter::AliasMapTy *AliasList) { // See if we can aggregate some values. Make sure it can be // represented as a series of bytes of the constant value. 
int Value = isRepeatedByteSequence(CA, DL); @@ -2964,44 +2986,75 @@ static void emitGlobalConstantArray(const DataLayout &DL, if (Value != -1) { uint64_t Bytes = DL.getTypeAllocSize(CA->getType()); AP.OutStreamer->emitFill(Bytes, Value); - } - else { - for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) { - emitGlobalConstantImpl(DL, CA->getOperand(i), AP, BaseCV, Offset); - Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType()); + } else { + for (unsigned I = 0, E = CA->getNumOperands(); I != E; ++I) { + emitGlobalConstantImpl(DL, CA->getOperand(I), AP, BaseCV, Offset, + AliasList); + Offset += DL.getTypeAllocSize(CA->getOperand(I)->getType()); } } } +static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP); + static void emitGlobalConstantVector(const DataLayout &DL, - const ConstantVector *CV, AsmPrinter &AP) { - for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i) - emitGlobalConstantImpl(DL, CV->getOperand(i), AP); + const ConstantVector *CV, AsmPrinter &AP, + AsmPrinter::AliasMapTy *AliasList) { + Type *ElementType = CV->getType()->getElementType(); + uint64_t ElementSizeInBits = DL.getTypeSizeInBits(ElementType); + uint64_t ElementAllocSizeInBits = DL.getTypeAllocSizeInBits(ElementType); + uint64_t EmittedSize; + if (ElementSizeInBits != ElementAllocSizeInBits) { + // If the allocation size of an element is different from the size in bits, + // printing each element separately will insert incorrect padding. + // + // The general algorithm here is complicated; instead of writing it out + // here, just use the existing code in ConstantFolding. 
+ Type *IntT = + IntegerType::get(CV->getContext(), DL.getTypeSizeInBits(CV->getType())); + ConstantInt *CI = dyn_cast_or_null<ConstantInt>(ConstantFoldConstant( + ConstantExpr::getBitCast(const_cast<ConstantVector *>(CV), IntT), DL)); + if (!CI) { + report_fatal_error( + "Cannot lower vector global with unusual element type"); + } + emitGlobalAliasInline(AP, 0, AliasList); + emitGlobalConstantLargeInt(CI, AP); + EmittedSize = DL.getTypeStoreSize(CV->getType()); + } else { + for (unsigned I = 0, E = CV->getType()->getNumElements(); I != E; ++I) { + emitGlobalAliasInline(AP, DL.getTypeAllocSize(CV->getType()) * I, AliasList); + emitGlobalConstantImpl(DL, CV->getOperand(I), AP); + } + EmittedSize = + DL.getTypeAllocSize(ElementType) * CV->getType()->getNumElements(); + } unsigned Size = DL.getTypeAllocSize(CV->getType()); - unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) * - CV->getType()->getNumElements(); if (unsigned Padding = Size - EmittedSize) AP.OutStreamer->emitZeros(Padding); } static void emitGlobalConstantStruct(const DataLayout &DL, const ConstantStruct *CS, AsmPrinter &AP, - const Constant *BaseCV, uint64_t Offset) { + const Constant *BaseCV, uint64_t Offset, + AsmPrinter::AliasMapTy *AliasList) { // Print the fields in successive locations. Pad to align if needed! unsigned Size = DL.getTypeAllocSize(CS->getType()); const StructLayout *Layout = DL.getStructLayout(CS->getType()); uint64_t SizeSoFar = 0; - for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) { - const Constant *Field = CS->getOperand(i); + for (unsigned I = 0, E = CS->getNumOperands(); I != E; ++I) { + const Constant *Field = CS->getOperand(I); // Print the actual field value. - emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar); + emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar, + AliasList); // Check if padding is needed and insert one or more 0s. 
uint64_t FieldSize = DL.getTypeAllocSize(Field->getType()); - uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1)) - - Layout->getElementOffset(i)) - FieldSize; + uint64_t PadSize = ((I == E - 1 ? Size : Layout->getElementOffset(I + 1)) - + Layout->getElementOffset(I)) - + FieldSize; SizeSoFar += FieldSize + PadSize; // Insert padding - this may include padding to increase the size of the @@ -3211,7 +3264,9 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, AsmPrinter &AP, const Constant *BaseCV, - uint64_t Offset) { + uint64_t Offset, + AsmPrinter::AliasMapTy *AliasList) { + emitGlobalAliasInline(AP, Offset, AliasList); uint64_t Size = DL.getTypeAllocSize(CV->getType()); // Globals with sub-elements such as combinations of arrays and structs @@ -3251,13 +3306,13 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, } if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV)) - return emitGlobalConstantDataSequential(DL, CDS, AP); + return emitGlobalConstantDataSequential(DL, CDS, AP, AliasList); if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) - return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset); + return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset, AliasList); if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) - return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset); + return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset, AliasList); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. 
of @@ -3276,7 +3331,7 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, } if (const ConstantVector *V = dyn_cast<ConstantVector>(CV)) - return emitGlobalConstantVector(DL, V, AP); + return emitGlobalConstantVector(DL, V, AP, AliasList); // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it // thread the streamer with EmitValue. @@ -3292,15 +3347,21 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, } /// EmitGlobalConstant - Print a general LLVM constant to the .s file. -void AsmPrinter::emitGlobalConstant(const DataLayout &DL, const Constant *CV) { +void AsmPrinter::emitGlobalConstant(const DataLayout &DL, const Constant *CV, + AliasMapTy *AliasList) { uint64_t Size = DL.getTypeAllocSize(CV->getType()); if (Size) - emitGlobalConstantImpl(DL, CV, *this); + emitGlobalConstantImpl(DL, CV, *this, nullptr, 0, AliasList); else if (MAI->hasSubsectionsViaSymbols()) { // If the global has zero size, emit a single byte so that two labels don't // look like they are at the same location. 
OutStreamer->emitIntValue(0, 1); } + if (!AliasList) + return; + for (const auto &AliasPair : *AliasList) + report_fatal_error("Aliases with offset " + Twine(AliasPair.first) + + " were not emitted."); } void AsmPrinter::emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def index 28a02390fccb..c872d0dd2dfa 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def +++ b/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def @@ -51,5 +51,5 @@ HANDLE_DIE_HASH_ATTR(DW_AT_virtuality) HANDLE_DIE_HASH_ATTR(DW_AT_visibility) HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location) HANDLE_DIE_HASH_ATTR(DW_AT_type) - +HANDLE_DIE_HASH_ATTR(DW_AT_linkage_name) #undef HANDLE_DIE_HASH_ATTR diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 5ce6fbb5f647..ad9dc517539a 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1646,6 +1646,8 @@ static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) { case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: // No atomic libcalls are available for max/min/umax/umin. diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 7883a48d121c..59932a542bbc 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -120,8 +120,7 @@ static bool maySpeculateLanes(VPIntrinsic &VPI) { // Fallback to whether the intrinsic is speculatable. 
Optional<unsigned> OpcOpt = VPI.getFunctionalOpcode(); unsigned FunctionalOpc = OpcOpt.value_or((unsigned)Instruction::Call); - return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc, - cast<Operator>(&VPI)); + return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc, &VPI); } //// } Helpers diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 081c8b125f17..b06043fb4c31 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -500,6 +500,12 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, LLT DstTy = MRI.getType(DstRegs[0]); LLT LCMTy = getCoverTy(SrcTy, PartTy); + if (PartTy.isVector() && LCMTy == PartTy) { + assert(DstRegs.size() == 1); + B.buildPadVectorWithUndefElements(DstRegs[0], SrcReg); + return; + } + const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); unsigned CoveringSize = LCMTy.getSizeInBits(); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 2c94f87804ac..ad0c0c8315dc 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -697,14 +697,16 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, return false; Register SrcReg = MI.getOperand(1).getReg(); - GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI); - if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) || - !LoadMI->isSimple()) + // Don't use getOpcodeDef() here since intermediate instructions may have + // multiple users. 
+ GAnyLoad *LoadMI = dyn_cast<GAnyLoad>(MRI.getVRegDef(SrcReg)); + if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg())) return false; Register LoadReg = LoadMI->getDstReg(); - LLT LoadTy = MRI.getType(LoadReg); + LLT RegTy = MRI.getType(LoadReg); Register PtrReg = LoadMI->getPointerReg(); + unsigned RegSize = RegTy.getSizeInBits(); uint64_t LoadSizeBits = LoadMI->getMemSizeInBits(); unsigned MaskSizeBits = MaskVal.countTrailingOnes(); @@ -715,7 +717,7 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, // If the mask covers the whole destination register, there's nothing to // extend - if (MaskSizeBits >= LoadTy.getSizeInBits()) + if (MaskSizeBits >= RegSize) return false; // Most targets cannot deal with loads of size < 8 and need to re-legalize to @@ -725,17 +727,26 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, const MachineMemOperand &MMO = LoadMI->getMMO(); LegalityQuery::MemDesc MemDesc(MMO); - MemDesc.MemoryTy = LLT::scalar(MaskSizeBits); + + // Don't modify the memory access size if this is atomic/volatile, but we can + // still adjust the opcode to indicate the high bit behavior. + if (LoadMI->isSimple()) + MemDesc.MemoryTy = LLT::scalar(MaskSizeBits); + else if (LoadSizeBits > MaskSizeBits || LoadSizeBits == RegSize) + return false; + + // TODO: Could check if it's legal with the reduced or original memory size. 
if (!isLegalOrBeforeLegalizer( - {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}})) + {TargetOpcode::G_ZEXTLOAD, {RegTy, MRI.getType(PtrReg)}, {MemDesc}})) return false; MatchInfo = [=](MachineIRBuilder &B) { B.setInstrAndDebugLoc(*LoadMI); auto &MF = B.getMF(); auto PtrInfo = MMO.getPointerInfo(); - auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8); + auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MemDesc.MemoryTy); B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO); + LoadMI->eraseFromParent(); }; return true; } @@ -805,21 +816,24 @@ bool CombinerHelper::matchSextInRegOfLoad( MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + Register DstReg = MI.getOperand(0).getReg(); + LLT RegTy = MRI.getType(DstReg); + // Only supports scalars for now. - if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + if (RegTy.isVector()) return false; Register SrcReg = MI.getOperand(1).getReg(); auto *LoadDef = getOpcodeDef<GLoad>(SrcReg, MRI); - if (!LoadDef || !MRI.hasOneNonDBGUse(LoadDef->getOperand(0).getReg()) || - !LoadDef->isSimple()) + if (!LoadDef || !MRI.hasOneNonDBGUse(DstReg)) return false; + uint64_t MemBits = LoadDef->getMemSizeInBits(); + // If the sign extend extends from a narrower width than the load's width, // then we can narrow the load width when we combine to a G_SEXTLOAD. // Avoid widening the load at all. - unsigned NewSizeBits = std::min((uint64_t)MI.getOperand(2).getImm(), - LoadDef->getMemSizeInBits()); + unsigned NewSizeBits = std::min((uint64_t)MI.getOperand(2).getImm(), MemBits); // Don't generate G_SEXTLOADs with a < 1 byte width. 
if (NewSizeBits < 8) @@ -831,7 +845,15 @@ bool CombinerHelper::matchSextInRegOfLoad( const MachineMemOperand &MMO = LoadDef->getMMO(); LegalityQuery::MemDesc MMDesc(MMO); - MMDesc.MemoryTy = LLT::scalar(NewSizeBits); + + // Don't modify the memory access size if this is atomic/volatile, but we can + // still adjust the opcode to indicate the high bit behavior. + if (LoadDef->isSimple()) + MMDesc.MemoryTy = LLT::scalar(NewSizeBits); + else if (MemBits > NewSizeBits || MemBits == RegTy.getSizeInBits()) + return false; + + // TODO: Could check if it's legal with the reduced or original memory size. if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SEXTLOAD, {MRI.getType(LoadDef->getDstReg()), MRI.getType(LoadDef->getPointerReg())}, diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index a2af66d28f4a..947facc87b71 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2076,9 +2076,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { + const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); - Register GuardVal = MRI->createGenericVirtualRegister(PtrTy); - getStackGuard(GuardVal, MIRBuilder); + Register GuardVal; + if (TLI.useLoadStackGuardNode()) { + GuardVal = MRI->createGenericVirtualRegister(PtrTy); + getStackGuard(GuardVal, MIRBuilder); + } else + GuardVal = getOrCreateVReg(*CI.getArgOperand(0)); // The guard's value. 
AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1)); int FI = getOrCreateFrameIndex(*Slot); @@ -2883,6 +2888,12 @@ bool IRTranslator::translateAtomicRMW(const User &U, case AtomicRMWInst::FSub: Opcode = TargetOpcode::G_ATOMICRMW_FSUB; break; + case AtomicRMWInst::FMax: + Opcode = TargetOpcode::G_ATOMICRMW_FMAX; + break; + case AtomicRMWInst::FMin: + Opcode = TargetOpcode::G_ATOMICRMW_FMIN; + break; } MIRBuilder.buildAtomicRMW( diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 19ebf46191a9..0d9580e25606 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -473,6 +473,23 @@ MachineInstrBuilder MachineIRBuilder::buildBoolExt(const DstOp &Res, return buildInstr(ExtOp, Res, Op); } +MachineInstrBuilder MachineIRBuilder::buildBoolExtInReg(const DstOp &Res, + const SrcOp &Op, + bool IsVector, + bool IsFP) { + const auto *TLI = getMF().getSubtarget().getTargetLowering(); + switch (TLI->getBooleanContents(IsVector, IsFP)) { + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + return buildSExtInReg(Res, Op, 1); + case TargetLoweringBase::ZeroOrOneBooleanContent: + return buildZExtInReg(Res, Op, 1); + case TargetLoweringBase::UndefinedBooleanContent: + return buildCopy(Res, Op); + } + + llvm_unreachable("unexpected BooleanContent"); +} + MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc, const DstOp &Res, const SrcOp &Op) { @@ -938,6 +955,20 @@ MachineIRBuilder::buildAtomicRMWFSub(const DstOp &OldValRes, const SrcOp &Addr, } MachineInstrBuilder +MachineIRBuilder::buildAtomicRMWFMax(const DstOp &OldValRes, const SrcOp &Addr, + const SrcOp &Val, MachineMemOperand &MMO) { + return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FMAX, OldValRes, Addr, Val, + MMO); +} + +MachineInstrBuilder +MachineIRBuilder::buildAtomicRMWFMin(const DstOp &OldValRes, const SrcOp &Addr, + const SrcOp &Val, MachineMemOperand &MMO) { + 
return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FMIN, OldValRes, Addr, Val, + MMO); +} + +MachineInstrBuilder MachineIRBuilder::buildFence(unsigned Ordering, unsigned Scope) { return buildInstr(TargetOpcode::G_FENCE) .addImm(Ordering) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index b3f38a3b53f3..55f3ad796291 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -310,10 +310,11 @@ bool InterleavedAccess::lowerInterleavedLoad( Extracts.push_back(Extract); continue; } - auto *BI = dyn_cast<BinaryOperator>(User); - if (BI && BI->hasOneUse()) { - if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) { - BinOpShuffles.insert(SVI); + if (auto *BI = dyn_cast<BinaryOperator>(User)) { + if (all_of(BI->users(), + [](auto *U) { return isa<ShuffleVectorInst>(U); })) { + for (auto *SVI : BI->users()) + BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI)); continue; } } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 30ca8bd871e8..43c12c67939e 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -536,6 +536,17 @@ public: // What was the old variable value? ValueIDNum OldValue = VarLocs[MLoc.asU64()]; + clobberMloc(MLoc, OldValue, Pos, MakeUndef); + } + /// Overload that takes an explicit value \p OldValue for when the value in + /// \p MLoc has changed and the TransferTracker's locations have not been + /// updated yet. 
+ void clobberMloc(LocIdx MLoc, ValueIDNum OldValue, + MachineBasicBlock::iterator Pos, bool MakeUndef = true) { + auto ActiveMLocIt = ActiveMLocs.find(MLoc); + if (ActiveMLocIt == ActiveMLocs.end()) + return; + VarLocs[MLoc.asU64()] = ValueIDNum::EmptyValue; // Examine the remaining variable locations: if we can find the same value @@ -1730,9 +1741,35 @@ bool InstrRefBasedLDV::transferRegisterCopy(MachineInstr &MI) { if (EmulateOldLDV && !SrcRegOp->isKill()) return false; + // Before we update MTracker, remember which values were present in each of + // the locations about to be overwritten, so that we can recover any + // potentially clobbered variables. + DenseMap<LocIdx, ValueIDNum> ClobberedLocs; + if (TTracker) { + for (MCRegAliasIterator RAI(DestReg, TRI, true); RAI.isValid(); ++RAI) { + LocIdx ClobberedLoc = MTracker->getRegMLoc(*RAI); + auto MLocIt = TTracker->ActiveMLocs.find(ClobberedLoc); + // If ActiveMLocs isn't tracking this location or there are no variables + // using it, don't bother remembering. + if (MLocIt == TTracker->ActiveMLocs.end() || MLocIt->second.empty()) + continue; + ValueIDNum Value = MTracker->readReg(*RAI); + ClobberedLocs[ClobberedLoc] = Value; + } + } + // Copy MTracker info, including subregs if available. InstrRefBasedLDV::performCopy(SrcReg, DestReg); + // The copy might have clobbered variables based on the destination register. + // Tell TTracker about it, passing the old ValueIDNum to search for + // alternative locations (or else terminating those variables). + if (TTracker) { + for (auto LocVal : ClobberedLocs) { + TTracker->clobberMloc(LocVal.first, LocVal.second, MI.getIterator(), false); + } + } + // Only produce a transfer of DBG_VALUE within a block where old LDV // would have. We might make use of the additional value tracking in some // other way, later. 
@@ -1744,15 +1781,6 @@ bool InstrRefBasedLDV::transferRegisterCopy(MachineInstr &MI) { if (EmulateOldLDV && SrcReg != DestReg) MTracker->defReg(SrcReg, CurBB, CurInst); - // Finally, the copy might have clobbered variables based on the destination - // register. Tell TTracker about it, in case a backup location exists. - if (TTracker) { - for (MCRegAliasIterator RAI(DestReg, TRI, true); RAI.isValid(); ++RAI) { - LocIdx ClobberedLoc = MTracker->getRegMLoc(*RAI); - TTracker->clobberMloc(ClobberedLoc, MI.getIterator(), false); - } - } - return true; } diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 7d825a8bf853..1242ce20b732 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -1049,12 +1049,17 @@ public: // we may end up with a main range not covering all subranges. // This is extremely rare case, so let's check and reconstruct the // main range. - for (LiveInterval::SubRange &S : LI.subranges()) { - if (LI.covers(S)) - continue; - LI.clear(); - LIS.constructMainRangeFromSubranges(LI); - break; + if (LI.hasSubRanges()) { + unsigned SubReg = MO.getSubReg(); + LaneBitmask LaneMask = SubReg ? 
TRI.getSubRegIndexLaneMask(SubReg) + : MRI.getMaxLaneMaskForVReg(Reg); + for (LiveInterval::SubRange &S : LI.subranges()) { + if ((S.LaneMask & LaneMask).none() || LI.covers(S)) + continue; + LI.clear(); + LIS.constructMainRangeFromSubranges(LI); + break; + } } continue; diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 40ae7053ea09..0c94e1f7e474 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -742,7 +742,7 @@ bool MIParser::parseBasicBlockDefinition( MBB->setIsInlineAsmBrIndirectTarget(IsInlineAsmBrIndirectTarget); MBB->setIsEHFuncletEntry(IsEHFuncletEntry); if (SectionID) { - MBB->setSectionID(SectionID.getValue()); + MBB->setSectionID(SectionID.value()); MF.setBBSectionsType(BasicBlockSection::List); } return false; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index c186d0ba9969..02c44fa85cd9 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -451,7 +451,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (IrrLoopHeaderWeight && IsStandalone) { if (Indexes) OS << '\t'; OS.indent(2) << "; Irreducible loop header weight: " - << IrrLoopHeaderWeight.getValue() << '\n'; + << IrrLoopHeaderWeight.value() << '\n'; } } diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 81c97ba6a086..867a7ed584b2 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -106,8 +106,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { // We don't want to proceed further for cold functions // or functions of unknown hotness. Lukewarm functions have no prefix. 
Optional<StringRef> SectionPrefix = MF.getFunction().getSectionPrefix(); - if (SectionPrefix && (SectionPrefix.getValue().equals("unlikely") || - SectionPrefix.getValue().equals("unknown"))) { + if (SectionPrefix && (SectionPrefix.value().equals("unlikely") || + SectionPrefix.value().equals("unknown"))) { return false; } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 4e00a211713e..5f80445a5a34 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -93,8 +93,11 @@ cl::opt<bool> VerifyScheduling( cl::opt<bool> ViewMISchedDAGs( "view-misched-dags", cl::Hidden, cl::desc("Pop up a window to show MISched dags after they are processed")); +cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, + cl::desc("Print schedule DAGs")); #else const bool ViewMISchedDAGs = false; +const bool PrintDAGs = false; #endif // NDEBUG } // end namespace llvm @@ -112,10 +115,6 @@ static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden, cl::desc("Only schedule this function")); static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden, cl::desc("Only schedule this MBB#")); -static cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, - cl::desc("Print schedule DAGs")); -#else -static const bool PrintDAGs = false; #endif // NDEBUG /// Avoid quadratic complexity in unusually large basic blocks by limiting the diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index db04f2bcc095..7a008bae726e 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -293,6 +293,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable<LiveStacks>(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 3245d9649be1..581168b31384 100644 --- 
a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -1448,7 +1448,7 @@ Register KernelRewriter::phi(Register LoopReg, Optional<Register> InitReg, const TargetRegisterClass *RC) { // If the init register is not undef, try and find an existing phi. if (InitReg) { - auto I = Phis.find({LoopReg, InitReg.getValue()}); + auto I = Phis.find({LoopReg, InitReg.value()}); if (I != Phis.end()) return I->second; } else { @@ -1469,10 +1469,10 @@ Register KernelRewriter::phi(Register LoopReg, Optional<Register> InitReg, return R; // Found a phi taking undef as input, so rewrite it to take InitReg. MachineInstr *MI = MRI.getVRegDef(R); - MI->getOperand(1).setReg(InitReg.getValue()); - Phis.insert({{LoopReg, InitReg.getValue()}, R}); + MI->getOperand(1).setReg(InitReg.value()); + Phis.insert({{LoopReg, InitReg.value()}, R}); const TargetRegisterClass *ConstrainRegClass = - MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue())); + MRI.constrainRegClass(R, MRI.getRegClass(InitReg.value())); assert(ConstrainRegClass && "Expected a valid constrained register class!"); (void)ConstrainRegClass; UndefPhis.erase(I); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 1a0f296d5fdc..89a43c4f57f6 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -554,7 +554,7 @@ static void updateLiveness(MachineFunction &MF) { } } -/// Insert restore code for the callee-saved registers used in the function. +/// Insert spill code for the callee-saved registers used in the function. 
static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef<CalleeSavedInfo> CSI) { MachineFunction &MF = *SaveBlock.getParent(); diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index d57b0ca6d53d..d6a3997e4b70 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -148,9 +148,6 @@ protected: /// Run or not the local reassignment heuristic. This information is /// obtained from the TargetSubtargetInfo. const bool EnableLocalReassign; - -private: - unsigned NextCascade = 1; }; /// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index c199b6a6cca8..d627519a34aa 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -870,8 +870,8 @@ bool SelectOptimize::computeLoopCosts( ORE->emit(ORmissL); return false; } - IPredCost += Scaled64::get(ILatency.getValue()); - INonPredCost += Scaled64::get(ILatency.getValue()); + IPredCost += Scaled64::get(ILatency.value()); + INonPredCost += Scaled64::get(ILatency.value()); // For a select that can be converted to branch, // compute its cost as a branch (non-predicated cost). diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index aa688d9dda3c..2654c00929d8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2392,12 +2392,14 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { // add (srl (not X), 31), C --> add (sra X, 31), (C + 1) // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1) SDLoc DL(N); - auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL; - SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt); - if (SDValue NewC = - DAG.FoldConstantArithmetic(IsAdd ? 
ISD::ADD : ISD::SUB, DL, VT, - {ConstantOp, DAG.getConstant(1, DL, VT)})) + if (SDValue NewC = DAG.FoldConstantArithmetic( + IsAdd ? ISD::ADD : ISD::SUB, DL, VT, + {ConstantOp, DAG.getConstant(1, DL, VT)})) { + SDValue NewShift = DAG.getNode(IsAdd ? ISD::SRA : ISD::SRL, DL, VT, + Not.getOperand(0), ShAmt); return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC); + } + return SDValue(); } @@ -3760,6 +3762,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } } + // If there's no chance of borrowing from adjacent bits, then sub is xor: + // sub C0, X --> xor X, C0 + if (ConstantSDNode *C0 = isConstOrConstSplat(N0)) { + if (!C0->isOpaque()) { + const APInt &C0Val = C0->getAPIntValue(); + const APInt &MaybeOnes = ~DAG.computeKnownBits(N1).Zero; + if ((C0Val - MaybeOnes) == (C0Val ^ MaybeOnes)) + return DAG.getNode(ISD::XOR, DL, VT, N1, N0); + } + } + return SDValue(); } @@ -4550,13 +4563,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) { SDLoc DL(N); // fold (rem c1, c2) -> c1%c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) return C; // fold (urem X, -1) -> select(FX == -1, 0, FX) // Freeze the numerator to avoid a miscompile with an undefined value. - if (!isSigned && N1C && N1C->isAllOnes()) { + if (!isSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false)) { SDValue F0 = DAG.getFreeze(N0); SDValue EqualsNeg1 = DAG.getSetCC(DL, CCVT, F0, N1, ISD::SETEQ); return DAG.getSelect(DL, VT, EqualsNeg1, DAG.getConstant(0, DL, VT), F0); @@ -4581,9 +4593,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) { AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } - if (N1.getOpcode() == ISD::SHL && + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1)) + // TODO: We should sink the following into isKnownToBePowerOfTwo + // using a OrZero parameter analogous to our handling in ValueTracking. 
+ if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) SDValue NegOne = DAG.getAllOnesConstant(DL, VT); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); @@ -9288,31 +9303,44 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper. // sra (add (shl X, N1C), AddC), N1C --> // sext (add (trunc X to (width - N1C)), AddC') - if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C && - N0.getOperand(0).getOpcode() == ISD::SHL && - N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) { - if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) { - SDValue Shl = N0.getOperand(0); - // Determine what the truncate's type would be and ask the target if that - // is a free operation. - LLVMContext &Ctx = *DAG.getContext(); - unsigned ShiftAmt = N1C->getZExtValue(); - EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt); - if (VT.isVector()) - TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount()); + // sra (sub AddC, (shl X, N1C)), N1C --> + // sext (sub AddC1',(trunc X to (width - N1C))) + if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB) && N1C && + N0.hasOneUse()) { + bool IsAdd = N0.getOpcode() == ISD::ADD; + SDValue Shl = N0.getOperand(IsAdd ? 0 : 1); + if (Shl.getOpcode() == ISD::SHL && Shl.getOperand(1) == N1 && + Shl.hasOneUse()) { + // TODO: AddC does not need to be a splat. + if (ConstantSDNode *AddC = + isConstOrConstSplat(N0.getOperand(IsAdd ? 1 : 0))) { + // Determine what the truncate's type would be and ask the target if + // that is a free operation. 
+ LLVMContext &Ctx = *DAG.getContext(); + unsigned ShiftAmt = N1C->getZExtValue(); + EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt); + if (VT.isVector()) + TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount()); - // TODO: The simple type check probably belongs in the default hook - // implementation and/or target-specific overrides (because - // non-simple types likely require masking when legalized), but that - // restriction may conflict with other transforms. - if (TruncVT.isSimple() && isTypeLegal(TruncVT) && - TLI.isTruncateFree(VT, TruncVT)) { - SDLoc DL(N); - SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT); - SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt). - trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT); - SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC); - return DAG.getSExtOrTrunc(Add, DL, VT); + // TODO: The simple type check probably belongs in the default hook + // implementation and/or target-specific overrides (because + // non-simple types likely require masking when legalized), but + // that restriction may conflict with other transforms. 
+ if (TruncVT.isSimple() && isTypeLegal(TruncVT) && + TLI.isTruncateFree(VT, TruncVT)) { + SDLoc DL(N); + SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT); + SDValue ShiftC = + DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).trunc( + TruncVT.getScalarSizeInBits()), + DL, TruncVT); + SDValue Add; + if (IsAdd) + Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC); + else + Add = DAG.getNode(ISD::SUB, DL, TruncVT, ShiftC, Trunc); + return DAG.getSExtOrTrunc(Add, DL, VT); + } } } } @@ -11025,6 +11053,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG)) return V; + if (SimplifyDemandedVectorElts(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -13243,18 +13274,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } - // See if we can simplify the input to this truncate through knowledge that - // only the low bits are being used. - // For example "trunc (or (shl x, 8), y)" // -> trunc y - // Currently we only perform this optimization on scalars because vectors - // may have different active low bits. - if (!VT.isVector()) { - APInt Mask = - APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); - if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); - } - // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { @@ -13341,6 +13360,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + // See if we can simplify the input to this truncate through knowledge that + // only the low bits are being used. + // For example "trunc (or (shl x, 8), y)" // -> trunc y + // Currently we only perform this optimization on scalars because vectors + // may have different active low bits. 
+ if (!VT.isVector()) { + APInt Mask = + APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); + if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) + return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); + } + // fold (truncate (extract_subvector(ext x))) -> // (extract_subvector x) // TODO: This can be generalized to cover cases where the truncate and extract @@ -24514,8 +24545,9 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { auto &Size0 = MUC0.NumBytes; auto &Size1 = MUC1.NumBytes; if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && - Size0 && Size1 && *Size0 == *Size1 && OrigAlignment0 > *Size0 && - SrcValOffset0 % *Size0 == 0 && SrcValOffset1 % *Size1 == 0) { + Size0.has_value() && Size1.has_value() && *Size0 == *Size1 && + OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 && + SrcValOffset1 % *Size1 == 0) { int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value(); int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index f464208cd9dc..6c136bdfc652 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2915,6 +2915,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break; case ISD::STORE: Res = SoftPromoteHalfOp_STORE(N, OpNo); break; + case ISD::STACKMAP: + Res = SoftPromoteHalfOp_STACKMAP(N, OpNo); + break; } if (!Res.getNode()) @@ -3042,3 +3045,17 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) { return DAG.getStore(ST->getChain(), dl, Promoted, ST->getBasePtr(), ST->getMemOperand()); } + +SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); // Because the first two 
arguments are guaranteed legal. + SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end()); + SDValue Op = N->getOperand(OpNo); + NewOps[OpNo] = GetSoftPromotedHalf(Op); + SDValue NewNode = + DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps); + + for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) + ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); + + return SDValue(); // Signal that we replaced the node ourselves. +} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 69fd83bcd7b3..343722a97c3c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -19,6 +19,7 @@ #include "LegalizeTypes.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -1723,6 +1724,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { break; case ISD::SET_ROUNDING: Res = PromoteIntOp_SET_ROUNDING(N); break; + case ISD::STACKMAP: + Res = PromoteIntOp_STACKMAP(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
@@ -2255,16 +2259,40 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) { SDLoc dl(N); SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0)); - EVT EltVT = Op.getValueType().getVectorElementType(); - EVT VT = N->getValueType(0); + EVT OrigEltVT = N->getOperand(0).getValueType().getVectorElementType(); + EVT InVT = Op.getValueType(); + EVT EltVT = InVT.getVectorElementType(); + EVT ResVT = N->getValueType(0); + unsigned Opcode = N->getOpcode(); - if (VT.bitsGE(EltVT)) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op); + // An i1 vecreduce_xor is equivalent to vecreduce_add, use that instead if + // vecreduce_xor is not legal + if (Opcode == ISD::VECREDUCE_XOR && OrigEltVT == MVT::i1 && + !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_XOR, InVT) && + TLI.isOperationLegalOrCustom(ISD::VECREDUCE_ADD, InVT)) + Opcode = ISD::VECREDUCE_ADD; + + // An i1 vecreduce_or is equivalent to vecreduce_umax, use that instead if + // vecreduce_or is not legal + else if (Opcode == ISD::VECREDUCE_OR && OrigEltVT == MVT::i1 && + !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_OR, InVT) && + TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMAX, InVT)) + Opcode = ISD::VECREDUCE_UMAX; + + // An i1 vecreduce_and is equivalent to vecreduce_umin, use that instead if + // vecreduce_and is not legal + else if (Opcode == ISD::VECREDUCE_AND && OrigEltVT == MVT::i1 && + !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_AND, InVT) && + TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMIN, InVT)) + Opcode = ISD::VECREDUCE_UMIN; + + if (ResVT.bitsGE(EltVT)) + return DAG.getNode(Opcode, SDLoc(N), ResVT, Op); // Result size must be >= element size. If this is not the case after // promotion, also promote the result type and then truncate. 
- SDValue Reduce = DAG.getNode(N->getOpcode(), dl, EltVT, Op); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce); + SDValue Reduce = DAG.getNode(Opcode, dl, EltVT, Op); + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Reduce); } SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) { @@ -2304,6 +2332,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); // Because the first two arguments are guaranteed legal. + SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end()); + SDValue Operand = N->getOperand(OpNo); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType()); + NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -4653,6 +4690,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break; case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break; + case ISD::STACKMAP: + Res = ExpandIntOp_STACKMAP(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -5481,3 +5521,44 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } + +SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) { + assert(OpNo > 1); + + SDValue Op = N->getOperand(OpNo); + SDLoc DL = SDLoc(N); + SmallVector<SDValue> NewOps; + + // Copy operands before the one being expanded. 
+ for (unsigned I = 0; I < OpNo; I++) + NewOps.push_back(N->getOperand(I)); + + if (Op->getOpcode() == ISD::Constant) { + ConstantSDNode *CN = cast<ConstantSDNode>(Op); + EVT Ty = Op.getValueType(); + if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) { + NewOps.push_back( + DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64)); + NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty)); + } else { + // FIXME: https://github.com/llvm/llvm-project/issues/55609 + return SDValue(); + } + } else { + // FIXME: Non-constant operands are not yet handled: + // - https://github.com/llvm/llvm-project/issues/26431 + // - https://github.com/llvm/llvm-project/issues/55957 + return SDValue(); + } + + // Copy remaining operands. + for (unsigned I = OpNo + 1; I < N->getNumOperands(); I++) + NewOps.push_back(N->getOperand(I)); + + SDValue NewNode = DAG.getNode(N->getOpcode(), DL, N->getVTList(), NewOps); + + for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++) + ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum)); + + return SDValue(); // Signal that we have replaced the node already. 
+} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index de320290bda9..2807b7f5ae68 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -402,6 +402,7 @@ private: SDValue PromoteIntOp_VECREDUCE(SDNode *N); SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); + SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -493,6 +494,7 @@ private: SDValue ExpandIntOp_RETURNADDR(SDNode *N); SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N); SDValue ExpandIntOp_SPLAT_VECTOR(SDNode *N); + SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo); void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl); @@ -741,6 +743,7 @@ private: SDValue SoftPromoteHalfOp_SETCC(SDNode *N); SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo); + SDValue SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo); //===--------------------------------------------------------------------===// // Scalarization Support: LegalizeVectorTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index fa555be00ded..143abc08eeea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5627,7 +5627,6 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); - unsigned NumElts = VT.getVectorNumElements(); SDValue InOp = N->getOperand(N->isStrictFPOpcode() ? 
1 : 0); assert(getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector && @@ -5639,7 +5638,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // See if a widened result type would be legal, if so widen the node. // FIXME: This isn't safe for StrictFP. Other optimization here is needed. EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, - InVT.getVectorNumElements()); + InVT.getVectorElementCount()); if (TLI.isTypeLegal(WideVT) && !N->isStrictFPOpcode()) { SDValue Res; if (N->isStrictFPOpcode()) { @@ -5665,6 +5664,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { EVT InEltVT = InVT.getVectorElementType(); // Unroll the convert into some scalar code and create a nasty build vector. + unsigned NumElts = VT.getVectorNumElements(); SmallVector<SDValue, 16> Ops(NumElts); if (N->isStrictFPOpcode()) { SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end()); @@ -6055,7 +6055,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { // The result type is legal, if its vXi1, keep vXi1 for the new SETCC. if (VT.getScalarType() == MVT::i1) SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - SVT.getVectorNumElements()); + SVT.getVectorElementCount()); SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N), SVT, InOp0, InOp1, N->getOperand(2)); @@ -6063,7 +6063,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { // Extract the needed results from the result vector. 
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), SVT.getVectorElementType(), - VT.getVectorNumElements()); + VT.getVectorElementCount()); SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getVectorIdxConstant(0, dl)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b3b8756ae9ba..c8d0f5faf647 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -60,7 +60,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" @@ -3271,6 +3270,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.Zero.setBitsFrom(1); break; case ISD::SETCC: + case ISD::SETCCCARRY: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: { unsigned OpNo = Op->isStrictFPOpcode() ? 1 : 0; @@ -3506,6 +3506,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; case ISD::USUBO: case ISD::SSUBO: + case ISD::SUBCARRY: + case ISD::SSUBO_CARRY: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == @@ -3520,6 +3522,10 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, assert(Op.getResNo() == 0 && "We only compute knownbits for the difference here."); + // TODO: Compute influence of the carry operand. 
+ if (Opcode == ISD::SUBCARRY || Opcode == ISD::SSUBO_CARRY) + break; + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known = KnownBits::computeForAddSub(/* Add */ false, /* NSW */ false, @@ -3529,6 +3535,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, case ISD::UADDO: case ISD::SADDO: case ISD::ADDCARRY: + case ISD::SADDO_CARRY: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) == @@ -3548,7 +3555,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, if (Opcode == ISD::ADDE) // Can't track carry from glue, set carry to unknown. Carry.resetAll(); - else if (Opcode == ISD::ADDCARRY) + else if (Opcode == ISD::ADDCARRY || Opcode == ISD::SADDO_CARRY) // TODO: Compute known bits for the carry operand. Not sure if it is worth // the trouble (how often will we find a known carry bit). And I haven't // tested this very much yet, but something like this might work: @@ -3862,6 +3869,12 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { if (C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2()) return true; + // vscale(power-of-two) is a power-of-two for some targets + if (Val.getOpcode() == ISD::VSCALE && + getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() && + isKnownToBeAPowerOfTwo(Val.getOperand(0))) + return true; + // More could be done here, though the above checks are enough // to handle some common cases. 
@@ -4108,8 +4121,12 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return std::min(Tmp, Tmp2); case ISD::SADDO: case ISD::UADDO: + case ISD::SADDO_CARRY: + case ISD::ADDCARRY: case ISD::SSUBO: case ISD::USUBO: + case ISD::SSUBO_CARRY: + case ISD::SUBCARRY: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) @@ -4123,6 +4140,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return VTBits; break; case ISD::SETCC: + case ISD::SETCCCARRY: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: { unsigned OpNo = Op->isStrictFPOpcode() ? 1 : 0; @@ -7505,6 +7523,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, Opcode == ISD::ATOMIC_LOAD_UMAX || Opcode == ISD::ATOMIC_LOAD_FADD || Opcode == ISD::ATOMIC_LOAD_FSUB || + Opcode == ISD::ATOMIC_LOAD_FMAX || + Opcode == ISD::ATOMIC_LOAD_FMIN || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_STORE) && "Invalid Atomic Op"); @@ -10739,19 +10759,19 @@ namespace { } // end anonymous namespace -static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs; -static ManagedStatic<EVTArray> SimpleVTArray; -static ManagedStatic<sys::SmartMutex<true>> VTMutex; - /// getValueTypeList - Return a pointer to the specified value type. 
/// const EVT *SDNode::getValueTypeList(EVT VT) { + static std::set<EVT, EVT::compareRawBits> EVTs; + static EVTArray SimpleVTArray; + static sys::SmartMutex<true> VTMutex; + if (VT.isExtended()) { - sys::SmartScopedLock<true> Lock(*VTMutex); - return &(*EVTs->insert(VT).first); + sys::SmartScopedLock<true> Lock(VTMutex); + return &(*EVTs.insert(VT).first); } assert(VT.getSimpleVT() < MVT::VALUETYPE_SIZE && "Value type out of range!"); - return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy]; + return &SimpleVTArray.VTs[VT.getSimpleVT().SimpleTy]; } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 37d05cdba76d..fe3c38ec590d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -703,7 +703,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, unsigned NumRegs; if (IsABIRegCopy) { NumRegs = TLI.getVectorTypeBreakdownForCallingConv( - *DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT, + *DAG.getContext(), CallConv.value(), ValueVT, IntermediateVT, NumIntermediates, RegisterVT); } else { NumRegs = @@ -800,11 +800,11 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, for (EVT ValueVT : ValueVTs) { unsigned NumRegs = isABIMangled() - ? TLI.getNumRegistersForCallingConv(Context, CC.getValue(), ValueVT) + ? TLI.getNumRegistersForCallingConv(Context, CC.value(), ValueVT) : TLI.getNumRegisters(Context, ValueVT); MVT RegisterVT = isABIMangled() - ? TLI.getRegisterTypeForCallingConv(Context, CC.getValue(), ValueVT) + ? 
TLI.getRegisterTypeForCallingConv(Context, CC.value(), ValueVT) : TLI.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) Regs.push_back(Reg + i); @@ -831,10 +831,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, // Copy the legal parts from the registers. EVT ValueVT = ValueVTs[Value]; unsigned NumRegs = RegCount[Value]; - MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv( - *DAG.getContext(), - CallConv.getValue(), RegVTs[Value]) - : RegVTs[Value]; + MVT RegisterVT = + isABIMangled() ? TLI.getRegisterTypeForCallingConv( + *DAG.getContext(), CallConv.value(), RegVTs[Value]) + : RegVTs[Value]; Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { @@ -914,10 +914,10 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { unsigned NumParts = RegCount[Value]; - MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv( - *DAG.getContext(), - CallConv.getValue(), RegVTs[Value]) - : RegVTs[Value]; + MVT RegisterVT = + isABIMangled() ? 
TLI.getRegisterTypeForCallingConv( + *DAG.getContext(), CallConv.value(), RegVTs[Value]) + : RegVTs[Value]; if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) ExtendKind = ISD::ZERO_EXTEND; @@ -1309,7 +1309,7 @@ void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) { if (handleDebugValue(V, Var, Expr, DL, InstDL, SDOrder, /*IsVariadic=*/false)) { LLVM_DEBUG(dbgs() << "Salvaged debug location info for:\n " - << DDI.getDI() << "\nBy stripping back to:\n " << V); + << *DDI.getDI() << "\nBy stripping back to:\n " << *V); return; } } @@ -1321,7 +1321,7 @@ void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) { auto SDV = DAG.getConstantDbgValue(Var, Expr, Undef, DL, SDNodeOrder); DAG.AddDbgValue(SDV, false); - LLVM_DEBUG(dbgs() << "Dropping debug value info for:\n " << DDI.getDI() + LLVM_DEBUG(dbgs() << "Dropping debug value info for:\n " << *DDI.getDI() << "\n"); LLVM_DEBUG(dbgs() << " Last seen at:\n " << *DDI.getDI()->getOperand(0) << "\n"); @@ -3747,13 +3747,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { setValue(&I, DAG.getBuildVector(VT, DL, Ops)); } -void SelectionDAGBuilder::visitInsertValue(const User &I) { - ArrayRef<unsigned> Indices; - if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I)) - Indices = IV->getIndices(); - else - Indices = cast<ConstantExpr>(&I)->getIndices(); - +void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { + ArrayRef<unsigned> Indices = I.getIndices(); const Value *Op0 = I.getOperand(0); const Value *Op1 = I.getOperand(1); Type *AggTy = I.getType(); @@ -4616,6 +4611,8 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break; case AtomicRMWInst::FAdd: NT = ISD::ATOMIC_LOAD_FADD; break; case AtomicRMWInst::FSub: NT = ISD::ATOMIC_LOAD_FSUB; break; + case AtomicRMWInst::FMax: NT = ISD::ATOMIC_LOAD_FMAX; break; + case AtomicRMWInst::FMin: NT = 
ISD::ATOMIC_LOAD_FMIN; break; } AtomicOrdering Ordering = I.getOrdering(); SyncScope::ID SSID = I.getSyncScopeID(); @@ -8410,52 +8407,6 @@ public: return false; } - - /// getCallOperandValEVT - Return the EVT of the Value* that this operand - /// corresponds to. If there is no Value* for this operand, it returns - /// MVT::Other. - EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI, - const DataLayout &DL, - llvm::Type *ParamElemType) const { - if (!CallOperandVal) return MVT::Other; - - if (isa<BasicBlock>(CallOperandVal)) - return TLI.getProgramPointerTy(DL); - - llvm::Type *OpTy = CallOperandVal->getType(); - - // FIXME: code duplicated from TargetLowering::ParseConstraints(). - // If this is an indirect operand, the operand is a pointer to the - // accessed type. - if (isIndirect) { - OpTy = ParamElemType; - assert(OpTy && "Indirect operand must have elementtype attribute"); - } - - // Look for vector wrapped in a struct. e.g. { <16 x i8> }. - if (StructType *STy = dyn_cast<StructType>(OpTy)) - if (STy->getNumElements() == 1) - OpTy = STy->getElementType(0); - - // If OpTy is not a single value, it may be a struct/union that we - // can tile with integers. - if (!OpTy->isSingleValueType() && OpTy->isSized()) { - unsigned BitSize = DL.getTypeSizeInBits(OpTy); - switch (BitSize) { - default: break; - case 1: - case 8: - case 16: - case 32: - case 64: - case 128: - OpTy = IntegerType::get(Context, BitSize); - break; - } - } - - return TLI.getAsmOperandValueType(DL, OpTy, true); - } }; @@ -8722,37 +8673,12 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, bool HasSideEffect = IA->hasSideEffects(); ExtraFlags ExtraInfo(Call); - unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. - unsigned ResNo = 0; // ResNo - The result number of the next output. 
for (auto &T : TargetConstraints) { ConstraintOperands.push_back(SDISelAsmOperandInfo(T)); SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back(); - // Compute the value type for each operand. - if (OpInfo.hasArg()) { - OpInfo.CallOperandVal = Call.getArgOperand(ArgNo); + if (OpInfo.CallOperandVal) OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); - Type *ParamElemTy = Call.getParamElementType(ArgNo); - EVT VT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, - DAG.getDataLayout(), ParamElemTy); - OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other; - ArgNo++; - } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) { - // The return value of the call is this value. As such, there is no - // corresponding argument. - assert(!Call.getType()->isVoidTy() && "Bad inline asm!"); - if (StructType *STy = dyn_cast<StructType>(Call.getType())) { - OpInfo.ConstraintVT = TLI.getSimpleValueType( - DAG.getDataLayout(), STy->getElementType(ResNo)); - } else { - assert(ResNo == 0 && "Asm only has one result!"); - OpInfo.ConstraintVT = TLI.getAsmOperandValueType( - DAG.getDataLayout(), Call.getType()).getSimpleVT(); - } - ++ResNo; - } else { - OpInfo.ConstraintVT = MVT::Other; - } if (!HasSideEffect) HasSideEffect = OpInfo.hasMemory(TLI); @@ -8865,7 +8791,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, if (RegError) { const MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const char *RegName = TRI.getName(RegError.getValue()); + const char *RegName = TRI.getName(RegError.value()); emitInlineAsmError(Call, "register '" + Twine(RegName) + "' allocated for constraint '" + Twine(OpInfo.ConstraintCode) + @@ -9385,9 +9311,9 @@ static void addStackMapLiveVars(const CallBase &Call, unsigned StartIdx, } } -/// Lower llvm.experimental.stackmap directly to its target opcode. +/// Lower llvm.experimental.stackmap. 
void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { - // void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>, + // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>, // [live variables...]) assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value."); @@ -9412,29 +9338,45 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); InFlag = Chain.getValue(1); - // Add the <id> and <numBytes> constants. - SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos)); - Ops.push_back(DAG.getTargetConstant( - cast<ConstantSDNode>(IDVal)->getZExtValue(), DL, MVT::i64)); - SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos)); - Ops.push_back(DAG.getTargetConstant( - cast<ConstantSDNode>(NBytesVal)->getZExtValue(), DL, - MVT::i32)); + // Add the STACKMAP operands, starting with DAG house-keeping. + Ops.push_back(Chain); + Ops.push_back(InFlag); - // Push live variables for the stack map. - addStackMapLiveVars(CI, 2, DL, Ops, *this); + // Add the <id>, <numShadowBytes> operands. + // + // These do not require legalisation, and can be emitted directly to target + // constant nodes. + SDValue ID = getValue(CI.getArgOperand(0)); + assert(ID.getValueType() == MVT::i64); + SDValue IDConst = DAG.getTargetConstant( + cast<ConstantSDNode>(ID)->getZExtValue(), DL, ID.getValueType()); + Ops.push_back(IDConst); - // We are not pushing any register mask info here on the operands list, - // because the stackmap doesn't clobber anything. + SDValue Shad = getValue(CI.getArgOperand(1)); + assert(Shad.getValueType() == MVT::i32); + SDValue ShadConst = DAG.getTargetConstant( + cast<ConstantSDNode>(Shad)->getZExtValue(), DL, Shad.getValueType()); + Ops.push_back(ShadConst); - // Push the chain and the glue flag. - Ops.push_back(Chain); - Ops.push_back(InFlag); + // Add the live variables. 
+ for (unsigned I = 2; I < CI.arg_size(); I++) { + SDValue Op = getValue(CI.getArgOperand(I)); + + // Things on the stack are pointer-typed, meaning that they are already + // legal and can be emitted directly to target nodes. + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + Ops.push_back(DAG.getTargetFrameIndex( + FI->getIndex(), TLI.getFrameIndexTy(DAG.getDataLayout()))); + } else { + // Otherwise emit a target independent node to be legalised. + Ops.push_back(getValue(CI.getArgOperand(I))); + } + } // Create the STACKMAP node. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops); - Chain = SDValue(SM, 0); + Chain = DAG.getNode(ISD::STACKMAP, DL, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 72cca3d9b001..4a3ab00614b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -529,7 +529,7 @@ private: void visitShuffleVector(const User &I); void visitExtractValue(const ExtractValueInst &I); - void visitInsertValue(const User &I); + void visitInsertValue(const InsertValueInst &I); void visitLandingPad(const LandingPadInst &LP); void visitGetElementPtr(const User &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index bbfc6e5ef64f..9df0b64c26c3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -486,6 +486,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; case ISD::VECREDUCE_FMIN: return 
"vecreduce_fmin"; + case ISD::STACKMAP: + return "stackmap"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 2b63359c2b1b..7f453f081982 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -51,6 +50,7 @@ #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/SwiftErrorValueTracking.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -64,7 +64,6 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstIterator.h" @@ -345,47 +344,6 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -/// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that -/// may trap on it. In this case we have to split the edge so that the path -/// through the predecessor block that doesn't go to the phi block doesn't -/// execute the possibly trapping instruction. If available, we pass domtree -/// and loop info to be updated when we split critical edges. This is because -/// SelectionDAGISel preserves these analyses. -/// This is required for correctness, so it must be done at -O0. 
-/// -static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT, - LoopInfo *LI) { - // Loop for blocks with phi nodes. - for (BasicBlock &BB : Fn) { - PHINode *PN = dyn_cast<PHINode>(BB.begin()); - if (!PN) continue; - - ReprocessBlock: - // For each block with a PHI node, check to see if any of the input values - // are potentially trapping constant expressions. Constant expressions are - // the only potentially trapping value that can occur as the argument to a - // PHI. - for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I)); ++I) - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Constant *C = dyn_cast<Constant>(PN->getIncomingValue(i)); - if (!C || !C->canTrap()) continue; - - // The only case we have to worry about is when the edge is critical. - // Since this block has a PHI Node, we assume it has multiple input - // edges: check to see if the pred has multiple successors. - BasicBlock *Pred = PN->getIncomingBlock(i); - if (Pred->getTerminator()->getNumSuccessors() == 1) - continue; - - // Okay, we have to split this edge. - SplitCriticalEdge( - Pred->getTerminator(), GetSuccessorNumber(Pred, &BB), - CriticalEdgeSplittingOptions(DT, LI).setMergeIdenticalEdges()); - goto ReprocessBlock; - } - } -} - static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F, MachineModuleInfo &MMI) { // Only needed for MSVC @@ -445,10 +403,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(Fn); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; ORE = std::make_unique<OptimizationRemarkEmitter>(&Fn); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - LoopInfo *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); BlockFrequencyInfo *BFI = nullptr; if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOpt::None) @@ -456,8 +410,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI); - CurDAG->init(*MF, *ORE, this, LibInfo, getAnalysisIfAvailable<LegacyDivergenceAnalysis>(), PSI, BFI); FuncInfo->set(Fn, *MF, CurDAG); @@ -2241,6 +2193,52 @@ void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) { N->getOperand(0)); } +void SelectionDAGISel::Select_STACKMAP(SDNode *N) { + std::vector<SDValue> Ops; + auto *It = N->op_begin(); + SDLoc DL(N); + + // Stash the chain and glue operands so we can move them to the end. + SDValue Chain = *It++; + SDValue InFlag = *It++; + + // <id> operand. + SDValue ID = *It++; + assert(ID.getValueType() == MVT::i64); + Ops.push_back(ID); + + // <numShadowBytes> operand. + SDValue Shad = *It++; + assert(Shad.getValueType() == MVT::i32); + Ops.push_back(Shad); + + // Live variable operands. + for (; It != N->op_end(); It++) { + SDNode *OpNode = It->getNode(); + SDValue O; + + // FrameIndex nodes should have been directly emitted to TargetFrameIndex + // nodes at DAG-construction time. + assert(OpNode->getOpcode() != ISD::FrameIndex); + + if (OpNode->getOpcode() == ISD::Constant) { + Ops.push_back( + CurDAG->getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64)); + O = CurDAG->getTargetConstant( + cast<ConstantSDNode>(OpNode)->getZExtValue(), DL, It->getValueType()); + } else { + O = *It; + } + Ops.push_back(O); + } + + Ops.push_back(Chain); + Ops.push_back(InFlag); + + SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Glue); + CurDAG->SelectNodeTo(N, TargetOpcode::STACKMAP, NodeTys, Ops); +} + /// GetVBR - decode a vbr encoding whose top bit is set. 
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { @@ -2795,6 +2793,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::ARITH_FENCE: Select_ARITH_FENCE(NodeToMatch); return; + case ISD::STACKMAP: + Select_STACKMAP(NodeToMatch); + return; } assert(!NodeToMatch->isMachineOpcode() && "Node already selected!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 19a52fde44c1..3061158eea30 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -531,14 +531,14 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, for (const Value *V : SI.Bases) { auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt) { - assert(Opt.getValue() && + assert(Opt.value() && "non gc managed base pointer found in statepoint"); } } for (const Value *V : SI.Ptrs) { auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt) { - assert(Opt.getValue() && + assert(Opt.value() && "non gc managed derived pointer found in statepoint"); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a6b471ea22b7..66389a57f780 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1362,6 +1362,29 @@ bool TargetLowering::SimplifyDemandedBits( } } + // AND(INSERT_SUBVECTOR(C,X,I),M) -> INSERT_SUBVECTOR(AND(C,M),X,I) + // iff 'C' is Undef/Constant and AND(X,M) == X (for DemandedBits). 
+ if (Op0.getOpcode() == ISD::INSERT_SUBVECTOR && + (Op0.getOperand(0).isUndef() || + ISD::isBuildVectorOfConstantSDNodes(Op0.getOperand(0).getNode())) && + Op0->hasOneUse()) { + unsigned NumSubElts = + Op0.getOperand(1).getValueType().getVectorNumElements(); + unsigned SubIdx = Op0.getConstantOperandVal(2); + APInt DemandedSub = + APInt::getBitsSet(NumElts, SubIdx, SubIdx + NumSubElts); + KnownBits KnownSubMask = + TLO.DAG.computeKnownBits(Op1, DemandedSub & DemandedElts, Depth + 1); + if (DemandedBits.isSubsetOf(KnownSubMask.One)) { + SDValue NewAnd = + TLO.DAG.getNode(ISD::AND, dl, VT, Op0.getOperand(0), Op1); + SDValue NewInsert = + TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, NewAnd, + Op0.getOperand(1), Op0.getOperand(2)); + return TLO.CombineTo(Op, NewInsert); + } + } + if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1)) return true; @@ -1371,20 +1394,6 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); - // Attempt to avoid multi-use ops if we don't need anything from them. - if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { - SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( - Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); - SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( - Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); - if (DemandedOp0 || DemandedOp1) { - Op0 = DemandedOp0 ? DemandedOp0 : Op0; - Op1 = DemandedOp1 ? DemandedOp1 : Op1; - SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); - return TLO.CombineTo(Op, NewOp); - } - } - // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. 
if (DemandedBits.isSubsetOf(Known2.Zero | Known.One)) @@ -1402,6 +1411,20 @@ bool TargetLowering::SimplifyDemandedBits( if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) return true; + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + Known &= Known2; break; } @@ -1418,6 +1441,19 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // If all of the demanded bits are known zero on one side, return the other. + // These bits cannot contribute to the result of the 'or'. + if (DemandedBits.isSubsetOf(Known2.One | Known.Zero)) + return TLO.CombineTo(Op, Op0); + if (DemandedBits.isSubsetOf(Known.One | Known2.Zero)) + return TLO.CombineTo(Op, Op1); + // If the RHS is a constant, see if we can simplify it. + if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO)) + return true; + // If the operation can be done in a smaller type, do so. + if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) + return true; + // Attempt to avoid multi-use ops if we don't need anything from them. if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( @@ -1432,19 +1468,6 @@ bool TargetLowering::SimplifyDemandedBits( } } - // If all of the demanded bits are known zero on one side, return the other. - // These bits cannot contribute to the result of the 'or'. 
- if (DemandedBits.isSubsetOf(Known2.One | Known.Zero)) - return TLO.CombineTo(Op, Op0); - if (DemandedBits.isSubsetOf(Known.One | Known2.Zero)) - return TLO.CombineTo(Op, Op1); - // If the RHS is a constant, see if we can simplify it. - if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO)) - return true; - // If the operation can be done in a smaller type, do so. - if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) - return true; - Known |= Known2; break; } @@ -1461,20 +1484,6 @@ bool TargetLowering::SimplifyDemandedBits( return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); - // Attempt to avoid multi-use ops if we don't need anything from them. - if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { - SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( - Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); - SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( - Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); - if (DemandedOp0 || DemandedOp1) { - Op0 = DemandedOp0 ? DemandedOp0 : Op0; - Op1 = DemandedOp1 ? DemandedOp1 : Op1; - SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); - return TLO.CombineTo(Op, NewOp); - } - } - // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. if (DemandedBits.isSubsetOf(Known.Zero)) @@ -1519,6 +1528,20 @@ bool TargetLowering::SimplifyDemandedBits( if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO)) return true; + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + Op0 = DemandedOp0 ? 
DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } + Known ^= Known2; break; } @@ -1972,9 +1995,9 @@ bool TargetLowering::SimplifyDemandedBits( KnownBits Known1 = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1); Known = KnownBits::umin(Known0, Known1); if (Optional<bool> IsULE = KnownBits::ule(Known0, Known1)) - return TLO.CombineTo(Op, IsULE.getValue() ? Op0 : Op1); + return TLO.CombineTo(Op, IsULE.value() ? Op0 : Op1); if (Optional<bool> IsULT = KnownBits::ult(Known0, Known1)) - return TLO.CombineTo(Op, IsULT.getValue() ? Op0 : Op1); + return TLO.CombineTo(Op, IsULT.value() ? Op0 : Op1); break; } case ISD::UMAX: { @@ -1985,9 +2008,9 @@ bool TargetLowering::SimplifyDemandedBits( KnownBits Known1 = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1); Known = KnownBits::umax(Known0, Known1); if (Optional<bool> IsUGE = KnownBits::uge(Known0, Known1)) - return TLO.CombineTo(Op, IsUGE.getValue() ? Op0 : Op1); + return TLO.CombineTo(Op, IsUGE.value() ? Op0 : Op1); if (Optional<bool> IsUGT = KnownBits::ugt(Known0, Known1)) - return TLO.CombineTo(Op, IsUGT.getValue() ? Op0 : Op1); + return TLO.CombineTo(Op, IsUGT.value() ? Op0 : Op1); break; } case ISD::BITREVERSE: { @@ -2486,9 +2509,7 @@ bool TargetLowering::SimplifyDemandedBits( // won't wrap after simplification. Flags.setNoSignedWrap(false); Flags.setNoUnsignedWrap(false); - SDValue NewOp = - TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags); - return TLO.CombineTo(Op, NewOp); + Op->setFlags(Flags); } return true; } @@ -3031,15 +3052,15 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; } case ISD::VSELECT: { + SDValue Sel = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + // Try to transform the select condition based on the current demanded // elements. 
- // TODO: If a condition element is undef, we can choose from one arm of the - // select (and if one arm is undef, then we can propagate that to the - // result). - // TODO - add support for constant vselect masks (see IR version of this). - APInt UnusedUndef, UnusedZero; - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UnusedUndef, - UnusedZero, TLO, Depth + 1)) + APInt UndefSel, UndefZero; + if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, UndefZero, TLO, + Depth + 1)) return true; // See if we can simplify either vselect operand. @@ -3047,15 +3068,24 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt DemandedRHS(DemandedElts); APInt UndefLHS, ZeroLHS; APInt UndefRHS, ZeroRHS; - if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedLHS, UndefLHS, - ZeroLHS, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO, + Depth + 1)) return true; - if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedRHS, UndefRHS, - ZeroRHS, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO, + Depth + 1)) return true; KnownUndef = UndefLHS & UndefRHS; KnownZero = ZeroLHS & ZeroRHS; + + // If we know that the selected element is always zero, we don't need the + // select value element. 
+ APInt DemandedSel = DemandedElts & ~KnownZero; + if (DemandedSel != DemandedElts) + if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, UndefZero, TLO, + Depth + 1)) + return true; + break; } case ISD::VECTOR_SHUFFLE: { @@ -5239,17 +5269,13 @@ TargetLowering::ParseConstraints(const DataLayout &DL, case 32: case 64: case 128: - OpInfo.ConstraintVT = - MVT::getVT(IntegerType::get(OpTy->getContext(), BitSize), true); + OpTy = IntegerType::get(OpTy->getContext(), BitSize); break; } - } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) { - unsigned PtrSize = DL.getPointerSizeInBits(PT->getAddressSpace()); - OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize); - } else { - OpInfo.ConstraintVT = MVT::getVT(OpTy, true); } + EVT VT = getAsmOperandValueType(DL, OpTy, true); + OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other; ArgNo++; } } @@ -7833,7 +7859,7 @@ SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const { // return popcount(~x); // // Ref: "Hacker's Delight" by Henry Warren - for (unsigned i = 0; (1U << i) <= (NumBitsPerElt / 2); ++i) { + for (unsigned i = 0; (1U << i) < NumBitsPerElt; ++i) { SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, DAG.getNode(ISD::SRL, dl, VT, Op, Tmp)); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f3d68bd9c92d..2badbe34ae6a 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -449,9 +449,6 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { Name == ".llvmbc" || Name == ".llvmcmd") return SectionKind::getMetadata(); - if (Name == ".llvm.offloading") - return SectionKind::getExclude(); - if (Name.empty() || Name[0] != '.') return K; // Default implementation based on some magic section names. 
@@ -501,6 +498,9 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) { if (hasPrefix(Name, ".preinit_array")) return ELF::SHT_PREINIT_ARRAY; + if (hasPrefix(Name, ".llvm.offloading")) + return ELF::SHT_LLVM_OFFLOADING; + if (K.isBSS() || K.isThreadBSS()) return ELF::SHT_NOBITS; diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 50c52190c1f6..298359dea9af 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -1005,6 +1005,7 @@ void DWARFLinker::DIECloner::cloneExpression( // instead indicate the generic type. The same holds for // DW_OP_reinterpret, which is currently not supported. if (RefOffset > 0 || Op.getCode() != dwarf::DW_OP_convert) { + RefOffset += Unit.getOrigUnit().getOffset(); auto RefDie = Unit.getOrigUnit().getDIEForOffset(RefOffset); CompileUnit::DIEInfo &Info = Unit.getInfo(RefDie); if (DIE *Clone = Info.Clone) diff --git a/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp b/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp index d12f6c796e50..74803a3e495a 100644 --- a/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp +++ b/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp @@ -8,7 +8,6 @@ #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include <string> using namespace llvm; @@ -42,9 +41,9 @@ public: }; } // namespace -static llvm::ManagedStatic<CodeViewErrorCategory> CodeViewErrCategory; const std::error_category &llvm::codeview::CVErrorCategory() { - return *CodeViewErrCategory; + static CodeViewErrorCategory CodeViewErrCategory; + return CodeViewErrCategory; } char CodeViewError::ID; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index c785026f8461..2e567d8bc7ee 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1205,13 +1205,13 @@ void 
DWARFContext::addLocalsForDie(DWARFCompileUnit *CU, DWARFDie Subprogram, if (auto DeclFileAttr = Die.find(DW_AT_decl_file)) { if (const auto *LT = CU->getContext().getLineTableForUnit(CU)) LT->getFileNameByIndex( - DeclFileAttr->getAsUnsignedConstant().getValue(), + DeclFileAttr->getAsUnsignedConstant().value(), CU->getCompilationDir(), DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, Local.DeclFile); } if (auto DeclLineAttr = Die.find(DW_AT_decl_line)) - Local.DeclLine = DeclLineAttr->getAsUnsignedConstant().getValue(); + Local.DeclLine = DeclLineAttr->getAsUnsignedConstant().value(); Result.push_back(Local); return; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 2e0780e249aa..33856c12b3c9 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -327,20 +327,20 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, FileEntry.Source = Value; break; case DW_LNCT_directory_index: - FileEntry.DirIdx = Value.getAsUnsignedConstant().getValue(); + FileEntry.DirIdx = Value.getAsUnsignedConstant().value(); break; case DW_LNCT_timestamp: - FileEntry.ModTime = Value.getAsUnsignedConstant().getValue(); + FileEntry.ModTime = Value.getAsUnsignedConstant().value(); break; case DW_LNCT_size: - FileEntry.Length = Value.getAsUnsignedConstant().getValue(); + FileEntry.Length = Value.getAsUnsignedConstant().value(); break; case DW_LNCT_MD5: - if (!Value.getAsBlock() || Value.getAsBlock().getValue().size() != 16) + if (!Value.getAsBlock() || Value.getAsBlock().value().size() != 16) return createStringError( errc::invalid_argument, "failed to parse file entry because the MD5 hash is invalid"); - std::uninitialized_copy_n(Value.getAsBlock().getValue().begin(), 16, + std::uninitialized_copy_n(Value.getAsBlock().value().begin(), 16, FileEntry.Checksum.begin()); break; default: diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp 
b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 96c546250974..15a2d23c4fd2 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -136,23 +136,30 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die, auto Color = HighlightColor::Enumerator; if (Attr == DW_AT_decl_file || Attr == DW_AT_call_file) { Color = HighlightColor::String; - if (const auto *LT = U->getContext().getLineTableForUnit(U)) - if (LT->getFileNameByIndex( - *FormValue.getAsUnsignedConstant(), U->getCompilationDir(), - DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File)) { - File = '"' + File + '"'; - Name = File; + if (const auto *LT = U->getContext().getLineTableForUnit(U)) { + if (Optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) { + if (LT->getFileNameByIndex( + *Val, U->getCompilationDir(), + DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, + File)) { + File = '"' + File + '"'; + Name = File; + } } + } } else if (Optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) Name = AttributeValueString(Attr, *Val); if (!Name.empty()) WithColor(OS, Color) << Name; - else if (Attr == DW_AT_decl_line || Attr == DW_AT_call_line) - OS << *FormValue.getAsUnsignedConstant(); - else if (Attr == DW_AT_low_pc && - (FormValue.getAsAddress() == - dwarf::computeTombstoneAddress(U->getAddressByteSize()))) { + else if (Attr == DW_AT_decl_line || Attr == DW_AT_call_line) { + if (Optional<uint64_t> Val = FormValue.getAsUnsignedConstant()) + OS << *Val; + else + FormValue.dump(OS, DumpOpts); + } else if (Attr == DW_AT_low_pc && + (FormValue.getAsAddress() == + dwarf::computeTombstoneAddress(U->getAddressByteSize()))) { if (DumpOpts.Verbose) { FormValue.dump(OS, DumpOpts); OS << " ("; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index c704f8f583af..2be2a12aa025 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -704,6 +704,14 
@@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, } break; } + case DW_AT_call_line: + case DW_AT_decl_line: { + if (!AttrValue.Value.getAsUnsignedConstant()) { + ReportError("DIE has " + AttributeString(Attr) + + " with invalid encoding"); + } + break; + } default: break; } diff --git a/llvm/lib/DebugInfo/MSF/MSFError.cpp b/llvm/lib/DebugInfo/MSF/MSFError.cpp index 9df2158423a4..fd93c3e726cc 100644 --- a/llvm/lib/DebugInfo/MSF/MSFError.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFError.cpp @@ -8,7 +8,6 @@ #include "llvm/DebugInfo/MSF/MSFError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include <string> using namespace llvm; @@ -50,7 +49,9 @@ public: }; } // namespace -static llvm::ManagedStatic<MSFErrorCategory> MSFCategory; -const std::error_category &llvm::msf::MSFErrCategory() { return *MSFCategory; } +const std::error_category &llvm::msf::MSFErrCategory() { + static MSFErrorCategory MSFCategory; + return MSFCategory; +} char MSFError::ID; diff --git a/llvm/lib/DebugInfo/PDB/DIA/DIAError.cpp b/llvm/lib/DebugInfo/PDB/DIA/DIAError.cpp index 819651f77787..0bd93a0e9506 100644 --- a/llvm/lib/DebugInfo/PDB/DIA/DIAError.cpp +++ b/llvm/lib/DebugInfo/PDB/DIA/DIAError.cpp @@ -1,6 +1,5 @@ #include "llvm/DebugInfo/PDB/DIA/DIAError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; using namespace llvm::pdb; @@ -31,7 +30,9 @@ public: } }; -static llvm::ManagedStatic<DIAErrorCategory> DIACategory; -const std::error_category &llvm::pdb::DIAErrCategory() { return *DIACategory; } +const std::error_category &llvm::pdb::DIAErrCategory() { + static DIAErrorCategory DIACategory; + return DIACategory; +} char DIAError::ID; diff --git a/llvm/lib/DebugInfo/PDB/GenericError.cpp b/llvm/lib/DebugInfo/PDB/GenericError.cpp index 0e4cba3174b2..d6da2dd62140 100644 --- a/llvm/lib/DebugInfo/PDB/GenericError.cpp +++ b/llvm/lib/DebugInfo/PDB/GenericError.cpp @@ -8,7 +8,6 @@ #include 
"llvm/DebugInfo/PDB/GenericError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; using namespace llvm::pdb; @@ -42,7 +41,9 @@ public: }; } // namespace -static llvm::ManagedStatic<PDBErrorCategory> PDBCategory; -const std::error_category &llvm::pdb::PDBErrCategory() { return *PDBCategory; } +const std::error_category &llvm::pdb::PDBErrCategory() { + static PDBErrorCategory PDBCategory; + return PDBCategory; +} char PDBError::ID; diff --git a/llvm/lib/DebugInfo/PDB/Native/RawError.cpp b/llvm/lib/DebugInfo/PDB/Native/RawError.cpp index ed6cf0839675..31320288a603 100644 --- a/llvm/lib/DebugInfo/PDB/Native/RawError.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/RawError.cpp @@ -1,6 +1,5 @@ #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; using namespace llvm::pdb; @@ -47,7 +46,9 @@ public: }; } // namespace -static llvm::ManagedStatic<RawErrorCategory> RawCategory; -const std::error_category &llvm::pdb::RawErrCategory() { return *RawCategory; } +const std::error_category &llvm::pdb::RawErrCategory() { + static RawErrorCategory RawCategory; + return RawCategory; +} char RawError::ID; diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index d2ff8aa7c995..c239d4c260ec 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -327,6 +327,8 @@ Optional<ArrayRef<uint8_t>> getBuildID(const ELFFile<ELFT> &Obj) { return {}; } +} // end anonymous namespace + Optional<ArrayRef<uint8_t>> getBuildID(const ELFObjectFileBase *Obj) { Optional<ArrayRef<uint8_t>> BuildID; if (auto *O = dyn_cast<ELFObjectFile<ELF32LE>>(Obj)) @@ -342,8 +344,6 @@ Optional<ArrayRef<uint8_t>> getBuildID(const ELFObjectFileBase *Obj) { return BuildID; } -} // end anonymous namespace - ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath, const 
MachOObjectFile *MachExeObj, const std::string &ArchName) { diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 7b1c36fdbe09..ef4e11ca38e6 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -8,25 +8,39 @@ /// /// \file /// -/// This file defines the fetchInfo function, which retrieves -/// any of the three supported artifact types: (executable, debuginfo, source -/// file) associated with a build-id from debuginfod servers. If a source file -/// is to be fetched, its absolute path must be specified in the Description -/// argument to fetchInfo. +/// This file contains several definitions for the debuginfod client and server. +/// For the client, this file defines the fetchInfo function. For the server, +/// this file defines the DebuginfodLogEntry and DebuginfodServer structs, as +/// well as the DebuginfodLog, DebuginfodCollection classes. The fetchInfo +/// function retrieves any of the three supported artifact types: (executable, +/// debuginfo, source file) associated with a build-id from debuginfod servers. +/// If a source file is to be fetched, its absolute path must be specified in +/// the Description argument to fetchInfo. The DebuginfodLogEntry, +/// DebuginfodLog, and DebuginfodCollection are used by the DebuginfodServer to +/// scan the local filesystem for binaries and serve the debuginfod protocol. 
/// //===----------------------------------------------------------------------===// #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" #include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/CachePruning.h" #include "llvm/Support/Caching.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Path.h" +#include "llvm/Support/ThreadPool.h" #include "llvm/Support/xxhash.h" +#include <atomic> + namespace llvm { static std::string uniqueKey(llvm::StringRef S) { return utostr(xxHash64(S)); } @@ -46,6 +60,8 @@ Expected<SmallVector<StringRef>> getDefaultDebuginfodUrls() { return DebuginfodUrls; } +/// Finds a default local file caching directory for the debuginfod client, +/// first checking DEBUGINFOD_CACHE_PATH. Expected<std::string> getDefaultDebuginfodCacheDirectory() { if (const char *CacheDirectoryEnv = std::getenv("DEBUGINFOD_CACHE_PATH")) return CacheDirectoryEnv; @@ -208,4 +224,293 @@ Expected<std::string> getCachedOrDownloadArtifact( return createStringError(errc::argument_out_of_domain, "build id not found"); } + +DebuginfodLogEntry::DebuginfodLogEntry(const Twine &Message) + : Message(Message.str()) {} + +void DebuginfodLog::push(const Twine &Message) { + push(DebuginfodLogEntry(Message)); +} + +void DebuginfodLog::push(DebuginfodLogEntry Entry) { + { + std::lock_guard<std::mutex> Guard(QueueMutex); + LogEntryQueue.push(Entry); + } + QueueCondition.notify_one(); +} + +DebuginfodLogEntry DebuginfodLog::pop() { + { + std::unique_lock<std::mutex> Guard(QueueMutex); + // Wait for messages to be pushed into the queue. 
+ QueueCondition.wait(Guard, [&] { return !LogEntryQueue.empty(); }); + } + std::lock_guard<std::mutex> Guard(QueueMutex); + if (!LogEntryQueue.size()) + llvm_unreachable("Expected message in the queue."); + + DebuginfodLogEntry Entry = LogEntryQueue.front(); + LogEntryQueue.pop(); + return Entry; +} + +DebuginfodCollection::DebuginfodCollection(ArrayRef<StringRef> PathsRef, + DebuginfodLog &Log, ThreadPool &Pool, + double MinInterval) + : Log(Log), Pool(Pool), MinInterval(MinInterval) { + for (StringRef Path : PathsRef) + Paths.push_back(Path.str()); +} + +Error DebuginfodCollection::update() { + std::lock_guard<sys::Mutex> Guard(UpdateMutex); + if (UpdateTimer.isRunning()) + UpdateTimer.stopTimer(); + UpdateTimer.clear(); + for (const std::string &Path : Paths) { + Log.push("Updating binaries at path " + Path); + if (Error Err = findBinaries(Path)) + return Err; + } + Log.push("Updated collection"); + UpdateTimer.startTimer(); + return Error::success(); +} + +Expected<bool> DebuginfodCollection::updateIfStale() { + if (!UpdateTimer.isRunning()) + return false; + UpdateTimer.stopTimer(); + double Time = UpdateTimer.getTotalTime().getWallTime(); + UpdateTimer.startTimer(); + if (Time < MinInterval) + return false; + if (Error Err = update()) + return std::move(Err); + return true; +} + +Error DebuginfodCollection::updateForever(std::chrono::milliseconds Interval) { + while (true) { + if (Error Err = update()) + return Err; + std::this_thread::sleep_for(Interval); + } + llvm_unreachable("updateForever loop should never end"); +} + +static bool isDebugBinary(object::ObjectFile *Object) { + // TODO: handle PDB debuginfo + std::unique_ptr<DWARFContext> Context = DWARFContext::create( + *Object, DWARFContext::ProcessDebugRelocations::Process); + const DWARFObject &DObj = Context->getDWARFObj(); + unsigned NumSections = 0; + DObj.forEachInfoSections([&](const DWARFSection &S) { NumSections++; }); + return NumSections; +} + +static bool hasELFMagic(StringRef FilePath) { + 
file_magic Type; + std::error_code EC = identify_magic(FilePath, Type); + if (EC) + return false; + switch (Type) { + case file_magic::elf: + case file_magic::elf_relocatable: + case file_magic::elf_executable: + case file_magic::elf_shared_object: + case file_magic::elf_core: + return true; + default: + return false; + } +} + +Error DebuginfodCollection::findBinaries(StringRef Path) { + std::error_code EC; + sys::fs::recursive_directory_iterator I(Twine(Path), EC), E; + std::mutex IteratorMutex; + ThreadPoolTaskGroup IteratorGroup(Pool); + for (unsigned WorkerIndex = 0; WorkerIndex < Pool.getThreadCount(); + WorkerIndex++) { + IteratorGroup.async([&, this]() -> void { + std::string FilePath; + while (true) { + { + // Check if iteration is over or there is an error during iteration + std::lock_guard<std::mutex> Guard(IteratorMutex); + if (I == E || EC) + return; + // Grab a file path from the directory iterator and advance the + // iterator. + FilePath = I->path(); + I.increment(EC); + } + + // Inspect the file at this path to determine if it is debuginfo. 
+ if (!hasELFMagic(FilePath)) + continue; + + Expected<object::OwningBinary<object::Binary>> BinOrErr = + object::createBinary(FilePath); + + if (!BinOrErr) { + consumeError(BinOrErr.takeError()); + continue; + } + object::Binary *Bin = std::move(BinOrErr.get().getBinary()); + if (!Bin->isObject()) + continue; + + // TODO: Support non-ELF binaries + object::ELFObjectFileBase *Object = + dyn_cast<object::ELFObjectFileBase>(Bin); + if (!Object) + continue; + + Optional<BuildIDRef> ID = symbolize::getBuildID(Object); + if (!ID) + continue; + + std::string IDString = buildIDToString(ID.value()); + if (isDebugBinary(Object)) { + std::lock_guard<sys::RWMutex> DebugBinariesGuard(DebugBinariesMutex); + DebugBinaries[IDString] = FilePath; + } else { + std::lock_guard<sys::RWMutex> BinariesGuard(BinariesMutex); + Binaries[IDString] = FilePath; + } + } + }); + } + IteratorGroup.wait(); + std::unique_lock<std::mutex> Guard(IteratorMutex); + if (EC) + return errorCodeToError(EC); + return Error::success(); +} + +Expected<Optional<std::string>> +DebuginfodCollection::getBinaryPath(BuildIDRef ID) { + Log.push("getting binary path of ID " + buildIDToString(ID)); + std::shared_lock<sys::RWMutex> Guard(BinariesMutex); + auto Loc = Binaries.find(buildIDToString(ID)); + if (Loc != Binaries.end()) { + std::string Path = Loc->getValue(); + return Path; + } + return None; +} + +Expected<Optional<std::string>> +DebuginfodCollection::getDebugBinaryPath(BuildIDRef ID) { + Log.push("getting debug binary path of ID " + buildIDToString(ID)); + std::shared_lock<sys::RWMutex> Guard(DebugBinariesMutex); + auto Loc = DebugBinaries.find(buildIDToString(ID)); + if (Loc != DebugBinaries.end()) { + std::string Path = Loc->getValue(); + return Path; + } + return None; +} + +Expected<std::string> DebuginfodCollection::findBinaryPath(BuildIDRef ID) { + { + // Check collection; perform on-demand update if stale. 
+ Expected<Optional<std::string>> PathOrErr = getBinaryPath(ID); + if (!PathOrErr) + return PathOrErr.takeError(); + Optional<std::string> Path = *PathOrErr; + if (!Path) { + Expected<bool> UpdatedOrErr = updateIfStale(); + if (!UpdatedOrErr) + return UpdatedOrErr.takeError(); + if (*UpdatedOrErr) { + // Try once more. + PathOrErr = getBinaryPath(ID); + if (!PathOrErr) + return PathOrErr.takeError(); + Path = *PathOrErr; + } + } + if (Path) + return Path.value(); + } + + // Try federation. + Expected<std::string> PathOrErr = getCachedOrDownloadExecutable(ID); + if (!PathOrErr) + consumeError(PathOrErr.takeError()); + + // Fall back to debug binary. + return findDebugBinaryPath(ID); +} + +Expected<std::string> DebuginfodCollection::findDebugBinaryPath(BuildIDRef ID) { + // Check collection; perform on-demand update if stale. + Expected<Optional<std::string>> PathOrErr = getDebugBinaryPath(ID); + if (!PathOrErr) + return PathOrErr.takeError(); + Optional<std::string> Path = *PathOrErr; + if (!Path) { + Expected<bool> UpdatedOrErr = updateIfStale(); + if (!UpdatedOrErr) + return UpdatedOrErr.takeError(); + if (*UpdatedOrErr) { + // Try once more. + PathOrErr = getBinaryPath(ID); + if (!PathOrErr) + return PathOrErr.takeError(); + Path = *PathOrErr; + } + } + if (Path) + return Path.value(); + + // Try federation. 
+ return getCachedOrDownloadDebuginfo(ID); +} + +DebuginfodServer::DebuginfodServer(DebuginfodLog &Log, + DebuginfodCollection &Collection) + : Log(Log), Collection(Collection) { + cantFail( + Server.get(R"(/buildid/(.*)/debuginfo)", [&](HTTPServerRequest Request) { + Log.push("GET " + Request.UrlPath); + std::string IDString; + if (!tryGetFromHex(Request.UrlPathMatches[0], IDString)) { + Request.setResponse( + {404, "text/plain", "Build ID is not a hex string\n"}); + return; + } + BuildID ID(IDString.begin(), IDString.end()); + Expected<std::string> PathOrErr = Collection.findDebugBinaryPath(ID); + if (Error Err = PathOrErr.takeError()) { + consumeError(std::move(Err)); + Request.setResponse({404, "text/plain", "Build ID not found\n"}); + return; + } + streamFile(Request, *PathOrErr); + })); + cantFail( + Server.get(R"(/buildid/(.*)/executable)", [&](HTTPServerRequest Request) { + Log.push("GET " + Request.UrlPath); + std::string IDString; + if (!tryGetFromHex(Request.UrlPathMatches[0], IDString)) { + Request.setResponse( + {404, "text/plain", "Build ID is not a hex string\n"}); + return; + } + BuildID ID(IDString.begin(), IDString.end()); + Expected<std::string> PathOrErr = Collection.findBinaryPath(ID); + if (Error Err = PathOrErr.takeError()) { + consumeError(std::move(Err)); + Request.setResponse({404, "text/plain", "Build ID not found\n"}); + return; + } + streamFile(Request, *PathOrErr); + })); +} + } // namespace llvm diff --git a/llvm/lib/Debuginfod/HTTPServer.cpp b/llvm/lib/Debuginfod/HTTPServer.cpp new file mode 100644 index 000000000000..2ea923d5a734 --- /dev/null +++ b/llvm/lib/Debuginfod/HTTPServer.cpp @@ -0,0 +1,189 @@ +//===-- llvm/Debuginfod/HTTPServer.cpp - HTTP server library -----*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the methods of the HTTPServer class and the streamFile +/// function. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/HTTPServer.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" + +#ifdef LLVM_ENABLE_HTTPLIB +#include "httplib.h" +#endif + +using namespace llvm; + +bool llvm::streamFile(HTTPServerRequest &Request, StringRef FilePath) { + Expected<sys::fs::file_t> FDOrErr = sys::fs::openNativeFileForRead(FilePath); + if (Error Err = FDOrErr.takeError()) { + consumeError(std::move(Err)); + Request.setResponse({404u, "text/plain", "Could not open file to read.\n"}); + return false; + } + ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = + MemoryBuffer::getOpenFile(*FDOrErr, FilePath, + /*FileSize=*/-1, + /*RequiresNullTerminator=*/false); + sys::fs::closeFile(*FDOrErr); + if (Error Err = errorCodeToError(MBOrErr.getError())) { + consumeError(std::move(Err)); + Request.setResponse({404u, "text/plain", "Could not memory-map file.\n"}); + return false; + } + // Lambdas are copied on conversion to to std::function, preventing use of + // smart pointers. 
+ MemoryBuffer *MB = MBOrErr->release(); + Request.setResponse({200u, "application/octet-stream", MB->getBufferSize(), + [=](size_t Offset, size_t Length) -> StringRef { + return MB->getBuffer().substr(Offset, Length); + }, + [=](bool Success) { delete MB; }}); + return true; +} + +#ifdef LLVM_ENABLE_HTTPLIB + +bool HTTPServer::isAvailable() { return true; } + +HTTPServer::HTTPServer() { Server = std::make_unique<httplib::Server>(); } + +HTTPServer::~HTTPServer() { stop(); } + +static void expandUrlPathMatches(const std::smatch &Matches, + HTTPServerRequest &Request) { + bool UrlPathSet = false; + for (const auto &it : Matches) { + if (UrlPathSet) + Request.UrlPathMatches.push_back(it); + else { + Request.UrlPath = it; + UrlPathSet = true; + } + } +} + +HTTPServerRequest::HTTPServerRequest(const httplib::Request &HTTPLibRequest, + httplib::Response &HTTPLibResponse) + : HTTPLibResponse(HTTPLibResponse) { + expandUrlPathMatches(HTTPLibRequest.matches, *this); +} + +void HTTPServerRequest::setResponse(HTTPResponse Response) { + HTTPLibResponse.set_content(Response.Body.begin(), Response.Body.size(), + Response.ContentType); + HTTPLibResponse.status = Response.Code; +} + +void HTTPServerRequest::setResponse(StreamingHTTPResponse Response) { + HTTPLibResponse.set_content_provider( + Response.ContentLength, Response.ContentType, + [=](size_t Offset, size_t Length, httplib::DataSink &Sink) { + if (Offset < Response.ContentLength) { + StringRef Chunk = Response.Provider(Offset, Length); + Sink.write(Chunk.begin(), Chunk.size()); + } + return true; + }, + [=](bool Success) { Response.CompletionHandler(Success); }); + + HTTPLibResponse.status = Response.Code; +} + +Error HTTPServer::get(StringRef UrlPathPattern, HTTPRequestHandler Handler) { + std::string ErrorMessage; + if (!Regex(UrlPathPattern).isValid(ErrorMessage)) + return createStringError(errc::argument_out_of_domain, ErrorMessage); + Server->Get(std::string(UrlPathPattern), + [Handler](const httplib::Request 
&HTTPLibRequest, + httplib::Response &HTTPLibResponse) { + HTTPServerRequest Request(HTTPLibRequest, HTTPLibResponse); + Handler(Request); + }); + return Error::success(); +} + +Error HTTPServer::bind(unsigned ListenPort, const char *HostInterface) { + if (!Server->bind_to_port(HostInterface, ListenPort)) + return createStringError(errc::io_error, + "Could not assign requested address."); + Port = ListenPort; + return Error::success(); +} + +Expected<unsigned> HTTPServer::bind(const char *HostInterface) { + int ListenPort = Server->bind_to_any_port(HostInterface); + if (ListenPort < 0) + return createStringError(errc::io_error, + "Could not assign any port on requested address."); + return Port = ListenPort; +} + +Error HTTPServer::listen() { + if (!Port) + return createStringError(errc::io_error, + "Cannot listen without first binding to a port."); + if (!Server->listen_after_bind()) + return createStringError( + errc::io_error, + "An unknown error occurred when cpp-httplib attempted to listen."); + return Error::success(); +} + +void HTTPServer::stop() { + Server->stop(); + Port = 0; +} + +#else + +// TODO: Implement barebones standalone HTTP server implementation. 
+bool HTTPServer::isAvailable() { return false; } + +HTTPServer::HTTPServer() = default; + +HTTPServer::~HTTPServer() = default; + +void HTTPServerRequest::setResponse(HTTPResponse Response) { + llvm_unreachable("No HTTP server implementation available"); +} + +void HTTPServerRequest::setResponse(StreamingHTTPResponse Response) { + llvm_unreachable("No HTTP server implementation available"); +} + +Error HTTPServer::get(StringRef UrlPathPattern, HTTPRequestHandler Handler) { + llvm_unreachable("No HTTP server implementation available"); +} + +Error HTTPServer::bind(unsigned ListenPort, const char *HostInterface) { + llvm_unreachable("No HTTP server implementation available"); +} + +Expected<unsigned> HTTPServer::bind(const char *HostInterface) { + llvm_unreachable("No HTTP server implementation available"); +} + +Error HTTPServer::listen() { + llvm_unreachable("No HTTP server implementation available"); +} + +void HTTPServer::stop() { + llvm_unreachable("No HTTP server implementation available"); +} + +#endif // LLVM_ENABLE_HTTPLIB diff --git a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp index 29a623ebe449..f1eeee3b3599 100644 --- a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp +++ b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp @@ -12,7 +12,6 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" #include <mutex> @@ -91,11 +90,18 @@ typedef llvm::DenseMap<JITEventListener::ObjectKey, RegisteredObjectInfo> /// object files that are in executable memory managed by the client of this /// class. class GDBJITRegistrationListener : public JITEventListener { + /// Lock used to serialize all jit registration events, since they + /// modify global variables. 
+ /// + /// Only a single instance of GDBJITRegistrationListener is ever created, + /// and so the lock can be a member variable of that instance. This ensures + /// destructors are run in the correct order. + sys::Mutex JITDebugLock; + /// A map of in-memory object files that have been registered with the /// JIT interface. RegisteredObjectBufferMap ObjectBufferMap; -public: /// Instantiates the JIT service. GDBJITRegistrationListener() = default; @@ -103,6 +109,12 @@ public: /// internal resources. ~GDBJITRegistrationListener() override; +public: + static GDBJITRegistrationListener &instance() { + static GDBJITRegistrationListener Instance; + return Instance; + } + /// Creates an entry in the JIT registry for the buffer @p Object, /// which must contain an object file in executable memory with any /// debug information for the debugger. @@ -121,10 +133,6 @@ private: void deregisterObjectInternal(RegisteredObjectBufferMap::iterator I); }; -/// Lock used to serialize all jit registration events, since they -/// modify global variables. -ManagedStatic<sys::Mutex> JITDebugLock; - /// Do the registration. void NotifyDebugger(jit_code_entry* JITCodeEntry) { __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; @@ -143,7 +151,7 @@ void NotifyDebugger(jit_code_entry* JITCodeEntry) { GDBJITRegistrationListener::~GDBJITRegistrationListener() { // Free all registered object files. 
- std::lock_guard<llvm::sys::Mutex> locked(*JITDebugLock); + std::lock_guard<llvm::sys::Mutex> locked(JITDebugLock); for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end(); I != E; ++I) { @@ -167,7 +175,7 @@ void GDBJITRegistrationListener::notifyObjectLoaded( const char *Buffer = DebugObj.getBinary()->getMemoryBufferRef().getBufferStart(); size_t Size = DebugObj.getBinary()->getMemoryBufferRef().getBufferSize(); - std::lock_guard<llvm::sys::Mutex> locked(*JITDebugLock); + std::lock_guard<llvm::sys::Mutex> locked(JITDebugLock); assert(ObjectBufferMap.find(K) == ObjectBufferMap.end() && "Second attempt to perform debug registration."); jit_code_entry* JITCodeEntry = new jit_code_entry(); @@ -186,7 +194,7 @@ void GDBJITRegistrationListener::notifyObjectLoaded( } void GDBJITRegistrationListener::notifyFreeingObject(ObjectKey K) { - std::lock_guard<llvm::sys::Mutex> locked(*JITDebugLock); + std::lock_guard<llvm::sys::Mutex> locked(JITDebugLock); RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(K); if (I != ObjectBufferMap.end()) { @@ -228,14 +236,12 @@ void GDBJITRegistrationListener::deregisterObjectInternal( JITCodeEntry = nullptr; } -llvm::ManagedStatic<GDBJITRegistrationListener> GDBRegListener; - } // end namespace namespace llvm { JITEventListener* JITEventListener::createGDBRegistrationListener() { - return &*GDBRegListener; + return &GDBJITRegistrationListener::instance(); } } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF.cpp new file mode 100644 index 000000000000..fddc9b813fb2 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/COFF.cpp @@ -0,0 +1,137 @@ +//===-------------- COFF.cpp - JIT linker function for COFF -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// COFF jit-link function. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/COFF.h" + +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/ExecutionEngine/JITLink/COFF_x86_64.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MemoryBuffer.h" +#include <cstring> + +using namespace llvm; + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { + +static StringRef getMachineName(uint16_t Machine) { + switch (Machine) { + case COFF::IMAGE_FILE_MACHINE_I386: + return "i386"; + case COFF::IMAGE_FILE_MACHINE_AMD64: + return "x86_64"; + case COFF::IMAGE_FILE_MACHINE_ARMNT: + return "ARM"; + case COFF::IMAGE_FILE_MACHINE_ARM64: + return "ARM64"; + default: + return "unknown"; + } +} + +Expected<std::unique_ptr<LinkGraph>> +createLinkGraphFromCOFFObject(MemoryBufferRef ObjectBuffer) { + StringRef Data = ObjectBuffer.getBuffer(); + + // Check magic + auto Magic = identify_magic(ObjectBuffer.getBuffer()); + if (Magic != file_magic::coff_object) + return make_error<JITLinkError>("Invalid COFF buffer"); + + if (Data.size() < sizeof(object::coff_file_header)) + return make_error<JITLinkError>("Truncated COFF buffer"); + + uint64_t CurPtr = 0; + bool IsPE = false; + + // Check if this is a PE/COFF file. + if (Data.size() >= sizeof(object::dos_header) + sizeof(COFF::PEMagic)) { + const auto *DH = + reinterpret_cast<const object::dos_header *>(Data.data() + CurPtr); + if (DH->Magic[0] == 'M' && DH->Magic[1] == 'Z') { + // Check the PE magic bytes. 
("PE\0\0") + CurPtr = DH->AddressOfNewExeHeader; + if (memcmp(Data.data() + CurPtr, COFF::PEMagic, sizeof(COFF::PEMagic)) != + 0) { + return make_error<JITLinkError>("Incorrect PE magic"); + } + CurPtr += sizeof(COFF::PEMagic); + IsPE = true; + } + } + if (Data.size() < CurPtr + sizeof(object::coff_file_header)) + return make_error<JITLinkError>("Truncated COFF buffer"); + + const object::coff_file_header *COFFHeader = + reinterpret_cast<const object::coff_file_header *>(Data.data() + CurPtr); + const object::coff_bigobj_file_header *COFFBigObjHeader = nullptr; + + // Deal with bigobj file + if (!IsPE && COFFHeader->Machine == COFF::IMAGE_FILE_MACHINE_UNKNOWN && + COFFHeader->NumberOfSections == uint16_t(0xffff) && + Data.size() >= sizeof(object::coff_bigobj_file_header)) { + if (Data.size() < sizeof(object::coff_file_header)) { + return make_error<JITLinkError>("Truncated COFF buffer"); + } + COFFBigObjHeader = + reinterpret_cast<const object::coff_bigobj_file_header *>(Data.data() + + CurPtr); + + // Verify that we are dealing with bigobj. + if (COFFBigObjHeader->Version >= COFF::BigObjHeader::MinBigObjectVersion && + std::memcmp(COFFBigObjHeader->UUID, COFF::BigObjMagic, + sizeof(COFF::BigObjMagic)) == 0) { + COFFHeader = nullptr; + CurPtr += sizeof(object::coff_bigobj_file_header); + } else + COFFBigObjHeader = nullptr; + } + + uint16_t Machine = + COFFHeader ? COFFHeader->Machine : COFFBigObjHeader->Machine; + LLVM_DEBUG({ + dbgs() << "jitLink_COFF: PE = " << (IsPE ? "yes" : "no") + << ", bigobj = " << (COFFBigObjHeader ? 
"yes" : "no") + << ", identifier = \"" << ObjectBuffer.getBufferIdentifier() << "\" " + << "machine = " << getMachineName(Machine) << "\n"; + }); + + switch (Machine) { + case COFF::IMAGE_FILE_MACHINE_AMD64: + return createLinkGraphFromCOFFObject_x86_64(ObjectBuffer); + default: + return make_error<JITLinkError>( + "Unsupported target machine architecture in COFF object " + + ObjectBuffer.getBufferIdentifier() + ": " + getMachineName(Machine)); + } +} + +void link_COFF(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx) { + switch (G->getTargetTriple().getArch()) { + case Triple::x86_64: + link_COFF_x86_64(std::move(G), std::move(Ctx)); + return; + default: + Ctx->notifyFailed(make_error<JITLinkError>( + "Unsupported target machine architecture in COFF link graph " + + G->getName())); + return; + } +} + +} // end namespace jitlink +} // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp new file mode 100644 index 000000000000..43b9c2ba400b --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp @@ -0,0 +1,527 @@ +//=--------- COFFLinkGraphBuilder.cpp - COFF LinkGraph builder ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic COFF LinkGraph buliding code. 
+// +//===----------------------------------------------------------------------===// +#include "COFFLinkGraphBuilder.h" + +#define DEBUG_TYPE "jitlink" + +static const char *CommonSectionName = "__common"; + +namespace llvm { +namespace jitlink { + +COFFLinkGraphBuilder::COFFLinkGraphBuilder( + const object::COFFObjectFile &Obj, Triple TT, + LinkGraph::GetEdgeKindNameFunction GetEdgeKindName) + : Obj(Obj), + G(std::make_unique<LinkGraph>( + Obj.getFileName().str(), Triple(std::move(TT)), getPointerSize(Obj), + getEndianness(Obj), std::move(GetEdgeKindName))) { + LLVM_DEBUG({ + dbgs() << "Created COFFLinkGraphBuilder for \"" << Obj.getFileName() + << "\"\n"; + }); +} + +COFFLinkGraphBuilder::~COFFLinkGraphBuilder() = default; + +unsigned +COFFLinkGraphBuilder::getPointerSize(const object::COFFObjectFile &Obj) { + return Obj.getBytesInAddress(); +} + +support::endianness +COFFLinkGraphBuilder::getEndianness(const object::COFFObjectFile &Obj) { + return Obj.isLittleEndian() ? support::little : support::big; +} + +uint64_t COFFLinkGraphBuilder::getSectionSize(const object::COFFObjectFile &Obj, + const object::coff_section *Sec) { + // Consider the difference between executable form and object form. 
+ // More information is inside COFFObjectFile::getSectionSize + if (Obj.getDOSHeader()) + return std::min(Sec->VirtualSize, Sec->SizeOfRawData); + return Sec->SizeOfRawData; +} + +uint64_t +COFFLinkGraphBuilder::getSectionAddress(const object::COFFObjectFile &Obj, + const object::coff_section *Section) { + return Section->VirtualAddress + Obj.getImageBase(); +} + +bool COFFLinkGraphBuilder::isComdatSection( + const object::coff_section *Section) { + return Section->Characteristics & COFF::IMAGE_SCN_LNK_COMDAT; +} + +Section &COFFLinkGraphBuilder::getCommonSection() { + if (!CommonSection) + CommonSection = + &G->createSection(CommonSectionName, MemProt::Read | MemProt::Write); + return *CommonSection; +} + +Expected<std::unique_ptr<LinkGraph>> COFFLinkGraphBuilder::buildGraph() { + if (!Obj.isRelocatableObject()) + return make_error<JITLinkError>("Object is not a relocatable COFF file"); + + if (auto Err = graphifySections()) + return std::move(Err); + + if (auto Err = graphifySymbols()) + return std::move(Err); + + if (auto Err = addRelocations()) + return std::move(Err); + + return std::move(G); +} + +StringRef +COFFLinkGraphBuilder::getCOFFSectionName(COFFSectionIndex SectionIndex, + const object::coff_section *Sec, + object::COFFSymbolRef Sym) { + switch (SectionIndex) { + case COFF::IMAGE_SYM_UNDEFINED: { + if (Sym.getValue()) + return "(common)"; + else + return "(external)"; + } + case COFF::IMAGE_SYM_ABSOLUTE: + return "(absolute)"; + case COFF::IMAGE_SYM_DEBUG: { + // Used with .file symbol + return "(debug)"; + } + default: { + // Non reserved regular section numbers + if (Expected<StringRef> SecNameOrErr = Obj.getSectionName(Sec)) + return *SecNameOrErr; + } + } + return ""; +} + +Error COFFLinkGraphBuilder::graphifySections() { + LLVM_DEBUG(dbgs() << " Creating graph sections...\n"); + + GraphBlocks.resize(Obj.getNumberOfSections() + 1); + // For each section... 
+ for (COFFSectionIndex SecIndex = 1; + SecIndex <= static_cast<COFFSectionIndex>(Obj.getNumberOfSections()); + SecIndex++) { + Expected<const object::coff_section *> Sec = Obj.getSection(SecIndex); + if (!Sec) + return Sec.takeError(); + + StringRef SectionName; + if (Expected<StringRef> SecNameOrErr = Obj.getSectionName(*Sec)) + SectionName = *SecNameOrErr; + + bool IsDiscardable = + (*Sec)->Characteristics & + (COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_LNK_INFO); + if (IsDiscardable) { + LLVM_DEBUG(dbgs() << " " << SecIndex << ": \"" << SectionName + << "\" is discardable: " + "No graph section will be created.\n"); + continue; + } + + // FIXME: Skip debug info sections + + LLVM_DEBUG({ + dbgs() << " " + << "Creating section for \"" << SectionName << "\"\n"; + }); + + // Get the section's memory protection flags. + MemProt Prot = MemProt::None; + if ((*Sec)->Characteristics & COFF::IMAGE_SCN_MEM_EXECUTE) + Prot |= MemProt::Exec; + if ((*Sec)->Characteristics & COFF::IMAGE_SCN_MEM_READ) + Prot |= MemProt::Read; + if ((*Sec)->Characteristics & COFF::IMAGE_SCN_MEM_WRITE) + Prot |= MemProt::Write; + + // Look for existing sections first. 
+ auto *GraphSec = G->findSectionByName(SectionName); + if (!GraphSec) + GraphSec = &G->createSection(SectionName, Prot); + if (GraphSec->getMemProt() != Prot) + return make_error<JITLinkError>("MemProt should match"); + + Block *B = nullptr; + if ((*Sec)->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) + B = &G->createZeroFillBlock( + *GraphSec, getSectionSize(Obj, *Sec), + orc::ExecutorAddr(getSectionAddress(Obj, *Sec)), + (*Sec)->getAlignment(), 0); + else { + ArrayRef<uint8_t> Data; + if (auto Err = Obj.getSectionContents(*Sec, Data)) + return Err; + + B = &G->createContentBlock( + *GraphSec, + ArrayRef<char>(reinterpret_cast<const char *>(Data.data()), + Data.size()), + orc::ExecutorAddr(getSectionAddress(Obj, *Sec)), + (*Sec)->getAlignment(), 0); + } + + setGraphBlock(SecIndex, B); + } + + return Error::success(); +} + +Error COFFLinkGraphBuilder::graphifySymbols() { + LLVM_DEBUG(dbgs() << " Creating graph symbols...\n"); + + SymbolSets.resize(Obj.getNumberOfSections() + 1); + GraphSymbols.resize(Obj.getNumberOfSymbols()); + + for (COFFSymbolIndex SymIndex = 0; + SymIndex < static_cast<COFFSymbolIndex>(Obj.getNumberOfSymbols()); + SymIndex++) { + Expected<object::COFFSymbolRef> Sym = Obj.getSymbol(SymIndex); + if (!Sym) + return Sym.takeError(); + + StringRef SymbolName; + if (Expected<StringRef> SymNameOrErr = Obj.getSymbolName(*Sym)) + SymbolName = *SymNameOrErr; + + COFFSectionIndex SectionIndex = Sym->getSectionNumber(); + const object::coff_section *Sec = nullptr; + + if (!COFF::isReservedSectionNumber(SectionIndex)) { + auto SecOrErr = Obj.getSection(SectionIndex); + if (!SecOrErr) + return make_error<JITLinkError>( + "Invalid COFF section number:" + formatv("{0:d}: ", SectionIndex) + + " (" + toString(SecOrErr.takeError()) + ")"); + Sec = *SecOrErr; + } + + // Create jitlink symbol + jitlink::Symbol *GSym = nullptr; + if (Sym->isFileRecord()) + LLVM_DEBUG({ + dbgs() << " " << SymIndex << ": Skipping FileRecord symbol \"" + << SymbolName << 
"\" in " + << getCOFFSectionName(SectionIndex, Sec, *Sym) + << " (index: " << SectionIndex << ") \n"; + }); + else if (Sym->isUndefined()) { + LLVM_DEBUG({ + dbgs() << " " << SymIndex + << ": Creating external graph symbol for COFF symbol \"" + << SymbolName << "\" in " + << getCOFFSectionName(SectionIndex, Sec, *Sym) + << " (index: " << SectionIndex << ") \n"; + }); + GSym = + &G->addExternalSymbol(SymbolName, Sym->getValue(), Linkage::Strong); + } else if (Sym->isWeakExternal()) { + COFFSymbolIndex TagIndex = + Sym->getAux<object::coff_aux_weak_external>()->TagIndex; + assert(Sym->getAux<object::coff_aux_weak_external>()->Characteristics != + COFF::IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY && + "IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY is not supported."); + assert(Sym->getAux<object::coff_aux_weak_external>()->Characteristics != + COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY && + "IMAGE_WEAK_EXTERN_SEARCH_LIBRARY is not supported."); + WeakAliasRequests.push_back({SymIndex, TagIndex, SymbolName}); + } else { + Expected<jitlink::Symbol *> NewGSym = + createDefinedSymbol(SymIndex, SymbolName, *Sym, Sec); + if (!NewGSym) + return NewGSym.takeError(); + GSym = *NewGSym; + if (GSym) { + LLVM_DEBUG({ + dbgs() << " " << SymIndex + << ": Creating defined graph symbol for COFF symbol \"" + << SymbolName << "\" in " + << getCOFFSectionName(SectionIndex, Sec, *Sym) + << " (index: " << SectionIndex << ") \n"; + dbgs() << " " << *GSym << "\n"; + }); + } + } + + // Register the symbol + if (GSym) + setGraphSymbol(SectionIndex, SymIndex, *GSym); + SymIndex += Sym->getNumberOfAuxSymbols(); + } + + if (auto Err = flushWeakAliasRequests()) + return Err; + + if (auto Err = calculateImplicitSizeOfSymbols()) + return Err; + + return Error::success(); +} + +Error COFFLinkGraphBuilder::flushWeakAliasRequests() { + // Export the weak external symbols and alias it + for (auto &WeakAlias : WeakAliasRequests) { + if (auto *Target = getGraphSymbol(WeakAlias.Target)) { + Expected<object::COFFSymbolRef> 
AliasSymbol = + Obj.getSymbol(WeakAlias.Alias); + if (!AliasSymbol) + return AliasSymbol.takeError(); + + // FIXME: Support this when there's a way to handle this. + if (!Target->isDefined()) + return make_error<JITLinkError>("Weak external symbol with external " + "symbol as alternative not supported."); + + jitlink::Symbol *NewSymbol = &G->addDefinedSymbol( + Target->getBlock(), Target->getOffset(), WeakAlias.SymbolName, + Target->getSize(), Linkage::Weak, Scope::Default, + Target->isCallable(), false); + setGraphSymbol(AliasSymbol->getSectionNumber(), WeakAlias.Alias, + *NewSymbol); + LLVM_DEBUG({ + dbgs() << " " << WeakAlias.Alias + << ": Creating weak external symbol for COFF symbol \"" + << WeakAlias.SymbolName << "\" in section " + << AliasSymbol->getSectionNumber() << "\n"; + dbgs() << " " << *NewSymbol << "\n"; + }); + } else + return make_error<JITLinkError>("Weak symbol alias requested but actual " + "symbol not found for symbol " + + formatv("{0:d}", WeakAlias.Alias)); + } + return Error::success(); +} + +// In COFF, most of the defined symbols don't contain the size information. +// Hence, we calculate the "implicit" size of symbol by taking the delta of +// offsets of consecutive symbols within a block. We maintain a balanced tree +// set of symbols sorted by offset per each block in order to achieve +// logarithmic time complexity of sorted symbol insertion. Symbol is inserted to +// the set once it's processed in graphifySymbols. In this function, we iterate +// each collected symbol in sorted order and calculate the implicit size. 
+Error COFFLinkGraphBuilder::calculateImplicitSizeOfSymbols() { + for (COFFSectionIndex SecIndex = 1; + SecIndex <= static_cast<COFFSectionIndex>(Obj.getNumberOfSections()); + SecIndex++) { + auto &SymbolSet = SymbolSets[SecIndex]; + jitlink::Block *B = getGraphBlock(SecIndex); + orc::ExecutorAddrDiff LastOffset = B->getSize(); + orc::ExecutorAddrDiff LastDifferentOffset = B->getSize(); + orc::ExecutorAddrDiff LastSize = 0; + for (auto It = SymbolSet.rbegin(); It != SymbolSet.rend(); It++) { + orc::ExecutorAddrDiff Offset = It->first; + jitlink::Symbol *Symbol = It->second; + orc::ExecutorAddrDiff CandSize; + // Last offset can be same when aliasing happened + if (Symbol->getOffset() == LastOffset) + CandSize = LastSize; + else + CandSize = LastOffset - Offset; + + LLVM_DEBUG({ + if (Offset + Symbol->getSize() > LastDifferentOffset) + dbgs() << " Overlapping symbol range generated for the following " + "symbol:" + << "\n" + << " " << *Symbol << "\n"; + }); + (void)LastDifferentOffset; + if (LastOffset != Offset) + LastDifferentOffset = Offset; + LastSize = CandSize; + LastOffset = Offset; + if (Symbol->getSize()) { + // Non empty symbol can happen in COMDAT symbol. + // We don't consider the possibility of overlapping symbol range that + // could be introduced by disparity between inferred symbol size and + // defined symbol size because symbol size information is currently only + // used by jitlink-check where we have control to not make overlapping + // ranges. 
+ continue; + } + + LLVM_DEBUG({ + if (!CandSize) + dbgs() << " Empty implicit symbol size generated for the following " + "symbol:" + << "\n" + << " " << *Symbol << "\n"; + }); + + Symbol->setSize(CandSize); + } + } + return Error::success(); +} + +Expected<Symbol *> COFFLinkGraphBuilder::createDefinedSymbol( + COFFSymbolIndex SymIndex, StringRef SymbolName, + object::COFFSymbolRef Symbol, const object::coff_section *Section) { + if (Symbol.isCommon()) { + // FIXME: correct alignment + return &G->addCommonSymbol(SymbolName, Scope::Default, getCommonSection(), + orc::ExecutorAddr(), Symbol.getValue(), + Symbol.getValue(), false); + } + if (Symbol.isAbsolute()) + return &G->addAbsoluteSymbol(SymbolName, + orc::ExecutorAddr(Symbol.getValue()), 0, + Linkage::Strong, Scope::Local, false); + + if (llvm::COFF::isReservedSectionNumber(Symbol.getSectionNumber())) + return make_error<JITLinkError>( + "Reserved section number used in regular symbol " + + formatv("{0:d}", SymIndex)); + + Block *B = getGraphBlock(Symbol.getSectionNumber()); + if (Symbol.isExternal()) { + // This is not a comdat sequence, export the symbol as it is + if (!isComdatSection(Section)) + return &G->addDefinedSymbol( + *B, Symbol.getValue(), SymbolName, 0, Linkage::Strong, Scope::Default, + Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false); + else { + if (!PendingComdatExport) + return make_error<JITLinkError>("No pending COMDAT export for symbol " + + formatv("{0:d}", SymIndex)); + if (PendingComdatExport->SectionIndex != Symbol.getSectionNumber()) + return make_error<JITLinkError>( + "COMDAT export section number mismatch for symbol " + + formatv("{0:d}", SymIndex)); + return exportCOMDATSymbol(SymIndex, SymbolName, Symbol); + } + } + + if (Symbol.getStorageClass() == COFF::IMAGE_SYM_CLASS_STATIC) { + const object::coff_aux_section_definition *Definition = + Symbol.getSectionDefinition(); + if (!Definition || !isComdatSection(Section)) { + // Handle typical static symbol + return 
&G->addDefinedSymbol( + *B, Symbol.getValue(), SymbolName, 0, Linkage::Strong, Scope::Local, + Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false); + } + if (Definition->Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + // FIXME: don't dead strip this when parent section is alive + return &G->addDefinedSymbol( + *B, Symbol.getValue(), SymbolName, 0, Linkage::Strong, Scope::Local, + Symbol.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION, false); + } + if (PendingComdatExport) + return make_error<JITLinkError>( + "COMDAT export request already exists before symbol " + + formatv("{0:d}", SymIndex)); + return createCOMDATExportRequest(SymIndex, Symbol, Definition); + } + return make_error<JITLinkError>("Unsupported storage class " + + formatv("{0:d}", Symbol.getStorageClass()) + + " in symbol " + formatv("{0:d}", SymIndex)); +} + +// COMDAT handling: +// When IMAGE_SCN_LNK_COMDAT flag is set in the flags of a section, +// the section is called a COMDAT section. It contains two symbols +// in a sequence that specifes the behavior. First symbol is the section +// symbol which contains the size and name of the section. It also contains +// selection type that specifies how duplicate of the symbol is handled. +// Second symbol is COMDAT symbol which usually defines the external name and +// data type. +// +// Since two symbols always come in a specific order, we initiate pending COMDAT +// export request when we encounter the first symbol and actually exports it +// when we process the second symbol. +// +// Process the first symbol of COMDAT sequence. 
+Expected<Symbol *> COFFLinkGraphBuilder::createCOMDATExportRequest( + COFFSymbolIndex SymIndex, object::COFFSymbolRef Symbol, + const object::coff_aux_section_definition *Definition) { + Block *B = getGraphBlock(Symbol.getSectionNumber()); + Linkage L = Linkage::Strong; + switch (Definition->Selection) { + case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: { + L = Linkage::Strong; + break; + } + case COFF::IMAGE_COMDAT_SELECT_ANY: { + L = Linkage::Weak; + break; + } + case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: + case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: { + // FIXME: Implement size/content validation when LinkGraph is able to + // handle this. + L = Linkage::Weak; + break; + } + case COFF::IMAGE_COMDAT_SELECT_LARGEST: { + // FIXME: Support IMAGE_COMDAT_SELECT_LARGEST when LinkGraph is able to + // handle this. + return make_error<JITLinkError>( + "IMAGE_COMDAT_SELECT_LARGEST is not supported."); + } + case COFF::IMAGE_COMDAT_SELECT_NEWEST: { + // Even link.exe doesn't support this selection properly. + return make_error<JITLinkError>( + "IMAGE_COMDAT_SELECT_NEWEST is not supported."); + } + default: { + return make_error<JITLinkError>("Invalid comdat selection type: " + + formatv("{0:d}", Definition->Selection)); + } + } + PendingComdatExport = {SymIndex, Symbol.getSectionNumber(), L}; + return &G->addAnonymousSymbol(*B, Symbol.getValue(), Definition->Length, + false, false); +} + +// Process the second symbol of COMDAT sequence. 
+Expected<Symbol *> +COFFLinkGraphBuilder::exportCOMDATSymbol(COFFSymbolIndex SymIndex, + StringRef SymbolName, + object::COFFSymbolRef Symbol) { + COFFSymbolIndex TargetIndex = PendingComdatExport->SymbolIndex; + Linkage L = PendingComdatExport->Linkage; + jitlink::Symbol *Target = getGraphSymbol(TargetIndex); + assert(Target && "COMDAT leaader is invalid."); + assert((llvm::count_if(G->defined_symbols(), + [&](const jitlink::Symbol *Sym) { + return Sym->getName() == SymbolName; + }) == 0) && + "Duplicate defined symbol"); + Target->setName(SymbolName); + Target->setLinkage(L); + Target->setCallable(Symbol.getComplexType() == + COFF::IMAGE_SYM_DTYPE_FUNCTION); + Target->setScope(Scope::Default); + LLVM_DEBUG({ + dbgs() << " " << SymIndex + << ": Exporting COMDAT graph symbol for COFF symbol \"" << SymbolName + << "\" in section " << Symbol.getSectionNumber() << "\n"; + dbgs() << " " << *Target << "\n"; + }); + PendingComdatExport = None; + return Target; +} + +} // namespace jitlink +} // namespace llvm
\ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h new file mode 100644 index 000000000000..4dc1b14dc4a2 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.h @@ -0,0 +1,199 @@ +//===----- COFFLinkGraphBuilder.h - COFF LinkGraph builder ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic COFF LinkGraph building code. +// +//===----------------------------------------------------------------------===// + +#ifndef LIB_EXECUTIONENGINE_JITLINK_COFFLINKGRAPHBUILDER_H +#define LIB_EXECUTIONENGINE_JITLINK_COFFLINKGRAPHBUILDER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/Object/COFF.h" + +#include "EHFrameSupportImpl.h" +#include "JITLinkGeneric.h" + +#define DEBUG_TYPE "jitlink" + +#include <list> + +namespace llvm { +namespace jitlink { + +class COFFLinkGraphBuilder { +public: + virtual ~COFFLinkGraphBuilder(); + Expected<std::unique_ptr<LinkGraph>> buildGraph(); + +protected: + using COFFSectionIndex = int32_t; + using COFFSymbolIndex = int32_t; + + COFFLinkGraphBuilder(const object::COFFObjectFile &Obj, Triple TT, + LinkGraph::GetEdgeKindNameFunction GetEdgeKindName); + + LinkGraph &getGraph() const { return *G; } + + const object::COFFObjectFile &getObject() const { return Obj; } + + virtual Error addRelocations() = 0; + + Error graphifySections(); + Error graphifySymbols(); + + void setGraphSymbol(COFFSectionIndex SecIndex, COFFSymbolIndex SymIndex, + Symbol &Sym) { + assert(!GraphSymbols[SymIndex] && "Duplicate symbol at index"); + GraphSymbols[SymIndex] = &Sym; + if 
(!COFF::isReservedSectionNumber(SecIndex)) + SymbolSets[SecIndex].insert({Sym.getOffset(), &Sym}); + } + + Symbol *getGraphSymbol(COFFSymbolIndex SymIndex) const { + if (SymIndex < 0 || + SymIndex >= static_cast<COFFSymbolIndex>(GraphSymbols.size())) + return nullptr; + return GraphSymbols[SymIndex]; + } + + void setGraphBlock(COFFSectionIndex SecIndex, Block *B) { + assert(!GraphBlocks[SecIndex] && "Duplicate section at index"); + assert(!COFF::isReservedSectionNumber(SecIndex) && "Invalid section index"); + GraphBlocks[SecIndex] = B; + } + + Block *getGraphBlock(COFFSectionIndex SecIndex) const { + if (SecIndex <= 0 || + SecIndex >= static_cast<COFFSectionIndex>(GraphSymbols.size())) + return nullptr; + return GraphBlocks[SecIndex]; + } + + object::COFFObjectFile::section_iterator_range sections() const { + return Obj.sections(); + } + + /// Traverse all matching relocation records in the given section. The handler + /// function Func should be callable with this signature: + /// Error(const object::RelocationRef&, + /// const object::SectionRef&, Section &) + /// + template <typename RelocHandlerFunction> + Error forEachRelocation(const object::SectionRef &RelSec, + RelocHandlerFunction &&Func, + bool ProcessDebugSections = false); + + /// Traverse all matching relocation records in the given section. Convenience + /// wrapper to allow passing a member function for the handler. + /// + template <typename ClassT, typename RelocHandlerMethod> + Error forEachRelocation(const object::SectionRef &RelSec, ClassT *Instance, + RelocHandlerMethod &&Method, + bool ProcessDebugSections = false) { + return forEachRelocation( + RelSec, + [Instance, Method](const auto &Rel, const auto &Target, auto &GS) { + return (Instance->*Method)(Rel, Target, GS); + }, + ProcessDebugSections); + } + +private: + // Pending comdat symbol export that is initiated by the first symbol of + // COMDAT sequence. 
+ struct ComdatExportRequest { + COFFSymbolIndex SymbolIndex; + COFFSectionIndex SectionIndex; + jitlink::Linkage Linkage; + }; + Optional<ComdatExportRequest> PendingComdatExport; + + // This represents a pending request to create a weak external symbol with a + // name. + struct WeakAliasRequest { + COFFSymbolIndex Alias; + COFFSymbolIndex Target; + StringRef SymbolName; + }; + std::vector<WeakAliasRequest> WeakAliasRequests; + + // Per COFF section jitlink symbol set sorted by offset. + // Used for calculating implicit size of defined symbols. + using SymbolSet = std::set<std::pair<orc::ExecutorAddrDiff, Symbol *>>; + std::vector<SymbolSet> SymbolSets; + + Section &getCommonSection(); + + Expected<Symbol *> createDefinedSymbol(COFFSymbolIndex SymIndex, + StringRef SymbolName, + object::COFFSymbolRef Symbol, + const object::coff_section *Section); + Expected<Symbol *> createCOMDATExportRequest( + COFFSymbolIndex SymIndex, object::COFFSymbolRef Symbol, + const object::coff_aux_section_definition *Definition); + Expected<Symbol *> exportCOMDATSymbol(COFFSymbolIndex SymIndex, + StringRef SymbolName, + object::COFFSymbolRef Symbol); + Error flushWeakAliasRequests(); + Error calculateImplicitSizeOfSymbols(); + + static uint64_t getSectionAddress(const object::COFFObjectFile &Obj, + const object::coff_section *Section); + static uint64_t getSectionSize(const object::COFFObjectFile &Obj, + const object::coff_section *Section); + static bool isComdatSection(const object::coff_section *Section); + static unsigned getPointerSize(const object::COFFObjectFile &Obj); + static support::endianness getEndianness(const object::COFFObjectFile &Obj); + StringRef getCOFFSectionName(COFFSectionIndex SectionIndex, + const object::coff_section *Sec, + object::COFFSymbolRef Sym); + + const object::COFFObjectFile &Obj; + std::unique_ptr<LinkGraph> G; + + Section *CommonSection = nullptr; + std::vector<Block *> GraphBlocks; + std::vector<Symbol *> GraphSymbols; +}; + +template <typename 
RelocHandlerFunction> +Error COFFLinkGraphBuilder::forEachRelocation(const object::SectionRef &RelSec, + RelocHandlerFunction &&Func, + bool ProcessDebugSections) { + + auto COFFRelSect = Obj.getCOFFSection(RelSec); + + // Target sections have names in valid COFF object files. + Expected<StringRef> Name = Obj.getSectionName(COFFRelSect); + if (!Name) + return Name.takeError(); + LLVM_DEBUG(dbgs() << " " << *Name << ":\n"); + + // Lookup the link-graph node corresponding to the target section name. + auto *BlockToFix = getGraphBlock(RelSec.getIndex() + 1); + if (!BlockToFix) + return make_error<StringError>( + "Referencing a section that wasn't added to the graph: " + *Name, + inconvertibleErrorCode()); + + // Let the callee process relocation entries one by one. + for (const auto &R : RelSec.relocations()) + if (Error Err = Func(R, RelSec, *BlockToFix)) + return Err; + + LLVM_DEBUG(dbgs() << "\n"); + return Error::success(); +} + +} // end namespace jitlink +} // end namespace llvm + +#endif // LIB_EXECUTIONENGINE_JITLINK_COFFLINKGRAPHBUILDER_H diff --git a/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp new file mode 100644 index 000000000000..3d36ad1ed767 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/COFF_x86_64.cpp @@ -0,0 +1,216 @@ +//===----- COFF_x86_64.cpp - JIT linker implementation for COFF/x86_64 ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// COFF/x86_64 jit-link implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/COFF_x86_64.h" +#include "COFFLinkGraphBuilder.h" +#include "EHFrameSupportImpl.h" +#include "JITLinkGeneric.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/ExecutionEngine/JITLink/x86_64.h" +#include "llvm/Object/COFF.h" +#include "llvm/Support/Endian.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace { + +class COFFJITLinker_x86_64 : public JITLinker<COFFJITLinker_x86_64> { + friend class JITLinker<COFFJITLinker_x86_64>; + +public: + COFFJITLinker_x86_64(std::unique_ptr<JITLinkContext> Ctx, + std::unique_ptr<LinkGraph> G, + PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {} + +private: + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + return x86_64::applyFixup(G, B, E, nullptr); + } +}; + +class COFFLinkGraphBuilder_x86_64 : public COFFLinkGraphBuilder { +private: + uint64_t ImageBase = 0; + enum COFFX86RelocationKind { + COFFAddr32NB, + COFFRel32, + }; + + static Expected<COFFX86RelocationKind> + getRelocationKind(const uint32_t Type) { + switch (Type) { + case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_ADDR32NB: + return COFFAddr32NB; + case COFF::RelocationTypeAMD64::IMAGE_REL_AMD64_REL32: + return COFFRel32; + } + + return make_error<JITLinkError>("Unsupported x86_64 relocation:" + + formatv("{0:d}", Type)); + } + + Error addRelocations() override { + + LLVM_DEBUG(dbgs() << "Processing relocations:\n"); + + for (const auto &RelSect : sections()) + if (Error Err = COFFLinkGraphBuilder::forEachRelocation( + RelSect, this, &COFFLinkGraphBuilder_x86_64::addSingleRelocation)) + return Err; + + return Error::success(); + } + + uint64_t getImageBase() { + if (!ImageBase) { + ImageBase = std::numeric_limits<uint64_t>::max(); + for (const auto &Block : getGraph().blocks()) { + if (Block->getAddress().getValue()) + 
ImageBase = std::min(ImageBase, Block->getAddress().getValue()); + } + } + return ImageBase; + } + + Error addSingleRelocation(const object::RelocationRef &Rel, + const object::SectionRef &FixupSect, + Block &BlockToFix) { + + const object::coff_relocation *COFFRel = getObject().getCOFFRelocation(Rel); + auto SymbolIt = Rel.getSymbol(); + if (SymbolIt == getObject().symbol_end()) { + return make_error<StringError>( + formatv("Invalid symbol index in relocation entry. " + "index: {0}, section: {1}", + COFFRel->SymbolTableIndex, FixupSect.getIndex()), + inconvertibleErrorCode()); + } + + object::COFFSymbolRef COFFSymbol = getObject().getCOFFSymbol(*SymbolIt); + COFFSymbolIndex SymIndex = getObject().getSymbolIndex(COFFSymbol); + + Symbol *GraphSymbol = getGraphSymbol(SymIndex); + if (!GraphSymbol) + return make_error<StringError>( + formatv("Could not find symbol at given index, did you add it to " + "JITSymbolTable? index: {0}, section: {1}", + SymIndex, FixupSect.getIndex()), + inconvertibleErrorCode()); + + Expected<COFFX86RelocationKind> RelocKind = + getRelocationKind(Rel.getType()); + if (!RelocKind) + return RelocKind.takeError(); + + int64_t Addend = 0; + orc::ExecutorAddr FixupAddress = + orc::ExecutorAddr(FixupSect.getAddress()) + Rel.getOffset(); + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); + + Edge::Kind Kind = Edge::Invalid; + + switch (*RelocKind) { + case COFFAddr32NB: { + Kind = x86_64::Pointer32; + Offset -= getImageBase(); + break; + } + case COFFRel32: { + Kind = x86_64::BranchPCRel32; + break; + } + }; + + Edge GE(Kind, Offset, *GraphSymbol, Addend); + LLVM_DEBUG({ + dbgs() << " "; + printEdge(dbgs(), BlockToFix, GE, x86_64::getEdgeKindName(Kind)); + dbgs() << "\n"; + }); + + BlockToFix.addEdge(std::move(GE)); + return Error::success(); + } + + /// Return the string name of the given COFF x86_64 edge kind. 
+ const char *getCOFFX86RelocationKindName(COFFX86RelocationKind R) { + switch (R) { + case COFFAddr32NB: + return "COFFAddr32NB"; + case COFFRel32: + return "COFFRel32"; + } + } + +public: + COFFLinkGraphBuilder_x86_64(const object::COFFObjectFile &Obj, const Triple T) + : COFFLinkGraphBuilder(Obj, std::move(T), x86_64::getEdgeKindName) {} +}; + +Error buildTables_COFF_x86_64(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + + x86_64::GOTTableManager GOT; + x86_64::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} +} // namespace + +namespace llvm { +namespace jitlink { + +Expected<std::unique_ptr<LinkGraph>> +createLinkGraphFromCOFFObject_x86_64(MemoryBufferRef ObjectBuffer) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto COFFObj = object::ObjectFile::createCOFFObjectFile(ObjectBuffer); + if (!COFFObj) + return COFFObj.takeError(); + + return COFFLinkGraphBuilder_x86_64(**COFFObj, (*COFFObj)->makeTriple()) + .buildGraph(); +} + +void link_COFF_x86_64(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs/TLSInfoEntry build pass. + Config.PostPrunePasses.push_back(buildTables_COFF_x86_64); + + // Add GOT/Stubs optimizer pass. 
+ Config.PreFixupPasses.push_back(x86_64::optimizeGOTAndStubAccesses); + } + + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return Ctx->notifyFailed(std::move(Err)); + + COFFJITLinker_x86_64::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index b1492cd74508..389fd14c0f29 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -33,7 +33,7 @@ Error EHFrameEdgeFixer::operator()(LinkGraph &G) { if (!EHFrame) { LLVM_DEBUG({ dbgs() << "EHFrameEdgeFixer: No " << EHFrameSectionName - << " section. Nothing to do\n"; + << " section in \"" << G.getName() << "\". Nothing to do.\n"; }); return Error::success(); } @@ -44,7 +44,8 @@ Error EHFrameEdgeFixer::operator()(LinkGraph &G) { "EHFrameEdgeFixer only supports 32 and 64 bit targets"); LLVM_DEBUG({ - dbgs() << "EHFrameEdgeFixer: Processing " << EHFrameSectionName << "...\n"; + dbgs() << "EHFrameEdgeFixer: Processing " << EHFrameSectionName << " in \"" + << G.getName() << "\"...\n"; }); ParseContext PC(G); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp index 98da3f155c3e..7d67e5ef343a 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp @@ -63,6 +63,10 @@ private: ELFPrel64, ELFAdrGOTPage21, ELFLd64GOTLo12, + ELFTLSDescAdrPage21, + ELFTLSDescAddLo12, + ELFTLSDescLd64Lo12, + ELFTLSDescCall, }; static Expected<ELFAArch64RelocationKind> @@ -104,6 +108,14 @@ private: return ELFAdrGOTPage21; case ELF::R_AARCH64_LD64_GOT_LO12_NC: return ELFLd64GOTLo12; + case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: + return ELFTLSDescAdrPage21; + case ELF::R_AARCH64_TLSDESC_ADD_LO12: + return ELFTLSDescAddLo12; + case ELF::R_AARCH64_TLSDESC_LD64_LO12: + return 
ELFTLSDescLd64Lo12; + case ELF::R_AARCH64_TLSDESC_CALL: + return ELFTLSDescCall; } return make_error<JITLinkError>( @@ -292,6 +304,21 @@ private: Kind = aarch64::GOTPageOffset12; break; } + case ELFTLSDescAdrPage21: { + Kind = aarch64::TLSDescPage21; + break; + } + case ELFTLSDescAddLo12: { + Kind = aarch64::TLSDescPageOffset12; + break; + } + case ELFTLSDescLd64Lo12: { + Kind = aarch64::TLSDescPageOffset12; + break; + } + case ELFTLSDescCall: { + return Error::success(); + } }; Edge GE(Kind, Offset, *GraphSymbol, Addend); @@ -302,6 +329,7 @@ private: }); BlockToFix.addEdge(std::move(GE)); + return Error::success(); } @@ -342,6 +370,14 @@ private: return "ELFAdrGOTPage21"; case ELFLd64GOTLo12: return "ELFLd64GOTLo12"; + case ELFTLSDescAdrPage21: + return "ELFTLSDescAdrPage21"; + case ELFTLSDescAddLo12: + return "ELFTLSDescAddLo12"; + case ELFTLSDescLd64Lo12: + return "ELFTLSDescLd64Lo12"; + case ELFTLSDescCall: + return "ELFTLSDescCall"; default: return getGenericEdgeKindName(static_cast<Edge::Kind>(R)); } @@ -354,12 +390,133 @@ public: aarch64::getEdgeKindName) {} }; +// TLS Info Builder. +class TLSInfoTableManager_ELF_aarch64 + : public TableManager<TLSInfoTableManager_ELF_aarch64> { +public: + static StringRef getSectionName() { return "$__TLSINFO"; } + + static const uint8_t TLSInfoEntryContent[16]; + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { return false; } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + // the TLS Info entry's key value will be written by the fixTLVSectionByName + // pass, so create mutable content. 
+ auto &TLSInfoEntry = G.createMutableContentBlock( + getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), + orc::ExecutorAddr(), 8, 0); + TLSInfoEntry.addEdge(aarch64::Pointer64, 8, Target, 0); + return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false); + } + +private: + Section &getTLSInfoSection(LinkGraph &G) { + if (!TLSInfoTable) + TLSInfoTable = &G.createSection(getSectionName(), MemProt::Read); + return *TLSInfoTable; + } + + ArrayRef<char> getTLSInfoEntryContent() const { + return {reinterpret_cast<const char *>(TLSInfoEntryContent), + sizeof(TLSInfoEntryContent)}; + } + + Section *TLSInfoTable = nullptr; +}; + +const uint8_t TLSInfoTableManager_ELF_aarch64::TLSInfoEntryContent[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*pthread key */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /*data address*/ +}; + +// TLS Descriptor Builder. +class TLSDescTableManager_ELF_aarch64 + : public TableManager<TLSDescTableManager_ELF_aarch64> { +public: + TLSDescTableManager_ELF_aarch64( + TLSInfoTableManager_ELF_aarch64 &TLSInfoTableManager) + : TLSInfoTableManager(TLSInfoTableManager) {} + + static StringRef getSectionName() { return "$__TLSDESC"; } + + static const uint8_t TLSDescEntryContent[16]; + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + Edge::Kind KindToSet = Edge::Invalid; + switch (E.getKind()) { + case aarch64::TLSDescPage21: { + KindToSet = aarch64::Page21; + break; + } + case aarch64::TLSDescPageOffset12: { + KindToSet = aarch64::PageOffset12; + break; + } + default: + return false; + } + assert(KindToSet != Edge::Invalid && + "Fell through switch, but no new kind to set"); + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol 
&createEntry(LinkGraph &G, Symbol &Target) { + auto &EntryBlock = + G.createContentBlock(getTLSDescSection(G), getTLSDescBlockContent(), + orc::ExecutorAddr(), 8, 0); + EntryBlock.addEdge(aarch64::Pointer64, 0, getTLSDescResolver(G), 0); + EntryBlock.addEdge(aarch64::Pointer64, 8, + TLSInfoTableManager.getEntryForTarget(G, Target), 0); + return G.addAnonymousSymbol(EntryBlock, 0, 8, false, false); + } + +private: + Section &getTLSDescSection(LinkGraph &G) { + if (!GOTSection) + GOTSection = &G.createSection(getSectionName(), MemProt::Read); + return *GOTSection; + } + + Symbol &getTLSDescResolver(LinkGraph &G) { + if (!TLSDescResolver) + TLSDescResolver = + &G.addExternalSymbol("__tlsdesc_resolver", 8, Linkage::Strong); + return *TLSDescResolver; + } + + ArrayRef<char> getTLSDescBlockContent() { + return {reinterpret_cast<const char *>(TLSDescEntryContent), + sizeof(TLSDescEntryContent)}; + } + + Section *GOTSection = nullptr; + Symbol *TLSDescResolver = nullptr; + TLSInfoTableManager_ELF_aarch64 &TLSInfoTableManager; +}; + +const uint8_t TLSDescTableManager_ELF_aarch64::TLSDescEntryContent[16] = { + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, /*resolver function pointer*/ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 /*pointer to tls info*/ +}; + Error buildTables_ELF_aarch64(LinkGraph &G) { LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); aarch64::GOTTableManager GOT; aarch64::PLTTableManager PLT(GOT); - visitExistingEdges(G, GOT, PLT); + TLSInfoTableManager_ELF_aarch64 TLSInfo; + TLSDescTableManager_ELF_aarch64 TLSDesc(TLSInfo); + visitExistingEdges(G, GOT, PLT, TLSDesc, TLSInfo); return Error::success(); } @@ -406,7 +563,7 @@ void link_ELF_aarch64(std::unique_ptr<LinkGraph> G, else Config.PrePrunePasses.push_back(markAllSymbolsLive); - // Add an in-place GOT/Stubs build pass. + // Add an in-place GOT/TLS/Stubs build pass. 
Config.PostPrunePasses.push_back(buildTables_ELF_aarch64); } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index 197ab71f5274..c7596efe2bb8 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -494,6 +494,30 @@ private: Block &BlockToFix) { using Base = ELFLinkGraphBuilder<ELFT>; + uint32_t Type = Rel.getType(false); + // We do not implement linker relaxation, except what is required for + // alignment (see below). + if (Type == llvm::ELF::R_RISCV_RELAX) + return Error::success(); + + int64_t Addend = Rel.r_addend; + if (Type == llvm::ELF::R_RISCV_ALIGN) { + uint64_t Alignment = PowerOf2Ceil(Addend); + // FIXME: Implement support for ensuring alignment together with linker + // relaxation; 2 bytes are guaranteed by the length of compressed + // instructions, so this does not need any action from our side. + if (Alignment > 2) + return make_error<JITLinkError>( + formatv("Unsupported relocation R_RISCV_ALIGN with alignment {0} " + "larger than 2 (addend: {1})", + Alignment, Addend)); + return Error::success(); + } + + Expected<riscv::EdgeKind_riscv> Kind = getRelocationKind(Type); + if (!Kind) + return Kind.takeError(); + uint32_t SymbolIndex = Rel.getSymbol(false); auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); if (!ObjSymbol) @@ -508,12 +532,6 @@ private: Base::GraphSymbols.size()), inconvertibleErrorCode()); - uint32_t Type = Rel.getType(false); - Expected<riscv::EdgeKind_riscv> Kind = getRelocationKind(Type); - if (!Kind) - return Kind.takeError(); - - int64_t Addend = Rel.r_addend; auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 43efe0725cfe..08fdc7c9e6b1 100644 --- 
a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -9,10 +9,10 @@ #include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/BinaryFormat/Magic.h" +#include "llvm/ExecutionEngine/JITLink/COFF.h" #include "llvm/ExecutionEngine/JITLink/ELF.h" #include "llvm/ExecutionEngine/JITLink/MachO.h" #include "llvm/Support/Format.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" @@ -41,8 +41,6 @@ public: } }; -static ManagedStatic<JITLinkerErrorCategory> JITLinkerErrorCategory; - } // namespace namespace llvm { @@ -53,7 +51,8 @@ char JITLinkError::ID = 0; void JITLinkError::log(raw_ostream &OS) const { OS << ErrMsg; } std::error_code JITLinkError::convertToErrorCode() const { - return std::error_code(GenericJITLinkError, *JITLinkerErrorCategory); + static JITLinkerErrorCategory TheJITLinkerErrorCategory; + return std::error_code(GenericJITLinkError, TheJITLinkerErrorCategory); } const char *getGenericEdgeKindName(Edge::Kind K) { @@ -410,6 +409,8 @@ createLinkGraphFromObject(MemoryBufferRef ObjectBuffer) { return createLinkGraphFromMachOObject(ObjectBuffer); case file_magic::elf_relocatable: return createLinkGraphFromELFObject(ObjectBuffer); + case file_magic::coff_object: + return createLinkGraphFromCOFFObject(ObjectBuffer); default: return make_error<JITLinkError>("Unsupported file format"); }; @@ -421,6 +422,8 @@ void link(std::unique_ptr<LinkGraph> G, std::unique_ptr<JITLinkContext> Ctx) { return link_MachO(std::move(G), std::move(Ctx)); case Triple::ELF: return link_ELF(std::move(G), std::move(Ctx)); + case Triple::COFF: + return link_COFF(std::move(G), std::move(Ctx)); default: Ctx->notifyFailed(make_error<JITLinkError>("Unsupported object format")); }; diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index dd50314d3ed7..04194318498f 100644 --- 
a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -425,7 +425,7 @@ private: else return TargetSymbolOrErr.takeError(); - Kind = aarch64::PointerToGOT; + Kind = aarch64::Delta32ToGOT; break; case MachODelta32: case MachODelta64: { diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp index 28a6f9ce90d9..9ecc71dfbb54 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp @@ -48,8 +48,12 @@ const char *getEdgeKindName(Edge::Kind R) { return "TLVPage21"; case TLVPageOffset12: return "TLVPageOffset12"; - case PointerToGOT: - return "PointerToGOT"; + case TLSDescPage21: + return "TLSDescPage21"; + case TLSDescPageOffset12: + return "TLSDescPageOffset12"; + case Delta32ToGOT: + return "Delta32ToGOT"; case PairedAddend: return "PairedAddend"; case LDRLiteral19: diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index e476c549412a..e7ca636c83e9 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -839,11 +839,13 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::registerInitSections( Error ELFNixPlatform::ELFNixPlatformPlugin::fixTLVSectionsAndEdges( jitlink::LinkGraph &G, JITDylib &JD) { - // TODO implement TLV support - for (auto *Sym : G.external_symbols()) + for (auto *Sym : G.external_symbols()) { if (Sym->getName() == "__tls_get_addr") { Sym->setName("___orc_rt_elfnix_tls_get_addr"); + } else if (Sym->getName() == "__tlsdesc_resolver") { + Sym->setName("___orc_rt_elfnix_tlsdesc_resolver"); } + } auto *TLSInfoEntrySection = G.findSectionByName("$__TLSINFO"); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 6d67e6d87b56..1926ef1ecc72 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -666,8 
+666,9 @@ Error LLJITBuilderState::prepareForConstruction() { // JIT linker. if (!CreateObjectLinkingLayer) { auto &TT = JTMB->getTargetTriple(); - if (TT.isOSBinFormatMachO() && - (TT.getArch() == Triple::aarch64 || TT.getArch() == Triple::x86_64)) { + if (TT.getArch() == Triple::riscv64 || + (TT.isOSBinFormatMachO() && + (TT.getArch() == Triple::aarch64 || TT.getArch() == Triple::x86_64))) { JTMB->setRelocationModel(Reloc::PIC_); JTMB->setCodeModel(CodeModel::Small); diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp index 394a555e453b..356b81b4f1c5 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp @@ -9,6 +9,7 @@ #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" #include "llvm/ExecutionEngine/Orc/MachOPlatform.h" +#include "llvm/Object/COFF.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" @@ -145,6 +146,55 @@ getELFObjectFileSymbolInfo(ExecutionSession &ES, return I; } +static Expected<MaterializationUnit::Interface> +getCOFFObjectFileSymbolInfo(ExecutionSession &ES, + const object::COFFObjectFile &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + *SymFlags |= JITSymbolFlags::Exported; + auto COFFSym = Obj.getCOFFSymbol(Sym); + + // Weak external is always a function + if (COFFSym.isWeakExternal()) { + *SymFlags |= JITSymbolFlags::Callable; + } + + I.SymbolFlags[ES.intern(*Name)] = std::move(*SymFlags); + } + + // FIXME: handle init symbols + + return I; +} + Expected<MaterializationUnit::Interface> getGenericObjectFileSymbolInfo(ExecutionSession &ES, const object::ObjectFile &Obj) { @@ -196,6 +246,8 @@ getObjectFileInterface(ExecutionSession &ES, MemoryBufferRef ObjBuffer) { return getMachOObjectFileSymbolInfo(ES, *MachOObj); else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get())) return getELFObjectFileSymbolInfo(ES, *ELFObj); + else if (auto *COFFObj = dyn_cast<object::COFFObjectFile>(Obj->get())) + return getCOFFObjectFileSymbolInfo(ES, *COFFObj); return getGenericObjectFileSymbolInfo(ES, **Obj); } diff --git a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp index ef764a3f0d7f..da8aaad08cad 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp @@ -665,7 +665,7 @@ void OrcMips32_Base::writeIndirectStubsBlock( // // i.. - assert(stubAndPointerRangesOk<OrcAArch64>( + assert(stubAndPointerRangesOk<OrcMips32_Base>( StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) && "PointersBlock is out of range"); @@ -884,7 +884,7 @@ void OrcMips64::writeIndirectStubsBlock( // // ... 
- assert(stubAndPointerRangesOk<OrcAArch64>( + assert(stubAndPointerRangesOk<OrcMips64>( StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) && "PointersBlock is out of range"); diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp index fdad90cbcfb7..2cc2bddeb21a 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp @@ -12,7 +12,6 @@ #include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include <type_traits> @@ -70,7 +69,10 @@ public: } }; -static ManagedStatic<OrcErrorCategory> OrcErrCat; +OrcErrorCategory &getOrcErrCat() { + static OrcErrorCategory OrcErrCat; + return OrcErrCat; +} } // namespace namespace llvm { @@ -81,7 +83,7 @@ char JITSymbolNotFound::ID = 0; std::error_code orcError(OrcErrorCode ErrCode) { typedef std::underlying_type<OrcErrorCode>::type UT; - return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat); + return std::error_code(static_cast<UT>(ErrCode), getOrcErrCat()); } DuplicateDefinition::DuplicateDefinition(std::string SymbolName) @@ -105,7 +107,7 @@ JITSymbolNotFound::JITSymbolNotFound(std::string SymbolName) std::error_code JITSymbolNotFound::convertToErrorCode() const { typedef std::underlying_type<OrcErrorCode>::type UT; return std::error_code(static_cast<UT>(OrcErrorCode::JITSymbolNotFound), - *OrcErrCat); + getOrcErrCat()); } void JITSymbolNotFound::log(raw_ostream &OS) const { diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp index ffa2969536e7..8296b03398a0 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp @@ -11,7 +11,6 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/Support/BinaryStreamReader.h" #include 
"llvm/Support/FormatVariadic.h" -#include "llvm/Support/ManagedStatic.h" #include <cstdint> #include <mutex> @@ -67,9 +66,6 @@ LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() { using namespace llvm; using namespace llvm::orc; -// Serialize rendezvous with the debugger as well as access to shared data. -ManagedStatic<std::mutex> JITDebugLock; - // Register debug object, return error message or null for success. static void registerJITLoaderGDBImpl(const char *ObjAddr, size_t Size) { LLVM_DEBUG({ @@ -85,7 +81,9 @@ static void registerJITLoaderGDBImpl(const char *ObjAddr, size_t Size) { E->symfile_size = Size; E->prev_entry = nullptr; - std::lock_guard<std::mutex> Lock(*JITDebugLock); + // Serialize rendezvous with the debugger as well as access to shared data. + static std::mutex JITDebugLock; + std::lock_guard<std::mutex> Lock(JITDebugLock); // Insert this entry at the head of the list. jit_code_entry *NextEntry = __jit_debug_descriptor.first_entry; diff --git a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp index 4a236e183c8b..bb41bac32534 100644 --- a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +++ b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp @@ -24,7 +24,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Errno.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/Path.h" @@ -488,15 +487,14 @@ void PerfJITEventListener::NotifyDebug(uint64_t CodeAddr, } } -// There should be only a single event listener per process, otherwise perf gets -// confused. 
-llvm::ManagedStatic<PerfJITEventListener> PerfListener; - } // end anonymous namespace namespace llvm { JITEventListener *JITEventListener::createPerfJITEventListener() { - return &*PerfListener; + // There should be only a single event listener per process, otherwise perf + // gets confused. + static PerfJITEventListener PerfListener; + return &PerfListener; } } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 2e0cba849165..54ab00732330 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -19,7 +19,6 @@ #include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/MSVCErrorWorkarounds.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include <mutex> @@ -51,8 +50,6 @@ public: } }; -static ManagedStatic<RuntimeDyldErrorCategory> RTDyldErrorCategory; - } char RuntimeDyldError::ID = 0; @@ -62,7 +59,8 @@ void RuntimeDyldError::log(raw_ostream &OS) const { } std::error_code RuntimeDyldError::convertToErrorCode() const { - return std::error_code(GenericRTDyldError, *RTDyldErrorCategory); + static RuntimeDyldErrorCategory RTDyldErrorCategory; + return std::error_code(GenericRTDyldError, RTDyldErrorCategory); } // Empty out-of-line virtual destructor as the key function. 
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index da1102fc9f07..c702584b7a33 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -479,7 +479,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, *TargetPtr &= 0xfff8001fU; // Immediate:15:2 goes in bits 18:5 of TBZ, TBNZ - or32le(TargetPtr, (BranchImm & 0x0FFFFFFC) << 3); + or32le(TargetPtr, (BranchImm & 0x0000FFFC) << 3); break; } case ELF::R_AARCH64_CALL26: // fallthrough diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 6e8856f481af..0f846f7bfee5 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -214,7 +214,7 @@ static int isVariantApplicableInContextHelper( Optional<bool> Result = HandleTrait(Property, IsActiveTrait); if (Result) - return Result.getValue(); + return Result.value(); } if (!DeviceSetOnly) { @@ -235,7 +235,7 @@ static int isVariantApplicableInContextHelper( Optional<bool> Result = HandleTrait(Property, FoundInOrder); if (Result) - return Result.getValue(); + return Result.value(); if (!FoundInOrder) { LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] Construct property " diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 9b08a24e14d4..574d9174bebf 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -791,6 +791,38 @@ void OpenMPIRBuilder::emitOffloadingEntry(Constant *Addr, StringRef Name, Entry->setAlignment(Align(1)); } +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel( + const LocationDescription &Loc, Value *&Return, Value *Ident, + Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, + ArrayRef<Value *> KernelArgs, ArrayRef<Value *> NoWaitArgs) { + if 
(!updateToLocation(Loc)) + return Loc.IP; + + auto *KernelArgsPtr = + Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args"); + for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) { + llvm::Value *Arg = + Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I); + Builder.CreateAlignedStore( + KernelArgs[I], Arg, + M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType())); + } + + bool HasNoWait = !NoWaitArgs.empty(); + SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams, + NumThreads, HostPtr, KernelArgsPtr}; + if (HasNoWait) + OffloadingArgs.append(NoWaitArgs.begin(), NoWaitArgs.end()); + + Return = Builder.CreateCall( + HasNoWait + ? getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel_nowait) + : getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel), + OffloadingArgs); + + return Builder.saveIP(); +} + void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB) { @@ -1260,6 +1292,9 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, if (!updateToLocation(Loc)) return InsertPointTy(); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); // The current basic block is split into four basic blocks. After outlining, // they will be mapped as follows: // ``` @@ -1285,7 +1320,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, OI.EntryBB = TaskAllocaBB; OI.OuterAllocaBB = AllocaIP.getBlock(); OI.ExitBB = TaskExitBB; - OI.PostOutlineCB = [this, &Loc, Tied, Final](Function &OutlinedFn) { + OI.PostOutlineCB = [this, Ident, Tied, Final](Function &OutlinedFn) { // The input IR here looks like the following- // ``` // func @current_fn() { @@ -1324,9 +1359,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) // call. 
- uint32_t SrcLocStrSize; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadID = getOrCreateThreadID(Ident); // Argument - `flags` @@ -2834,7 +2866,8 @@ void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) { }); } -void OpenMPIRBuilder::applySimd(DebugLoc, CanonicalLoopInfo *CanonicalLoop) { +void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, + ConstantInt *Simdlen) { LLVMContext &Ctx = Builder.getContext(); Function *F = CanonicalLoop->getFunction(); @@ -2879,6 +2912,11 @@ void OpenMPIRBuilder::applySimd(DebugLoc, CanonicalLoopInfo *CanonicalLoop) { AccessGroup}), MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst})}); + if (Simdlen != nullptr) + addLoopMetadata( + CanonicalLoop, + MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"), + ConstantAsMetadata::get(Simdlen)})); } /// Create the TargetMachine object to query the backend for optimization @@ -3962,6 +4000,8 @@ Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2, case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: llvm_unreachable("Unsupported atomic update operation"); } llvm_unreachable("Unsupported atomic update operation"); @@ -4126,20 +4166,37 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( assert(X.Var->getType()->isPointerTy() && "OMP atomic expects a pointer to target memory"); - assert((X.ElemTy->isIntegerTy() || X.ElemTy->isPointerTy()) && - "OMP atomic compare expected a integer scalar type"); // compare capture if (V.Var) { assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type"); assert(V.ElemTy == X.ElemTy && "x and v must be of same type"); } + bool IsInteger = E->getType()->isIntegerTy(); + if (Op == OMPAtomicCompareOp::EQ) { AtomicOrdering Failure = 
AtomicCmpXchgInst::getStrongestFailureOrdering(AO); - AtomicCmpXchgInst *Result = - Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure); + AtomicCmpXchgInst *Result = nullptr; + if (!IsInteger) { + unsigned Addrspace = + cast<PointerType>(X.Var->getType())->getAddressSpace(); + IntegerType *IntCastTy = + IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits()); + Value *XBCast = + Builder.CreateBitCast(X.Var, IntCastTy->getPointerTo(Addrspace)); + Value *EBCast = Builder.CreateBitCast(E, IntCastTy); + Value *DBCast = Builder.CreateBitCast(D, IntCastTy); + Result = Builder.CreateAtomicCmpXchg(XBCast, EBCast, DBCast, MaybeAlign(), + AO, Failure); + } else { + Result = + Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure); + } + if (V.Var) { Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0); + if (!IsInteger) + OldValue = Builder.CreateBitCast(OldValue, X.ElemTy); assert(OldValue->getType() == V.ElemTy && "OldValue and V must be of same type"); if (IsPostfixUpdate) { @@ -4213,19 +4270,29 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( // x = x <= expr ? x : expr; AtomicRMWInst::BinOp NewOp; if (IsXBinopExpr) { - if (X.IsSigned) - NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min - : AtomicRMWInst::Max; - else - NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin - : AtomicRMWInst::UMax; + if (IsInteger) { + if (X.IsSigned) + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min + : AtomicRMWInst::Max; + else + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin + : AtomicRMWInst::UMax; + } else { + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin + : AtomicRMWInst::FMax; + } } else { - if (X.IsSigned) - NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max - : AtomicRMWInst::Min; - else - NewOp = Op == OMPAtomicCompareOp::MAX ? 
AtomicRMWInst::UMax - : AtomicRMWInst::UMin; + if (IsInteger) { + if (X.IsSigned) + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max + : AtomicRMWInst::Min; + else + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax + : AtomicRMWInst::UMin; + } else { + NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax + : AtomicRMWInst::FMin; + } } AtomicRMWInst *OldValue = @@ -4243,12 +4310,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( case AtomicRMWInst::UMax: Pred = CmpInst::ICMP_UGT; break; + case AtomicRMWInst::FMax: + Pred = CmpInst::FCMP_OGT; + break; case AtomicRMWInst::Min: Pred = CmpInst::ICMP_SLT; break; case AtomicRMWInst::UMin: Pred = CmpInst::ICMP_ULT; break; + case AtomicRMWInst::FMin: + Pred = CmpInst::FCMP_OLT; + break; default: llvm_unreachable("unexpected comparison op"); } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 596348ddb462..a29040b8c2aa 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1590,10 +1590,6 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, Out << ", "; } - if (CE->hasIndices()) - for (unsigned I : CE->getIndices()) - Out << ", " << I; - if (CE->isCast()) { Out << " to "; WriterCtx.TypePrinter->print(CE->getType(), Out); @@ -3542,8 +3538,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { Out << ", no_sanitize_address"; if (MD.NoHWAddress) Out << ", no_sanitize_hwaddress"; - if (MD.NoMemtag) - Out << ", no_sanitize_memtag"; + if (MD.Memtag) + Out << ", sanitize_memtag"; if (MD.IsDynInit) Out << ", sanitize_address_dyninit"; } @@ -4299,9 +4295,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { bool PrintAllTypes = false; Type *TheType = Operand->getType(); - // Select, Store and ShuffleVector always print all types. 
- if (isa<SelectInst>(I) || isa<StoreInst>(I) || isa<ShuffleVectorInst>(I) - || isa<ReturnInst>(I)) { + // Select, Store, ShuffleVector and CmpXchg always print all types. + if (isa<SelectInst>(I) || isa<StoreInst>(I) || isa<ShuffleVectorInst>(I) || + isa<ReturnInst>(I) || isa<AtomicCmpXchgInst>(I)) { PrintAllTypes = true; } else { for (unsigned i = 1, E = I.getNumOperands(); i != E; ++i) { diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 41b4f2919221..98adff107cec 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1218,9 +1218,13 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, if (Instruction::isIntDivRem(Opcode) && C2Splat->isNullValue()) return PoisonValue::get(VTy); if (Constant *C1Splat = C1->getSplatValue()) { - return ConstantVector::getSplat( - VTy->getElementCount(), - ConstantExpr::get(Opcode, C1Splat, C2Splat)); + Constant *Res = + ConstantExpr::isDesirableBinOp(Opcode) + ? ConstantExpr::get(Opcode, C1Splat, C2Splat) + : ConstantFoldBinaryInstruction(Opcode, C1Splat, C2Splat); + if (!Res) + return nullptr; + return ConstantVector::getSplat(VTy->getElementCount(), Res); } } @@ -1237,7 +1241,12 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue()) return PoisonValue::get(VTy); - Result.push_back(ConstantExpr::get(Opcode, LHS, RHS)); + Constant *Res = ConstantExpr::isDesirableBinOp(Opcode) + ? 
ConstantExpr::get(Opcode, LHS, RHS) + : ConstantFoldBinaryInstruction(Opcode, LHS, RHS); + if (!Res) + return nullptr; + Result.push_back(Res); } return ConstantVector::get(Result); @@ -2218,9 +2227,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, : cast<FixedVectorType>(CurrIdx->getType())->getNumElements(), Factor); - NewIdxs[i] = ConstantExpr::getSRem(CurrIdx, Factor); + NewIdxs[i] = + ConstantFoldBinaryInstruction(Instruction::SRem, CurrIdx, Factor); + + Constant *Div = + ConstantFoldBinaryInstruction(Instruction::SDiv, CurrIdx, Factor); - Constant *Div = ConstantExpr::getSDiv(CurrIdx, Factor); + // We're working on either ConstantInt or vectors of ConstantInt, + // so these should always fold. + assert(NewIdxs[i] != nullptr && Div != nullptr && "Should have folded"); unsigned CommonExtendedWidth = std::max(PrevIdx->getType()->getScalarSizeInBits(), diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 0bf5e09d6647..f9800cc0c07c 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -547,8 +547,6 @@ void llvm::deleteConstant(Constant *C) { delete static_cast<InsertElementConstantExpr *>(C); else if (isa<ShuffleVectorConstantExpr>(C)) delete static_cast<ShuffleVectorConstantExpr *>(C); - else if (isa<InsertValueConstantExpr>(C)) - delete static_cast<InsertValueConstantExpr *>(C); else if (isa<GetElementPtrConstantExpr>(C)) delete static_cast<GetElementPtrConstantExpr *>(C); else if (isa<CompareConstantExpr>(C)) @@ -561,51 +559,6 @@ void llvm::deleteConstant(Constant *C) { } } -static bool canTrapImpl(const Constant *C, - SmallPtrSetImpl<const Constant *> &NonTrappingOps) { - assert(C->getType()->isFirstClassType() && - "Cannot evaluate non-first-class types!"); - // ConstantExpr or ConstantAggregate trap if any operands can trap. 
- if (isa<ConstantExpr>(C) || isa<ConstantAggregate>(C)) { - for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { - const Constant *Op = cast<Constant>(C->getOperand(i)); - if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op)) { - if (NonTrappingOps.insert(Op).second && canTrapImpl(Op, NonTrappingOps)) - return true; - } - } - } - - // The only leafs that can trap are constant expressions. - const ConstantExpr *CE = dyn_cast<ConstantExpr>(C); - if (!CE) - return false; - - // Otherwise, only specific operations can trap. - switch (CE->getOpcode()) { - default: - return false; - case Instruction::SDiv: - case Instruction::SRem: - // Signed div/rem can trap for SignedMin / -1. - if (!CE->getOperand(0)->isNotMinSignedValue() && - (!isa<ConstantInt>(CE->getOperand(1)) || - CE->getOperand(1)->isAllOnesValue())) - return true; - LLVM_FALLTHROUGH; - case Instruction::UDiv: - case Instruction::URem: - // Div and rem can trap if the RHS is not known to be non-zero. - return !isa<ConstantInt>(CE->getOperand(1)) || - CE->getOperand(1)->isNullValue(); - } -} - -bool Constant::canTrap() const { - SmallPtrSet<const Constant *, 4> NonTrappingOps; - return canTrapImpl(this, NonTrappingOps); -} - /// Check if C contains a GlobalValue for which Predicate is true. 
static bool ConstHasGlobalValuePredicate(const Constant *C, @@ -1488,14 +1441,6 @@ bool ConstantExpr::isCompare() const { return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp; } -bool ConstantExpr::hasIndices() const { - return getOpcode() == Instruction::InsertValue; -} - -ArrayRef<unsigned> ConstantExpr::getIndices() const { - return cast<InsertValueConstantExpr>(this)->Indices; -} - unsigned ConstantExpr::getPredicate() const { return cast<CompareConstantExpr>(this)->predicate; } @@ -1539,9 +1484,6 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, OnlyIfReducedTy); case Instruction::ExtractElement: return ConstantExpr::getExtractElement(Ops[0], Ops[1], OnlyIfReducedTy); - case Instruction::InsertValue: - return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices(), - OnlyIfReducedTy); case Instruction::FNeg: return ConstantExpr::getFNeg(Ops[0]); case Instruction::ShuffleVector: @@ -2324,6 +2266,8 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, // Check the operands for consistency first. 
assert(Instruction::isBinaryOp(Opcode) && "Invalid opcode in binary constant expression"); + assert(isSupportedBinOp(Opcode) && + "Binop not supported as constant expression"); assert(C1->getType() == C2->getType() && "Operand types in binary constant expression should match"); @@ -2378,6 +2322,60 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2, return pImpl->ExprConstants.getOrCreate(C1->getType(), Key); } +bool ConstantExpr::isDesirableBinOp(unsigned Opcode) { + switch (Opcode) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return false; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return true; + default: + llvm_unreachable("Argument must be binop opcode"); + } +} + +bool ConstantExpr::isSupportedBinOp(unsigned Opcode) { + switch (Opcode) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return false; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return true; + default: + llvm_unreachable("Argument must be binop opcode"); + } +} + Constant *ConstantExpr::getSizeOf(Type* Ty) { // sizeof is implemented as: (i64) gep (Ty*)null, 1 // Note that a non-inbounds gep is used, as null isn't within any object. 
@@ -2517,7 +2515,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, if (InRangeIndex && *InRangeIndex < 63) SubClassOptionalData |= (*InRangeIndex + 1) << 1; const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0, - SubClassOptionalData, None, None, Ty); + SubClassOptionalData, None, Ty); LLVMContextImpl *pImpl = C->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ReqTy, Key); @@ -2638,36 +2636,12 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, // Look up the constant in the table first to ensure uniqueness Constant *ArgVec[] = {V1, V2}; - ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec, 0, 0, None, Mask); + ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec, 0, 0, Mask); LLVMContextImpl *pImpl = ShufTy->getContext().pImpl; return pImpl->ExprConstants.getOrCreate(ShufTy, Key); } -Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val, - ArrayRef<unsigned> Idxs, - Type *OnlyIfReducedTy) { - assert(Agg->getType()->isFirstClassType() && - "Non-first-class type for constant insertvalue expression"); - - assert(ExtractValueInst::getIndexedType(Agg->getType(), - Idxs) == Val->getType() && - "insertvalue indices invalid!"); - Type *ReqTy = Val->getType(); - - if (Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs)) - return FC; - - if (OnlyIfReducedTy == ReqTy) - return nullptr; - - Constant *ArgVec[] = { Agg, Val }; - const ConstantExprKeyType Key(Instruction::InsertValue, ArgVec, 0, 0, Idxs); - - LLVMContextImpl *pImpl = Agg->getContext().pImpl; - return pImpl->ExprConstants.getOrCreate(ReqTy, Key); -} - Constant *ConstantExpr::getNeg(Constant *C, bool HasNUW, bool HasNSW) { assert(C->getType()->isIntOrIntVectorTy() && "Cannot NEG a nonintegral value!"); @@ -2694,10 +2668,6 @@ Constant *ConstantExpr::getAdd(Constant *C1, Constant *C2, return get(Instruction::Add, C1, C2, Flags); } -Constant *ConstantExpr::getFAdd(Constant *C1, Constant *C2) { - return 
get(Instruction::FAdd, C1, C2); -} - Constant *ConstantExpr::getSub(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | @@ -2705,10 +2675,6 @@ Constant *ConstantExpr::getSub(Constant *C1, Constant *C2, return get(Instruction::Sub, C1, C2, Flags); } -Constant *ConstantExpr::getFSub(Constant *C1, Constant *C2) { - return get(Instruction::FSub, C1, C2); -} - Constant *ConstantExpr::getMul(Constant *C1, Constant *C2, bool HasNUW, bool HasNSW) { unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) | @@ -2716,36 +2682,6 @@ Constant *ConstantExpr::getMul(Constant *C1, Constant *C2, return get(Instruction::Mul, C1, C2, Flags); } -Constant *ConstantExpr::getFMul(Constant *C1, Constant *C2) { - return get(Instruction::FMul, C1, C2); -} - -Constant *ConstantExpr::getUDiv(Constant *C1, Constant *C2, bool isExact) { - return get(Instruction::UDiv, C1, C2, - isExact ? PossiblyExactOperator::IsExact : 0); -} - -Constant *ConstantExpr::getSDiv(Constant *C1, Constant *C2, bool isExact) { - return get(Instruction::SDiv, C1, C2, - isExact ? 
PossiblyExactOperator::IsExact : 0); -} - -Constant *ConstantExpr::getFDiv(Constant *C1, Constant *C2) { - return get(Instruction::FDiv, C1, C2); -} - -Constant *ConstantExpr::getURem(Constant *C1, Constant *C2) { - return get(Instruction::URem, C1, C2); -} - -Constant *ConstantExpr::getSRem(Constant *C1, Constant *C2) { - return get(Instruction::SRem, C1, C2); -} - -Constant *ConstantExpr::getFRem(Constant *C1, Constant *C2) { - return get(Instruction::FRem, C1, C2); -} - Constant *ConstantExpr::getAnd(Constant *C1, Constant *C2) { return get(Instruction::And, C1, C2); } @@ -3517,9 +3453,6 @@ Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const { return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "", InsertBefore); case Instruction::ExtractElement: return ExtractElementInst::Create(Ops[0], Ops[1], "", InsertBefore); - case Instruction::InsertValue: - return InsertValueInst::Create(Ops[0], Ops[1], getIndices(), "", - InsertBefore); case Instruction::ShuffleVector: return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "", InsertBefore); diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 21ef1c0d9f64..1d74e2d49f35 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -209,37 +209,6 @@ public: } }; -/// InsertValueConstantExpr - This class is private to -/// Constants.cpp, and is used behind the scenes to implement -/// insertvalue constant exprs. 
-class InsertValueConstantExpr final : public ConstantExpr { -public: - InsertValueConstantExpr(Constant *Agg, Constant *Val, - ArrayRef<unsigned> IdxList, Type *DestTy) - : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2), - Indices(IdxList.begin(), IdxList.end()) { - Op<0>() = Agg; - Op<1>() = Val; - } - - // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 2); } - void operator delete(void *Ptr) { User::operator delete(Ptr); } - - /// Indices - These identify the position for the insertion. - const SmallVector<unsigned, 4> Indices; - - /// Transparently provide more efficient getOperand methods. - DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - - static bool classof(const ConstantExpr *CE) { - return CE->getOpcode() == Instruction::InsertValue; - } - static bool classof(const Value *V) { - return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); - } -}; - /// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is /// used behind the scenes to implement getelementpr constant exprs. 
class GetElementPtrConstantExpr final : public ConstantExpr { @@ -333,11 +302,6 @@ struct OperandTraits<ShuffleVectorConstantExpr> DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value) template <> -struct OperandTraits<InsertValueConstantExpr> - : public FixedNumOperandTraits<InsertValueConstantExpr, 2> {}; -DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueConstantExpr, Value) - -template <> struct OperandTraits<GetElementPtrConstantExpr> : public VariadicOperandTraits<GetElementPtrConstantExpr, 1> {}; @@ -472,7 +436,6 @@ private: uint8_t SubclassOptionalData; uint16_t SubclassData; ArrayRef<Constant *> Ops; - ArrayRef<unsigned> Indexes; ArrayRef<int> ShuffleMask; Type *ExplicitTy; @@ -482,12 +445,6 @@ private: return None; } - static ArrayRef<unsigned> getIndicesIfValid(const ConstantExpr *CE) { - if (CE->hasIndices()) - return CE->getIndices(); - return None; - } - static Type *getSourceElementTypeIfValid(const ConstantExpr *CE) { if (auto *GEPCE = dyn_cast<GetElementPtrConstantExpr>(CE)) return GEPCE->getSourceElementType(); @@ -498,18 +455,17 @@ public: ConstantExprKeyType(unsigned Opcode, ArrayRef<Constant *> Ops, unsigned short SubclassData = 0, unsigned short SubclassOptionalData = 0, - ArrayRef<unsigned> Indexes = None, ArrayRef<int> ShuffleMask = None, Type *ExplicitTy = nullptr) : Opcode(Opcode), SubclassOptionalData(SubclassOptionalData), - SubclassData(SubclassData), Ops(Ops), Indexes(Indexes), - ShuffleMask(ShuffleMask), ExplicitTy(ExplicitTy) {} + SubclassData(SubclassData), Ops(Ops), ShuffleMask(ShuffleMask), + ExplicitTy(ExplicitTy) {} ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE) : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? 
CE->getPredicate() : 0), Ops(Operands), - Indexes(getIndicesIfValid(CE)), ShuffleMask(getShuffleMaskIfValid(CE)), + ShuffleMask(getShuffleMaskIfValid(CE)), ExplicitTy(getSourceElementTypeIfValid(CE)) {} ConstantExprKeyType(const ConstantExpr *CE, @@ -517,7 +473,7 @@ public: : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? CE->getPredicate() : 0), - Indexes(getIndicesIfValid(CE)), ShuffleMask(getShuffleMaskIfValid(CE)), + ShuffleMask(getShuffleMaskIfValid(CE)), ExplicitTy(getSourceElementTypeIfValid(CE)) { assert(Storage.empty() && "Expected empty storage"); for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I) @@ -528,8 +484,7 @@ public: bool operator==(const ConstantExprKeyType &X) const { return Opcode == X.Opcode && SubclassData == X.SubclassData && SubclassOptionalData == X.SubclassOptionalData && Ops == X.Ops && - Indexes == X.Indexes && ShuffleMask == X.ShuffleMask && - ExplicitTy == X.ExplicitTy; + ShuffleMask == X.ShuffleMask && ExplicitTy == X.ExplicitTy; } bool operator==(const ConstantExpr *CE) const { @@ -544,8 +499,6 @@ public: for (unsigned I = 0, E = Ops.size(); I != E; ++I) if (Ops[I] != CE->getOperand(I)) return false; - if (Indexes != getIndicesIfValid(CE)) - return false; if (ShuffleMask != getShuffleMaskIfValid(CE)) return false; if (ExplicitTy != getSourceElementTypeIfValid(CE)) @@ -557,7 +510,6 @@ public: return hash_combine( Opcode, SubclassOptionalData, SubclassData, hash_combine_range(Ops.begin(), Ops.end()), - hash_combine_range(Indexes.begin(), Indexes.end()), hash_combine_range(ShuffleMask.begin(), ShuffleMask.end()), ExplicitTy); } @@ -583,8 +535,6 @@ public: return new InsertElementConstantExpr(Ops[0], Ops[1], Ops[2]); case Instruction::ShuffleVector: return new ShuffleVectorConstantExpr(Ops[0], Ops[1], ShuffleMask); - case Instruction::InsertValue: - return new InsertValueConstantExpr(Ops[0], Ops[1], Indexes, Ty); case Instruction::GetElementPtr: return 
GetElementPtrConstantExpr::Create(ExplicitTy, Ops[0], Ops.slice(1), Ty, SubclassOptionalData); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 4b9189ca5baa..08b7b0e1f956 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -74,13 +74,16 @@ void LLVMDisposeMessage(char *Message) { /*===-- Operations on contexts --------------------------------------------===*/ -static ManagedStatic<LLVMContext> GlobalContext; +static LLVMContext &getGlobalContext() { + static LLVMContext GlobalContext; + return GlobalContext; +} LLVMContextRef LLVMContextCreate() { return wrap(new LLVMContext()); } -LLVMContextRef LLVMGetGlobalContext() { return wrap(&*GlobalContext); } +LLVMContextRef LLVMGetGlobalContext() { return wrap(&getGlobalContext()); } void LLVMContextSetDiagnosticHandler(LLVMContextRef C, LLVMDiagnosticHandler Handler, @@ -251,7 +254,7 @@ LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI) { /*===-- Operations on modules ---------------------------------------------===*/ LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) { - return wrap(new Module(ModuleID, *GlobalContext)); + return wrap(new Module(ModuleID, getGlobalContext())); } LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID, @@ -1571,11 +1574,6 @@ LLVMValueRef LLVMConstNUWAdd(LLVMValueRef LHSConstant, unwrap<Constant>(RHSConstant))); } -LLVMValueRef LLVMConstFAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getFAdd(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - LLVMValueRef LLVMConstSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getSub(unwrap<Constant>(LHSConstant), unwrap<Constant>(RHSConstant))); @@ -1593,11 +1591,6 @@ LLVMValueRef LLVMConstNUWSub(LLVMValueRef LHSConstant, unwrap<Constant>(RHSConstant))); } -LLVMValueRef LLVMConstFSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return 
wrap(ConstantExpr::getFSub(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - LLVMValueRef LLVMConstMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getMul(unwrap<Constant>(LHSConstant), unwrap<Constant>(RHSConstant))); @@ -1615,53 +1608,6 @@ LLVMValueRef LLVMConstNUWMul(LLVMValueRef LHSConstant, unwrap<Constant>(RHSConstant))); } -LLVMValueRef LLVMConstFMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getFMul(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getUDiv(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstExactUDiv(LLVMValueRef LHSConstant, - LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getExactUDiv(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getSDiv(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstExactSDiv(LLVMValueRef LHSConstant, - LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getExactSDiv(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstFDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getFDiv(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstURem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getURem(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstSRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { - return wrap(ConstantExpr::getSRem(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - -LLVMValueRef LLVMConstFRem(LLVMValueRef LHSConstant, LLVMValueRef 
RHSConstant) { - return wrap(ConstantExpr::getFRem(unwrap<Constant>(LHSConstant), - unwrap<Constant>(RHSConstant))); -} - LLVMValueRef LLVMConstAnd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getAnd(unwrap<Constant>(LHSConstant), unwrap<Constant>(RHSConstant))); @@ -1875,14 +1821,6 @@ LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant, IntMask)); } -LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant, - LLVMValueRef ElementValueConstant, - unsigned *IdxList, unsigned NumIdx) { - return wrap(ConstantExpr::getInsertValue(unwrap<Constant>(AggConstant), - unwrap<Constant>(ElementValueConstant), - makeArrayRef(IdxList, NumIdx))); -} - LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString, const char *Constraints, LLVMBool HasSideEffects, @@ -2843,6 +2781,10 @@ void LLVMInstructionEraseFromParent(LLVMValueRef Inst) { unwrap<Instruction>(Inst)->eraseFromParent(); } +void LLVMDeleteInstruction(LLVMValueRef Inst) { + unwrap<Instruction>(Inst)->deleteValue(); +} + LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) { if (ICmpInst *I = dyn_cast<ICmpInst>(unwrap(Inst))) return (LLVMIntPredicate)I->getPredicate(); @@ -3079,8 +3021,6 @@ unsigned LLVMGetNumIndices(LLVMValueRef Inst) { return EV->getNumIndices(); if (auto *IV = dyn_cast<InsertValueInst>(I)) return IV->getNumIndices(); - if (auto *CE = dyn_cast<ConstantExpr>(I)) - return CE->getIndices().size(); llvm_unreachable( "LLVMGetNumIndices applies only to extractvalue and insertvalue!"); } @@ -3091,8 +3031,6 @@ const unsigned *LLVMGetIndices(LLVMValueRef Inst) { return EV->getIndices().data(); if (auto *IV = dyn_cast<InsertValueInst>(I)) return IV->getIndices().data(); - if (auto *CE = dyn_cast<ConstantExpr>(I)) - return CE->getIndices().data(); llvm_unreachable( "LLVMGetIndices applies only to extractvalue and insertvalue!"); } @@ -3664,6 +3602,8 @@ static AtomicRMWInst::BinOp mapFromLLVMRMWBinOp(LLVMAtomicRMWBinOp BinOp) { case 
LLVMAtomicRMWBinOpUMin: return AtomicRMWInst::UMin; case LLVMAtomicRMWBinOpFAdd: return AtomicRMWInst::FAdd; case LLVMAtomicRMWBinOpFSub: return AtomicRMWInst::FSub; + case LLVMAtomicRMWBinOpFMax: return AtomicRMWInst::FMax; + case LLVMAtomicRMWBinOpFMin: return AtomicRMWInst::FMin; } llvm_unreachable("Invalid LLVMAtomicRMWBinOp value!"); @@ -3684,6 +3624,8 @@ static LLVMAtomicRMWBinOp mapToLLVMRMWBinOp(AtomicRMWInst::BinOp BinOp) { case AtomicRMWInst::UMin: return LLVMAtomicRMWBinOpUMin; case AtomicRMWInst::FAdd: return LLVMAtomicRMWBinOpFAdd; case AtomicRMWInst::FSub: return LLVMAtomicRMWBinOpFSub; + case AtomicRMWInst::FMax: return LLVMAtomicRMWBinOpFMax; + case AtomicRMWInst::FMin: return LLVMAtomicRMWBinOpFMin; default: break; } diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp index 203ad6dae1ff..c75b1aa7c1d6 100644 --- a/llvm/lib/IR/InlineAsm.cpp +++ b/llvm/lib/IR/InlineAsm.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" #include <algorithm> #include <cassert> #include <cctype> @@ -33,9 +34,10 @@ InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString, AsmString(asmString), Constraints(constraints), FTy(FTy), HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), Dialect(asmDialect), CanThrow(canThrow) { +#ifndef NDEBUG // Do various checks on the constraint string and type. - assert(Verify(getFunctionType(), constraints) && - "Function type not legal for constraints!"); + cantFail(verify(getFunctionType(), constraints)); +#endif } InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString, @@ -248,15 +250,19 @@ InlineAsm::ParseConstraints(StringRef Constraints) { return Result; } -/// Verify - Verify that the specified constraint string is reasonable for the -/// specified function type, and otherwise validate the constraint string. 
-bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { - if (Ty->isVarArg()) return false; +static Error makeStringError(const char *Msg) { + return createStringError(errc::invalid_argument, Msg); +} + +Error InlineAsm::verify(FunctionType *Ty, StringRef ConstStr) { + if (Ty->isVarArg()) + return makeStringError("inline asm cannot be variadic"); ConstraintInfoVector Constraints = ParseConstraints(ConstStr); // Error parsing constraints. - if (Constraints.empty() && !ConstStr.empty()) return false; + if (Constraints.empty() && !ConstStr.empty()) + return makeStringError("failed to parse constraints"); unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0; unsigned NumIndirect = 0; @@ -265,7 +271,9 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { switch (Constraint.Type) { case InlineAsm::isOutput: if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0) - return false; // outputs before inputs and clobbers. + return makeStringError("output constraint occurs after input " + "or clobber constraint"); + if (!Constraint.isIndirect) { ++NumOutputs; break; @@ -273,7 +281,9 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { ++NumIndirect; LLVM_FALLTHROUGH; // We fall through for Indirect Outputs. case InlineAsm::isInput: - if (NumClobbers) return false; // inputs before clobbers. 
+ if (NumClobbers) + return makeStringError("input constraint occurs after clobber " + "constraint"); ++NumInputs; break; case InlineAsm::isClobber: @@ -284,18 +294,23 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { switch (NumOutputs) { case 0: - if (!Ty->getReturnType()->isVoidTy()) return false; + if (!Ty->getReturnType()->isVoidTy()) + return makeStringError("inline asm without outputs must return void"); break; case 1: - if (Ty->getReturnType()->isStructTy()) return false; + if (Ty->getReturnType()->isStructTy()) + return makeStringError("inline asm with one output cannot return struct"); break; default: StructType *STy = dyn_cast<StructType>(Ty->getReturnType()); if (!STy || STy->getNumElements() != NumOutputs) - return false; + return makeStringError("number of output constraints does not match " + "number of return struct elements"); break; } - if (Ty->getNumParams() != NumInputs) return false; - return true; + if (Ty->getNumParams() != NumInputs) + return makeStringError("number of input constraints does not match number " + "of parameters"); + return Error::success(); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 6a91edb75dd2..b333f40f3ce9 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1696,6 +1696,10 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) { return "fadd"; case AtomicRMWInst::FSub: return "fsub"; + case AtomicRMWInst::FMax: + return "fmax"; + case AtomicRMWInst::FMin: + return "fmin"; case AtomicRMWInst::BAD_BINOP: return "<invalid operation>"; } @@ -4423,10 +4427,9 @@ MDNode *SwitchInstProfUpdateWrapper::buildProfBranchWeightsMD() { assert(SI.getNumSuccessors() == Weights->size() && "num of prof branch_weights must accord with num of successors"); - bool AllZeroes = - all_of(Weights.getValue(), [](uint32_t W) { return W == 0; }); + bool AllZeroes = all_of(Weights.value(), [](uint32_t W) { return W == 0; }); - if (AllZeroes || Weights.getValue().size() < 
2) + if (AllZeroes || Weights.value().size() < 2) return nullptr; return MDBuilder(SI.getParent()->getContext()).createBranchWeights(*Weights); @@ -4460,8 +4463,8 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { // Copy the last case to the place of the removed one and shrink. // This is tightly coupled with the way SwitchInst::removeCase() removes // the cases in SwitchInst::removeCase(CaseIt). - Weights.getValue()[I->getCaseIndex() + 1] = Weights.getValue().back(); - Weights.getValue().pop_back(); + Weights.value()[I->getCaseIndex() + 1] = Weights.value().back(); + Weights.value().pop_back(); } return SI.removeCase(I); } @@ -4474,10 +4477,10 @@ void SwitchInstProfUpdateWrapper::addCase( if (!Weights && W && *W) { Changed = true; Weights = SmallVector<uint32_t, 8>(SI.getNumSuccessors(), 0); - Weights.getValue()[SI.getNumSuccessors() - 1] = *W; + Weights.value()[SI.getNumSuccessors() - 1] = *W; } else if (Weights) { Changed = true; - Weights.getValue().push_back(W.value_or(0)); + Weights.value().push_back(W.value_or(0)); } if (Weights) assert(SI.getNumSuccessors() == Weights->size() && diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index b132a9dcb812..65a9a32ad2c5 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -223,13 +223,13 @@ ConstrainedFPIntrinsic::getExceptionBehavior() const { bool ConstrainedFPIntrinsic::isDefaultFPEnvironment() const { Optional<fp::ExceptionBehavior> Except = getExceptionBehavior(); if (Except) { - if (Except.getValue() != fp::ebIgnore) + if (Except.value() != fp::ebIgnore) return false; } Optional<RoundingMode> Rounding = getRoundingMode(); if (Rounding) { - if (Rounding.getValue() != RoundingMode::NearestTiesToEven) + if (Rounding.value() != RoundingMode::NearestTiesToEven) return false; } @@ -364,13 +364,13 @@ VPIntrinsic::getVectorLengthParamPos(Intrinsic::ID IntrinsicID) { MaybeAlign VPIntrinsic::getPointerAlignment() const { Optional<unsigned> PtrParamOpt 
= getMemoryPointerParamPos(getIntrinsicID()); assert(PtrParamOpt && "no pointer argument!"); - return getParamAlign(PtrParamOpt.getValue()); + return getParamAlign(PtrParamOpt.value()); } /// \return The pointer operand of this load,store, gather or scatter. Value *VPIntrinsic::getMemoryPointerParam() const { if (auto PtrParamOpt = getMemoryPointerParamPos(getIntrinsicID())) - return getArgOperand(PtrParamOpt.getValue()); + return getArgOperand(PtrParamOpt.value()); return nullptr; } @@ -391,7 +391,7 @@ Value *VPIntrinsic::getMemoryDataParam() const { auto DataParamOpt = getMemoryDataParamPos(getIntrinsicID()); if (!DataParamOpt) return nullptr; - return getArgOperand(DataParamOpt.getValue()); + return getArgOperand(DataParamOpt.value()); } Optional<unsigned> VPIntrinsic::getMemoryDataParamPos(Intrinsic::ID VPID) { diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 06b3a3afef9d..d7aaf0008564 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -27,7 +27,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TypeSize.h" #include <cassert> #include <utility> @@ -241,7 +240,7 @@ void LLVMContextImpl::getSyncScopeNames( /// singleton OptBisect if not explicitly set. 
OptPassGate &LLVMContextImpl::getOptPassGate() const { if (!OPG) - OPG = &(*OptBisector); + OPG = &getOptBisector(); return *OPG; } @@ -260,7 +259,7 @@ bool LLVMContextImpl::getOpaquePointers() { } void LLVMContextImpl::setOpaquePointers(bool OP) { - assert((!OpaquePointers || OpaquePointers.getValue() == OP) && + assert((!OpaquePointers || OpaquePointers.value() == OP) && "Cannot change opaque pointers mode once set"); OpaquePointers = OP; } diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index ae2401026ebf..2a1a514922fd 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -592,13 +592,6 @@ MDNode::Header::~Header() { (void)(O - 1)->~MDOperand(); } -void *MDNode::Header::getLargePtr() const { - static_assert(alignof(LargeStorageVector) <= alignof(Header), - "LargeStorageVector too strongly aligned"); - return reinterpret_cast<char *>(const_cast<Header *>(this)) - - sizeof(LargeStorageVector); -} - void *MDNode::Header::getSmallPtr() { static_assert(alignof(MDOperand) <= alignof(Header), "MDOperand too strongly aligned"); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 5cd74d53da75..b51ea45f651a 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -714,6 +714,18 @@ void Module::setStackProtectorGuardReg(StringRef Reg) { addModuleFlag(ModFlagBehavior::Error, "stack-protector-guard-reg", ID); } +StringRef Module::getStackProtectorGuardSymbol() const { + Metadata *MD = getModuleFlag("stack-protector-guard-symbol"); + if (auto *MDS = dyn_cast_or_null<MDString>(MD)) + return MDS->getString(); + return {}; +} + +void Module::setStackProtectorGuardSymbol(StringRef Symbol) { + MDString *ID = MDString::get(getContext(), Symbol); + addModuleFlag(ModFlagBehavior::Error, "stack-protector-guard-symbol", ID); +} + int Module::getStackProtectorGuardOffset() const { Metadata *MD = getModuleFlag("stack-protector-guard-offset"); if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD)) diff --git 
a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp index 418311eac814..c9054dba344a 100644 --- a/llvm/lib/IR/OptBisect.cpp +++ b/llvm/lib/IR/OptBisect.cpp @@ -23,7 +23,7 @@ using namespace llvm; static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden, cl::init(OptBisect::Disabled), cl::Optional, cl::cb<void, int>([](int Limit) { - llvm::OptBisector->setLimit(Limit); + llvm::getOptBisector().setLimit(Limit); }), cl::desc("Maximum optimization to perform")); @@ -52,4 +52,7 @@ bool OptBisect::checkPass(const StringRef PassName, const int OptBisect::Disabled; -ManagedStatic<OptBisect> llvm::OptBisector; +OptBisect &llvm::getOptBisector() { + static OptBisect OptBisector; + return OptBisector; +} diff --git a/llvm/lib/IR/PassRegistry.cpp b/llvm/lib/IR/PassRegistry.cpp index 94f607afec47..6c22fcd34769 100644 --- a/llvm/lib/IR/PassRegistry.cpp +++ b/llvm/lib/IR/PassRegistry.cpp @@ -15,21 +15,15 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Pass.h" #include "llvm/PassInfo.h" -#include "llvm/Support/ManagedStatic.h" #include <cassert> #include <memory> #include <utility> using namespace llvm; -// FIXME: We use ManagedStatic to erase the pass registrar on shutdown. -// Unfortunately, passes are registered with static ctors, and having -// llvm_shutdown clear this map prevents successful resurrection after -// llvm_shutdown is run. Ideally we should find a solution so that we don't -// leak the map, AND can still resurrect after shutdown. 
-static ManagedStatic<PassRegistry> PassRegistryObj; PassRegistry *PassRegistry::getPassRegistry() { - return &*PassRegistryObj; + static PassRegistry PassRegistryObj; + return &PassRegistryObj; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp index 71189e79360e..4edaeb74d6a7 100644 --- a/llvm/lib/InterfaceStub/IFSHandler.cpp +++ b/llvm/lib/InterfaceStub/IFSHandler.cpp @@ -202,8 +202,8 @@ Error ifs::writeIFSToOutputStream(raw_ostream &OS, const IFSStub &Stub) { yaml::Output YamlOut(OS, nullptr, /*WrapColumn =*/0); std::unique_ptr<IFSStubTriple> CopyStub(new IFSStubTriple(Stub)); if (Stub.Target.Arch) { - CopyStub->Target.ArchString = std::string( - ELF::convertEMachineToArchName(Stub.Target.Arch.getValue())); + CopyStub->Target.ArchString = + std::string(ELF::convertEMachineToArchName(Stub.Target.Arch.value())); } IFSTarget Target = Stub.Target; @@ -222,36 +222,35 @@ Error ifs::overrideIFSTarget(IFSStub &Stub, Optional<IFSArch> OverrideArch, Optional<std::string> OverrideTriple) { std::error_code OverrideEC(1, std::generic_category()); if (OverrideArch) { - if (Stub.Target.Arch && - Stub.Target.Arch.getValue() != OverrideArch.getValue()) { + if (Stub.Target.Arch && Stub.Target.Arch.value() != OverrideArch.value()) { return make_error<StringError>( "Supplied Arch conflicts with the text stub", OverrideEC); } - Stub.Target.Arch = OverrideArch.getValue(); + Stub.Target.Arch = OverrideArch.value(); } if (OverrideEndianness) { if (Stub.Target.Endianness && - Stub.Target.Endianness.getValue() != OverrideEndianness.getValue()) { + Stub.Target.Endianness.value() != OverrideEndianness.value()) { return make_error<StringError>( "Supplied Endianness conflicts with the text stub", OverrideEC); } - Stub.Target.Endianness = OverrideEndianness.getValue(); + Stub.Target.Endianness = OverrideEndianness.value(); } if (OverrideBitWidth) { if 
(Stub.Target.BitWidth && - Stub.Target.BitWidth.getValue() != OverrideBitWidth.getValue()) { + Stub.Target.BitWidth.value() != OverrideBitWidth.value()) { return make_error<StringError>( "Supplied BitWidth conflicts with the text stub", OverrideEC); } - Stub.Target.BitWidth = OverrideBitWidth.getValue(); + Stub.Target.BitWidth = OverrideBitWidth.value(); } if (OverrideTriple) { if (Stub.Target.Triple && - Stub.Target.Triple.getValue() != OverrideTriple.getValue()) { + Stub.Target.Triple.value() != OverrideTriple.value()) { return make_error<StringError>( "Supplied Triple conflicts with the text stub", OverrideEC); } - Stub.Target.Triple = OverrideTriple.getValue(); + Stub.Target.Triple = OverrideTriple.value(); } return Error::success(); } diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 5d50e92ae377..e248e58e4e4e 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -81,17 +81,19 @@ extern cl::opt<bool> NoPGOWarnMismatch; exit(1); } -Error Config::addSaveTemps(std::string OutputFileName, - bool UseInputModulePath) { +Error Config::addSaveTemps(std::string OutputFileName, bool UseInputModulePath, + const DenseSet<StringRef> &SaveTempsArgs) { ShouldDiscardValueNames = false; std::error_code EC; - ResolutionFile = - std::make_unique<raw_fd_ostream>(OutputFileName + "resolution.txt", EC, - sys::fs::OpenFlags::OF_TextWithCRLF); - if (EC) { - ResolutionFile.reset(); - return errorCodeToError(EC); + if (SaveTempsArgs.empty() || SaveTempsArgs.contains("resolution")) { + ResolutionFile = + std::make_unique<raw_fd_ostream>(OutputFileName + "resolution.txt", EC, + sys::fs::OpenFlags::OF_TextWithCRLF); + if (EC) { + ResolutionFile.reset(); + return errorCodeToError(EC); + } } auto setHook = [&](std::string PathSuffix, ModuleHookFn &Hook) { @@ -125,14 +127,7 @@ Error Config::addSaveTemps(std::string OutputFileName, }; }; - setHook("0.preopt", PreOptModuleHook); - setHook("1.promote", PostPromoteModuleHook); - 
setHook("2.internalize", PostInternalizeModuleHook); - setHook("3.import", PostImportModuleHook); - setHook("4.opt", PostOptModuleHook); - setHook("5.precodegen", PreCodeGenModuleHook); - - CombinedIndexHook = + auto SaveCombinedIndex = [=](const ModuleSummaryIndex &Index, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) { std::string Path = OutputFileName + "index.bc"; @@ -152,6 +147,31 @@ Error Config::addSaveTemps(std::string OutputFileName, return true; }; + if (SaveTempsArgs.empty()) { + setHook("0.preopt", PreOptModuleHook); + setHook("1.promote", PostPromoteModuleHook); + setHook("2.internalize", PostInternalizeModuleHook); + setHook("3.import", PostImportModuleHook); + setHook("4.opt", PostOptModuleHook); + setHook("5.precodegen", PreCodeGenModuleHook); + CombinedIndexHook = SaveCombinedIndex; + } else { + if (SaveTempsArgs.contains("preopt")) + setHook("0.preopt", PreOptModuleHook); + if (SaveTempsArgs.contains("promote")) + setHook("1.promote", PostPromoteModuleHook); + if (SaveTempsArgs.contains("internalize")) + setHook("2.internalize", PostInternalizeModuleHook); + if (SaveTempsArgs.contains("import")) + setHook("3.import", PostImportModuleHook); + if (SaveTempsArgs.contains("opt")) + setHook("4.opt", PostOptModuleHook); + if (SaveTempsArgs.contains("precodegen")) + setHook("5.precodegen", PreCodeGenModuleHook); + if (SaveTempsArgs.contains("combinedindex")) + CombinedIndexHook = SaveCombinedIndex; + } + return Error::success(); } diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 5a819e2d736c..9e89cce8312e 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1357,11 +1357,18 @@ Error IRLinker::linkModuleFlagsMetadata() { DstM.getModuleIdentifier() + "'"); } - auto replaceDstValue = [&](MDNode *New) { + auto ensureDistinctOp = [&](MDNode *DstValue) { + assert(isa<MDTuple>(DstValue) && + "Expected MDTuple when appending module flags"); + if (DstValue->isDistinct()) + return 
dyn_cast<MDTuple>(DstValue); + MDTuple *New = MDTuple::getDistinct( + DstM.getContext(), SmallVector<Metadata *, 4>(DstValue->operands())); Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); + MDNode *Flag = MDTuple::getDistinct(DstM.getContext(), FlagOps); DstModFlags->setOperand(DstIndex, Flag); Flags[ID].first = Flag; + return New; }; // Emit a warning if the values differ and either source or destination @@ -1438,25 +1445,20 @@ Error IRLinker::linkModuleFlagsMetadata() { break; } case Module::Append: { - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDTuple *DstValue = ensureDistinctOp(cast<MDNode>(DstOp->getOperand(2))); MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); - SmallVector<Metadata *, 8> MDs; - MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); - MDs.append(DstValue->op_begin(), DstValue->op_end()); - MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM.getContext(), MDs)); + for (const auto &O : SrcValue->operands()) + DstValue->push_back(O); break; } case Module::AppendUnique: { SmallSetVector<Metadata *, 16> Elts; - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDTuple *DstValue = ensureDistinctOp(cast<MDNode>(DstOp->getOperand(2))); MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); Elts.insert(DstValue->op_begin(), DstValue->op_end()); Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM.getContext(), - makeArrayRef(Elts.begin(), Elts.end()))); + for (auto I = DstValue->getNumOperands(); I < Elts.size(); I++) + DstValue->push_back(Elts[I]); break; } } diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index eda495693595..78204ffe4c3b 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -145,7 +145,7 @@ struct ELFWriter { uint64_t align(unsigned Alignment); bool maybeWriteCompression(uint64_t Size, - 
SmallVectorImpl<char> &CompressedContents, + SmallVectorImpl<uint8_t> &CompressedContents, bool ZLibStyle, unsigned Alignment); public: @@ -819,7 +819,7 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx, // Include the debug info compression header. bool ELFWriter::maybeWriteCompression( - uint64_t Size, SmallVectorImpl<char> &CompressedContents, bool ZLibStyle, + uint64_t Size, SmallVectorImpl<uint8_t> &CompressedContents, bool ZLibStyle, unsigned Alignment) { if (ZLibStyle) { uint64_t HdrSize = @@ -875,9 +875,11 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, raw_svector_ostream VecOS(UncompressedData); Asm.writeSectionData(VecOS, &Section, Layout); - SmallVector<char, 128> CompressedContents; - zlib::compress(StringRef(UncompressedData.data(), UncompressedData.size()), - CompressedContents); + SmallVector<uint8_t, 128> CompressedContents; + compression::zlib::compress( + makeArrayRef(reinterpret_cast<uint8_t *>(UncompressedData.data()), + UncompressedData.size()), + CompressedContents); bool ZlibStyle = MAI->compressDebugSections() == DebugCompressionType::Z; if (!maybeWriteCompression(UncompressedData.size(), CompressedContents, @@ -896,7 +898,7 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, // Add "z" prefix to section name. This is zlib-gnu style. MC.renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str()); } - W.OS << CompressedContents; + W.OS << toStringRef(CompressedContents); } void ELFWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags, diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 4be84ca7feb5..d312e3521c9e 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -773,7 +773,7 @@ MCSectionXCOFF *MCContext::getXCOFFSection( // Do the lookup. If we have a hit, return it. auto IterBool = XCOFFUniquingMap.insert(std::make_pair( IsDwarfSec - ? 
XCOFFSectionKey(Section.str(), DwarfSectionSubtypeFlags.getValue()) + ? XCOFFSectionKey(Section.str(), DwarfSectionSubtypeFlags.value()) : XCOFFSectionKey(Section.str(), CsectProp->MappingClass), nullptr)); auto &Entry = *IterBool.first; @@ -806,7 +806,7 @@ MCSectionXCOFF *MCContext::getXCOFFSection( if (IsDwarfSec) Result = new (XCOFFAllocator.Allocate()) MCSectionXCOFF(QualName->getUnqualifiedName(), Kind, QualName, - DwarfSectionSubtypeFlags.getValue(), Begin, CachedName, + DwarfSectionSubtypeFlags.value(), Begin, CachedName, MultiSymbolsAllowed); else Result = new (XCOFFAllocator.Allocate()) diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp index 0c041186936d..cf98cb8ff59f 100644 --- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp +++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp @@ -88,8 +88,8 @@ bool XCOFFSymbolInfo::operator<(const XCOFFSymbolInfo &SymInfo) const { return SymInfo.StorageMappingClass.has_value(); if (StorageMappingClass) { - return getSMCPriority(StorageMappingClass.getValue()) < - getSMCPriority(SymInfo.StorageMappingClass.getValue()); + return getSMCPriority(StorageMappingClass.value()) < + getSMCPriority(SymInfo.StorageMappingClass.value()); } return false; diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 04a234be3b47..563d3487ef50 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -660,6 +660,8 @@ EndStmt: Type = ELF::SHT_LLVM_SYMPART; else if (TypeName == "llvm_bb_addr_map") Type = ELF::SHT_LLVM_BB_ADDR_MAP; + else if (TypeName == "llvm_offloading") + Type = ELF::SHT_LLVM_OFFLOADING; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 8c582d225e30..694ea395fdec 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1585,6 
+1585,16 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc); return false; } + // Parse directional local label references. + if (Identifier.equals_insensitive("@b") || + Identifier.equals_insensitive("@f")) { + bool Before = Identifier.equals_insensitive("@b"); + MCSymbol *Sym = getContext().getDirectionalLocalSymbol(0, Before); + if (Before && Sym->isUndefined()) + return Error(FirstTokenLoc, "Expected @@ label before @B reference"); + Res = MCSymbolRefExpr::create(Sym, getContext()); + return false; + } // Parse symbol variant. std::pair<StringRef, StringRef> Split; if (!MAI.useParensForSymbolVariant()) { @@ -1714,34 +1724,10 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, case AsmToken::BigNum: return TokError("literal value out of range for directive"); case AsmToken::Integer: { - SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); Res = MCConstantExpr::create(IntVal, getContext()); EndLoc = Lexer.getTok().getEndLoc(); Lex(); // Eat token. - // Look for 'b' or 'f' following an Integer as a directional label. - if (Lexer.getKind() == AsmToken::Identifier) { - StringRef IDVal = getTok().getString(); - // Look up the symbol variant if used. 
- std::pair<StringRef, StringRef> Split = IDVal.split('@'); - MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - if (Split.first.size() != IDVal.size()) { - Variant = MCSymbolRefExpr::getVariantKindForName(Split.second); - if (Variant == MCSymbolRefExpr::VK_Invalid) - return TokError("invalid variant '" + Split.second + "'"); - IDVal = Split.first; - } - if (IDVal == "f" || IDVal == "b") { - MCSymbol *Sym = - Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b"); - Res = MCSymbolRefExpr::create(Sym, Variant, getContext()); - if (IDVal == "b" && Sym->isUndefined()) - return Error(Loc, "directional label undefined"); - DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym)); - EndLoc = Lexer.getTok().getEndLoc(); - Lex(); // Eat identifier. - } - } return false; } case AsmToken::String: { @@ -2042,6 +2028,9 @@ bool MasmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, .CaseLower("and", AsmToken::Amp) .CaseLower("not", AsmToken::Exclaim) .CaseLower("or", AsmToken::Pipe) + .CaseLower("xor", AsmToken::Caret) + .CaseLower("shl", AsmToken::LessLess) + .CaseLower("shr", AsmToken::GreaterGreater) .CaseLower("eq", AsmToken::EqualEqual) .CaseLower("ne", AsmToken::ExclaimEqual) .CaseLower("lt", AsmToken::Less) @@ -2110,29 +2099,9 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, AsmToken ID = getTok(); SMLoc IDLoc = ID.getLoc(); StringRef IDVal; - int64_t LocalLabelVal = -1; if (Lexer.is(AsmToken::HashDirective)) return parseCppHashLineFilenameComment(IDLoc); - // Allow an integer followed by a ':' as a directional local label. - if (Lexer.is(AsmToken::Integer)) { - LocalLabelVal = getTok().getIntVal(); - if (LocalLabelVal < 0) { - if (!TheCondState.Ignore) { - Lex(); // always eat a token - return Error(IDLoc, "unexpected token at start of statement"); - } - IDVal = ""; - } else { - IDVal = getTok().getString(); - Lex(); // Consume the integer token to be used as an identifier token. 
- if (Lexer.getKind() != AsmToken::Colon) { - if (!TheCondState.Ignore) { - Lex(); // always eat a token - return Error(IDLoc, "unexpected token at start of statement"); - } - } - } - } else if (Lexer.is(AsmToken::Dot)) { + if (Lexer.is(AsmToken::Dot)) { // Treat '.' as a valid identifier in this context. Lex(); IDVal = "."; @@ -2257,19 +2226,22 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, // FIXME: This doesn't diagnose assignment to a symbol which has been // implicitly marked as external. MCSymbol *Sym; - if (LocalLabelVal == -1) { - if (ParsingMSInlineAsm && SI) { - StringRef RewrittenLabel = - SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true); - assert(!RewrittenLabel.empty() && - "We should have an internal name here."); - Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(), - RewrittenLabel); - IDVal = RewrittenLabel; - } + if (ParsingMSInlineAsm && SI) { + StringRef RewrittenLabel = + SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true); + assert(!RewrittenLabel.empty() && + "We should have an internal name here."); + Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(), + RewrittenLabel); + IDVal = RewrittenLabel; + } + // Handle directional local labels + if (IDVal == "@@") { + Sym = Ctx.createDirectionalLocalSymbol(0); + } else { Sym = getContext().getOrCreateSymbol(IDVal); - } else - Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal); + } + // End of Labels should be treated as end of line for lexing // purposes but that information is not available to the Lexer who // does not understand Labels. This may cause us to see a Hash @@ -4241,7 +4213,7 @@ bool MasmParser::parseStructInitializer(const StructInfo &Structure, size_t FieldIndex = 0; if (EndToken) { // Initialize all fields with given initializers. 
- while (getTok().isNot(EndToken.getValue()) && + while (getTok().isNot(EndToken.value()) && FieldIndex < Structure.Fields.size()) { const FieldInfo &Field = Structure.Fields[FieldIndex++]; if (parseOptionalToken(AsmToken::Comma)) { @@ -4273,10 +4245,10 @@ bool MasmParser::parseStructInitializer(const StructInfo &Structure, } if (EndToken) { - if (EndToken.getValue() == AsmToken::Greater) + if (EndToken.value() == AsmToken::Greater) return parseAngleBracketClose(); - return parseToken(EndToken.getValue()); + return parseToken(EndToken.value()); } return false; diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp index 98eb7eada064..71c8e6f02f8e 100644 --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -96,10 +96,10 @@ MCSchedModel::getReciprocalThroughput(const MCSubtargetInfo &STI, continue; unsigned NumUnits = SM.getProcResource(I->ProcResourceIdx)->NumUnits; double Temp = NumUnits * 1.0 / I->Cycles; - Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp; + Throughput = Throughput ? std::min(Throughput.value(), Temp) : Temp; } if (Throughput) - return 1.0 / Throughput.getValue(); + return 1.0 / Throughput.value(); // If no throughput value was calculated, assume that we can execute at the // maximum issue width scaled by number of micro-ops for the schedule class. @@ -140,10 +140,10 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass, if (!I->getCycles()) continue; double Temp = countPopulation(I->getUnits()) * 1.0 / I->getCycles(); - Throughput = Throughput ? std::min(Throughput.getValue(), Temp) : Temp; + Throughput = Throughput ? std::min(Throughput.value(), Temp) : Temp; } if (Throughput) - return 1.0 / Throughput.getValue(); + return 1.0 / Throughput.value(); // If there are no execution resources specified for this class, then assume // that it can execute at the maximum default issue width. 
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp index 27dc1826819b..077cee132338 100644 --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -167,6 +167,8 @@ void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, OS << "llvm_bb_addr_map"; else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP_V0) OS << "llvm_bb_addr_map_v0"; + else if (Type == ELF::SHT_LLVM_OFFLOADING) + OS << "llvm_offloading"; else report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) + " for section " + getName()); diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp index ee8fa04c421f..9a35ac69c47c 100644 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ b/llvm/lib/MC/MCSectionXCOFF.cpp @@ -110,8 +110,8 @@ void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, // XCOFF debug sections. if (getKind().isMetadata() && isDwarfSect()) { - OS << "\n\t.dwsect " - << format("0x%" PRIx32, getDwarfSubtypeFlags().getValue()) << '\n'; + OS << "\n\t.dwsect " << format("0x%" PRIx32, getDwarfSubtypeFlags().value()) + << '\n'; OS << MAI.getPrivateLabelPrefix() << getName() << ':' << '\n'; return; } diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp index 9d8883a15c0b..77321829e614 100644 --- a/llvm/lib/ObjCopy/ConfigManager.cpp +++ b/llvm/lib/ObjCopy/ConfigManager.cpp @@ -20,9 +20,9 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const { !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripDWO || - Common.StripNonAlloc || Common.StripSections || Common.Weaken || - Common.DecompressDebugSections || + !Common.SetSectionType.empty() || Common.ExtractDWO || + Common.PreserveDates || Common.StripDWO || Common.StripNonAlloc || + 
Common.StripSections || Common.Weaken || Common.DecompressDebugSections || Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty()) return createStringError(llvm::errc::invalid_argument, "option is not supported for COFF"); @@ -38,9 +38,10 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const { !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || !Common.UnneededSymbolsToRemove.empty() || !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU || - Common.StripDWO || Common.StripNonAlloc || Common.StripSections || - Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded || + !Common.SetSectionType.empty() || Common.ExtractDWO || + Common.PreserveDates || Common.StripAllGNU || Common.StripDWO || + Common.StripNonAlloc || Common.StripSections || Common.Weaken || + Common.DecompressDebugSections || Common.StripUnneeded || Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty()) return createStringError(llvm::errc::invalid_argument, "option is not supported for MachO"); @@ -58,7 +59,8 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const { !Common.UnneededSymbolsToRemove.empty() || !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty()) + !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || + !Common.SymbolsToRename.empty()) return createStringError(llvm::errc::invalid_argument, "only flags for section dumping, removal, and " "addition are supported"); @@ -79,12 +81,12 @@ Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const { !Common.UnneededSymbolsToRemove.empty() || !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || 
!Common.SetSectionAlignment.empty() || - !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty() || - Common.ExtractDWO || Common.ExtractMainPartition || - Common.OnlyKeepDebug || Common.PreserveDates || Common.StripAllGNU || - Common.StripDWO || Common.StripDebug || Common.StripNonAlloc || - Common.StripSections || Common.Weaken || Common.StripUnneeded || - Common.DecompressDebugSections) { + !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() || + !Common.SymbolsToRename.empty() || Common.ExtractDWO || + Common.ExtractMainPartition || Common.OnlyKeepDebug || + Common.PreserveDates || Common.StripAllGNU || Common.StripDWO || + Common.StripDebug || Common.StripNonAlloc || Common.StripSections || + Common.Weaken || Common.StripUnneeded || Common.DecompressDebugSections) { return createStringError( llvm::errc::invalid_argument, "no flags are supported yet, only basic copying is allowed"); diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index 2d388f8a867e..781be3d8aeb1 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -600,8 +600,8 @@ handleUserSection(const NewSectionInfo &NewSection, static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, Object &Obj) { if (Config.OutputArch) { - Obj.Machine = Config.OutputArch.getValue().EMachine; - Obj.OSABI = Config.OutputArch.getValue().OSABI; + Obj.Machine = Config.OutputArch.value().EMachine; + Obj.OSABI = Config.OutputArch.value().OSABI; } if (!Config.SplitDWO.empty() && Config.ExtractDWO) { @@ -629,6 +629,66 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj)) return E; + if (!Config.SetSectionAlignment.empty()) { + for (SectionBase &Sec : Obj.sections()) { + auto I = Config.SetSectionAlignment.find(Sec.Name); + if (I != Config.SetSectionAlignment.end()) + Sec.Align = I->second; + } + } + + if 
(Config.OnlyKeepDebug) + for (auto &Sec : Obj.sections()) + if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE) + Sec.Type = SHT_NOBITS; + + for (const NewSectionInfo &AddedSection : Config.AddSection) { + auto AddSection = [&](StringRef Name, ArrayRef<uint8_t> Data) { + OwnedDataSection &NewSection = + Obj.addSection<OwnedDataSection>(Name, Data); + if (Name.startswith(".note") && Name != ".note.GNU-stack") + NewSection.Type = SHT_NOTE; + return Error::success(); + }; + if (Error E = handleUserSection(AddedSection, AddSection)) + return E; + } + + for (const NewSectionInfo &NewSection : Config.UpdateSection) { + auto UpdateSection = [&](StringRef Name, ArrayRef<uint8_t> Data) { + return Obj.updateSection(Name, Data); + }; + if (Error E = handleUserSection(NewSection, UpdateSection)) + return E; + } + + if (!Config.AddGnuDebugLink.empty()) + Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink, + Config.GnuDebugLinkCRC32); + + // If the symbol table was previously removed, we need to create a new one + // before adding new symbols. + if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty()) + if (Error E = Obj.addNewSymbolTable()) + return E; + + for (const NewSymbolInfo &SI : Config.SymbolsToAdd) + addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility); + + // --set-section-{flags,type} work with sections added by --add-section. 
+ if (!Config.SetSectionFlags.empty() || !Config.SetSectionType.empty()) { + for (auto &Sec : Obj.sections()) { + const auto Iter = Config.SetSectionFlags.find(Sec.Name); + if (Iter != Config.SetSectionFlags.end()) { + const SectionFlagsUpdate &SFU = Iter->second; + setSectionFlagsAndType(Sec, SFU.NewFlags); + } + auto It2 = Config.SetSectionType.find(Sec.Name); + if (It2 != Config.SetSectionType.end()) + Sec.Type = It2->second; + } + } + if (!Config.SectionsToRename.empty()) { std::vector<RelocationSectionBase *> RelocSections; DenseSet<SectionBase *> RenamedSections; @@ -639,7 +699,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, const SectionRename &SR = Iter->second; Sec.Name = std::string(SR.NewName); if (SR.NewFlags) - setSectionFlagsAndType(Sec, SR.NewFlags.getValue()); + setSectionFlagsAndType(Sec, SR.NewFlags.value()); RenamedSections.insert(&Sec); } else if (RelocSec && !(Sec.Flags & SHF_ALLOC)) // Postpone processing relocation sections which are not specified in @@ -693,63 +753,6 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, } } - if (!Config.SetSectionAlignment.empty()) { - for (SectionBase &Sec : Obj.sections()) { - auto I = Config.SetSectionAlignment.find(Sec.Name); - if (I != Config.SetSectionAlignment.end()) - Sec.Align = I->second; - } - } - - if (Config.OnlyKeepDebug) - for (auto &Sec : Obj.sections()) - if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE) - Sec.Type = SHT_NOBITS; - - for (const NewSectionInfo &AddedSection : Config.AddSection) { - auto AddSection = [&](StringRef Name, ArrayRef<uint8_t> Data) { - OwnedDataSection &NewSection = - Obj.addSection<OwnedDataSection>(Name, Data); - if (Name.startswith(".note") && Name != ".note.GNU-stack") - NewSection.Type = SHT_NOTE; - return Error::success(); - }; - if (Error E = handleUserSection(AddedSection, AddSection)) - return E; - } - - for (const NewSectionInfo &NewSection : Config.UpdateSection) { - auto UpdateSection = 
[&](StringRef Name, ArrayRef<uint8_t> Data) { - return Obj.updateSection(Name, Data); - }; - if (Error E = handleUserSection(NewSection, UpdateSection)) - return E; - } - - if (!Config.AddGnuDebugLink.empty()) - Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink, - Config.GnuDebugLinkCRC32); - - // If the symbol table was previously removed, we need to create a new one - // before adding new symbols. - if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty()) - if (Error E = Obj.addNewSymbolTable()) - return E; - - for (const NewSymbolInfo &SI : Config.SymbolsToAdd) - addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility); - - // --set-section-flags works with sections added by --add-section. - if (!Config.SetSectionFlags.empty()) { - for (auto &Sec : Obj.sections()) { - const auto Iter = Config.SetSectionFlags.find(Sec.Name); - if (Iter != Config.SetSectionFlags.end()) { - const SectionFlagsUpdate &SFU = Iter->second; - setSectionFlagsAndType(Sec, SFU.NewFlags); - } - } - } - if (ELFConfig.EntryExpr) Obj.Entry = ELFConfig.EntryExpr(Obj.Entry); return Error::success(); @@ -808,7 +811,7 @@ Error objcopy::elf::executeObjcopyOnBinary(const CommonConfig &Config, return Obj.takeError(); // Prefer OutputArch (-O<format>) if set, otherwise infer it from the input. const ElfType OutputElfType = - Config.OutputArch ? getOutputElfType(Config.OutputArch.getValue()) + Config.OutputArch ? getOutputElfType(Config.OutputArch.value()) : getOutputElfType(In); if (Error E = handleArgs(Config, ELFConfig, **Obj)) diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp index b241bd817ff5..f0e4f91cd347 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp @@ -463,13 +463,12 @@ Error ELFSectionWriter<ELFT>::visit(const DecompressedSection &Sec) { ? 
(ZlibGnuMagic.size() + sizeof(Sec.Size)) : sizeof(Elf_Chdr_Impl<ELFT>); - StringRef CompressedContent( - reinterpret_cast<const char *>(Sec.OriginalData.data()) + DataOffset, - Sec.OriginalData.size() - DataOffset); - - SmallVector<char, 128> DecompressedContent; - if (Error Err = zlib::uncompress(CompressedContent, DecompressedContent, - static_cast<size_t>(Sec.Size))) + ArrayRef<uint8_t> CompressedContent(Sec.OriginalData.data() + DataOffset, + Sec.OriginalData.size() - DataOffset); + SmallVector<uint8_t, 128> DecompressedContent; + if (Error Err = + compression::zlib::uncompress(CompressedContent, DecompressedContent, + static_cast<size_t>(Sec.Size))) return createStringError(errc::invalid_argument, "'" + Sec.Name + "': " + toString(std::move(Err))); @@ -544,9 +543,7 @@ CompressedSection::CompressedSection(const SectionBase &Sec, DebugCompressionType CompressionType) : SectionBase(Sec), CompressionType(CompressionType), DecompressedSize(Sec.OriginalData.size()), DecompressedAlign(Sec.Align) { - zlib::compress(StringRef(reinterpret_cast<const char *>(OriginalData.data()), - OriginalData.size()), - CompressedData); + compression::zlib::compress(OriginalData, CompressedData); assert(CompressionType != DebugCompressionType::None); Flags |= ELF::SHF_COMPRESSED; @@ -2643,9 +2640,12 @@ Error BinaryWriter::finalize() { // MinAddr will be skipped. uint64_t MinAddr = UINT64_MAX; for (SectionBase &Sec : Obj.allocSections()) { + // If Sec's type is changed from SHT_NOBITS due to --set-section-flags, + // Offset may not be aligned. Align it to max(Align, 1). 
if (Sec.ParentSegment != nullptr) - Sec.Addr = - Sec.Offset - Sec.ParentSegment->Offset + Sec.ParentSegment->PAddr; + Sec.Addr = alignTo(Sec.Offset - Sec.ParentSegment->Offset + + Sec.ParentSegment->PAddr, + std::max(Sec.Align, uint64_t(1))); if (Sec.Type != SHT_NOBITS && Sec.Size > 0) MinAddr = std::min(MinAddr, Sec.Addr); } diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index f33bbb029c9b..799db5034532 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -539,7 +539,7 @@ class CompressedSection : public SectionBase { DebugCompressionType CompressionType; uint64_t DecompressedSize; uint64_t DecompressedAlign; - SmallVector<char, 128> CompressedData; + SmallVector<uint8_t, 128> CompressedData; public: CompressedSection(const SectionBase &Sec, diff --git a/llvm/lib/Object/Decompressor.cpp b/llvm/lib/Object/Decompressor.cpp index de067ed59ac5..a6a28a0589ac 100644 --- a/llvm/lib/Object/Decompressor.cpp +++ b/llvm/lib/Object/Decompressor.cpp @@ -19,7 +19,7 @@ using namespace object; Expected<Decompressor> Decompressor::create(StringRef Name, StringRef Data, bool IsLE, bool Is64Bit) { - if (!zlib::isAvailable()) + if (!compression::zlib::isAvailable()) return createError("zlib is not available"); Decompressor D(Data); @@ -92,7 +92,8 @@ bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) { return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name); } -Error Decompressor::decompress(MutableArrayRef<char> Buffer) { +Error Decompressor::decompress(MutableArrayRef<uint8_t> Buffer) { size_t Size = Buffer.size(); - return zlib::uncompress(SectionData, Buffer.data(), Size); + return compression::zlib::uncompress(arrayRefFromStringRef(SectionData), + Buffer.data(), Size); } diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6acf4543be5a..0d5aa91c1348 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -297,6 +297,7 @@ StringRef 
llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_PART_PHDR); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_BB_ADDR_MAP_V0); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_BB_ADDR_MAP); + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_OFFLOADING); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef); diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 38de669f1d3d..1f342e55e77f 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -168,11 +168,11 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Optional<unsigned> Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch); if (Attr) - isV7 = Attr.getValue() == ARMBuildAttrs::v7; + isV7 = Attr.value() == ARMBuildAttrs::v7; Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { case ARMBuildAttrs::ApplicationProfile: Features.AddFeature("aclass"); break; @@ -191,7 +191,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Attr = Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { default: break; case ARMBuildAttrs::Not_Allowed: @@ -206,7 +206,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Attr = Attributes.getAttributeValue(ARMBuildAttrs::FP_arch); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { default: break; case ARMBuildAttrs::Not_Allowed: @@ -230,7 +230,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Attr = Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { default: break; case ARMBuildAttrs::Not_Allowed: @@ -249,7 +249,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Attr = 
Attributes.getAttributeValue(ARMBuildAttrs::MVE_arch); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { default: break; case ARMBuildAttrs::Not_Allowed: @@ -268,7 +268,7 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const { Attr = Attributes.getAttributeValue(ARMBuildAttrs::DIV_use); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { default: break; case ARMBuildAttrs::DisallowDIV: @@ -524,7 +524,7 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { Optional<unsigned> Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch); if (Attr) { - switch (Attr.getValue()) { + switch (Attr.value()) { case ARMBuildAttrs::v4: Triple += "v4"; break; @@ -556,7 +556,7 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { Optional<unsigned> ArchProfileAttr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile); if (ArchProfileAttr && - ArchProfileAttr.getValue() == ARMBuildAttrs::MicroControllerProfile) + ArchProfileAttr.value() == ARMBuildAttrs::MicroControllerProfile) Triple += "v7m"; else Triple += "v7"; diff --git a/llvm/lib/Object/Error.cpp b/llvm/lib/Object/Error.cpp index 6d1e3f2a59d0..62cb51ca09e4 100644 --- a/llvm/lib/Object/Error.cpp +++ b/llvm/lib/Object/Error.cpp @@ -13,7 +13,6 @@ #include "llvm/Object/Error.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; using namespace object; @@ -75,10 +74,9 @@ void GenericBinaryError::log(raw_ostream &OS) const { OS << Msg; } -static ManagedStatic<_object_error_category> error_category; - const std::error_category &object::object_category() { - return *error_category; + static _object_error_category error_category; + return error_category; } llvm::Error llvm::object::isNotObjectErrorInvalidFileType(llvm::Error Err) { diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 9834b036de90..60870bbb801f 100644 --- 
a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -133,17 +133,17 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { // Compute the optional fields if needed... if (P.Program->DXILOffset) - Header.Bitcode.Offset = P.Program->DXILOffset.getValue(); + Header.Bitcode.Offset = P.Program->DXILOffset.value(); else Header.Bitcode.Offset = sizeof(dxbc::BitcodeHeader); if (P.Program->DXILSize) - Header.Bitcode.Size = P.Program->DXILSize.getValue(); + Header.Bitcode.Size = P.Program->DXILSize.value(); else Header.Bitcode.Size = P.Program->DXIL ? P.Program->DXIL->size() : 0; if (P.Program->Size) - Header.Size = P.Program->Size.getValue(); + Header.Size = P.Program->Size.value(); else Header.Size = sizeof(dxbc::ProgramHeader) + Header.Bitcode.Size; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index cdd180cdc15d..b778006cf66e 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -656,6 +656,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( ECase(SHT_LLVM_PART_PHDR); ECase(SHT_LLVM_BB_ADDR_MAP_V0); ECase(SHT_LLVM_BB_ADDR_MAP); + ECase(SHT_LLVM_OFFLOADING); ECase(SHT_GNU_ATTRIBUTES); ECase(SHT_GNU_HASH); ECase(SHT_GNU_verdef); diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index a5345172aae1..593243144f01 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1728,8 +1728,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // in ICP (which is performed earlier than this in the regular LTO pipeline). MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); - // Enable splitting late in the FullLTO post-link pipeline. This is done in - // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). + // Enable splitting late in the FullLTO post-link pipeline. 
if (EnableHotColdSplit) MPM.addPass(HotColdSplittingPass()); diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index ab9f8bf9c957..bad8184dffcf 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -901,10 +901,11 @@ bool OptNoneInstrumentation::shouldRun(StringRef PassID, Any IR) { void OptBisectInstrumentation::registerCallbacks( PassInstrumentationCallbacks &PIC) { - if (!OptBisector->isEnabled()) + if (!getOptBisector().isEnabled()) return; PIC.registerShouldRunOptionalPassCallback([](StringRef PassID, Any IR) { - return isIgnored(PassID) || OptBisector->checkPass(PassID, getIRName(IR)); + return isIgnored(PassID) || + getOptBisector().checkPass(PassID, getIRName(IR)); }); } diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index f9e58fd6afa5..f4f13bafb233 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -897,10 +896,9 @@ std::string CoverageMapError::message() const { return getCoverageMapErrString(Err); } -static ManagedStatic<CoverageMappingErrorCategoryType> ErrorCategory; - const std::error_category &llvm::coverage::coveragemap_category() { - return *ErrorCategory; + static CoverageMappingErrorCategoryType ErrorCategory; + return ErrorCategory; } char CoverageMapError::ID = 0; diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index 1a187795a8a0..552140a52ad4 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ 
b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -119,26 +119,26 @@ Error RawCoverageFilenamesReader::read(CovMapVersion Version) { return Err; if (CompressedLen > 0) { - if (!zlib::isAvailable()) + if (!compression::zlib::isAvailable()) return make_error<CoverageMapError>( coveragemap_error::decompression_failed); // Allocate memory for the decompressed filenames. - SmallVector<char, 0> StorageBuf; + SmallVector<uint8_t, 0> StorageBuf; // Read compressed filenames. StringRef CompressedFilenames = Data.substr(0, CompressedLen); Data = Data.substr(CompressedLen); - auto Err = - zlib::uncompress(CompressedFilenames, StorageBuf, UncompressedLen); + auto Err = compression::zlib::uncompress( + arrayRefFromStringRef(CompressedFilenames), StorageBuf, + UncompressedLen); if (Err) { consumeError(std::move(Err)); return make_error<CoverageMapError>( coveragemap_error::decompression_failed); } - StringRef UncompressedFilenames(StorageBuf.data(), StorageBuf.size()); - RawCoverageFilenamesReader Delegate(UncompressedFilenames, Filenames, + RawCoverageFilenamesReader Delegate(toStringRef(StorageBuf), Filenames, CompilationDir); return Delegate.readUncompressed(Version, NumFilenames); } diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 781a2901dbb9..db9be34d5248 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -46,11 +46,13 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS, bool Compress) { } } - SmallString<128> CompressedStr; - bool doCompression = - Compress && zlib::isAvailable() && DoInstrProfNameCompression; + SmallVector<uint8_t, 128> CompressedStr; + bool doCompression = Compress && compression::zlib::isAvailable() && + DoInstrProfNameCompression; if (doCompression) - zlib::compress(FilenamesStr, CompressedStr, zlib::BestSizeCompression); + 
compression::zlib::compress(arrayRefFromStringRef(FilenamesStr), + CompressedStr, + compression::zlib::BestSizeCompression); // ::= <num-filenames> // <uncompressed-len> @@ -59,7 +61,7 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS, bool Compress) { encodeULEB128(Filenames.size(), OS); encodeULEB128(FilenamesStr.size(), OS); encodeULEB128(doCompression ? CompressedStr.size() : 0U, OS); - OS << (doCompression ? CompressedStr.str() : StringRef(FilenamesStr)); + OS << (doCompression ? toStringRef(CompressedStr) : StringRef(FilenamesStr)); } namespace { diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 48ac5ce0d607..f8d7c4d36481 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/SwapByteOrder.h" @@ -177,10 +176,9 @@ class InstrProfErrorCategoryType : public std::error_category { } // end anonymous namespace -static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory; - const std::error_category &llvm::instrprof_category() { - return *ErrorCategory; + static InstrProfErrorCategoryType ErrorCategory; + return ErrorCategory; } namespace { @@ -466,12 +464,13 @@ Error collectPGOFuncNameStrings(ArrayRef<std::string> NameStrs, return WriteStringToResult(0, UncompressedNameStrings); } - SmallString<128> CompressedNameStrings; - zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings, - zlib::BestSizeCompression); + SmallVector<uint8_t, 128> CompressedNameStrings; + compression::zlib::compress(arrayRefFromStringRef(UncompressedNameStrings), + CompressedNameStrings, + compression::zlib::BestSizeCompression); return WriteStringToResult(CompressedNameStrings.size(), - CompressedNameStrings); + 
toStringRef(CompressedNameStrings)); } StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar) { @@ -488,7 +487,7 @@ Error collectPGOFuncNameStrings(ArrayRef<GlobalVariable *> NameVars, NameStrs.push_back(std::string(getPGOFuncNameVarInitializer(NameVar))); } return collectPGOFuncNameStrings( - NameStrs, zlib::isAvailable() && doCompression, Result); + NameStrs, compression::zlib::isAvailable() && doCompression, Result); } Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { @@ -501,23 +500,20 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { uint64_t CompressedSize = decodeULEB128(P, &N); P += N; bool isCompressed = (CompressedSize != 0); - SmallString<128> UncompressedNameStrings; + SmallVector<uint8_t, 128> UncompressedNameStrings; StringRef NameStrings; if (isCompressed) { - if (!llvm::zlib::isAvailable()) + if (!llvm::compression::zlib::isAvailable()) return make_error<InstrProfError>(instrprof_error::zlib_unavailable); - StringRef CompressedNameStrings(reinterpret_cast<const char *>(P), - CompressedSize); - if (Error E = - zlib::uncompress(CompressedNameStrings, UncompressedNameStrings, - UncompressedSize)) { + if (Error E = compression::zlib::uncompress( + makeArrayRef(P, CompressedSize), UncompressedNameStrings, + UncompressedSize)) { consumeError(std::move(E)); return make_error<InstrProfError>(instrprof_error::uncompress_failed); } P += CompressedSize; - NameStrings = StringRef(UncompressedNameStrings.data(), - UncompressedNameStrings.size()); + NameStrings = toStringRef(UncompressedNameStrings); } else { NameStrings = StringRef(reinterpret_cast<const char *>(P), UncompressedSize); diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index f794e64a13e7..b4d5550a1721 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -20,7 +20,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/raw_ostream.h" #include <string> #include <system_error> @@ -98,10 +97,9 @@ class SampleProfErrorCategoryType : public std::error_category { } // end anonymous namespace -static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory; - const std::error_category &llvm::sampleprof_category() { - return *ErrorCategory; + static SampleProfErrorCategoryType ErrorCategory; + return ErrorCategory; } void LineLocation::print(raw_ostream &OS) const { diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 280e3c6cb8d1..204e34bff879 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -877,15 +877,13 @@ std::error_code SampleProfileReaderExtBinaryBase::decompressSection( if (std::error_code EC = CompressSize.getError()) return EC; - if (!llvm::zlib::isAvailable()) + if (!llvm::compression::zlib::isAvailable()) return sampleprof_error::zlib_unavailable; - StringRef CompressedStrings(reinterpret_cast<const char *>(Data), - *CompressSize); - char *Buffer = Allocator.Allocate<char>(DecompressBufSize); + uint8_t *Buffer = Allocator.Allocate<uint8_t>(DecompressBufSize); size_t UCSize = DecompressBufSize; - llvm::Error E = - zlib::uncompress(CompressedStrings, Buffer, UCSize); + llvm::Error E = compression::zlib::uncompress( + makeArrayRef(Data, *CompressSize), Buffer, UCSize); if (E) return sampleprof_error::uncompress_failed; DecompressBuf = reinterpret_cast<const uint8_t *>(Buffer); diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 8ec6b7ebc29e..093790afe2d6 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -78,19 +78,20 @@ SampleProfileWriterExtBinaryBase::markSectionStart(SecType Type, } std::error_code SampleProfileWriterExtBinaryBase::compressAndOutput() { - if 
(!llvm::zlib::isAvailable()) + if (!llvm::compression::zlib::isAvailable()) return sampleprof_error::zlib_unavailable; std::string &UncompressedStrings = static_cast<raw_string_ostream *>(LocalBufStream.get())->str(); if (UncompressedStrings.size() == 0) return sampleprof_error::success; auto &OS = *OutputStream; - SmallString<128> CompressedStrings; - zlib::compress(UncompressedStrings, CompressedStrings, - zlib::BestSizeCompression); + SmallVector<uint8_t, 128> CompressedStrings; + compression::zlib::compress(arrayRefFromStringRef(UncompressedStrings), + CompressedStrings, + compression::zlib::BestSizeCompression); encodeULEB128(UncompressedStrings.size(), OS); encodeULEB128(CompressedStrings.size(), OS); - OS << CompressedStrings.str(); + OS << toStringRef(CompressedStrings); UncompressedStrings.clear(); return sampleprof_error::success; } diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index eb6c04d987b3..e3df172ef113 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -1862,8 +1862,10 @@ void basic_parser_impl::printOptionInfo(const Option &O, outs() << " <" << getValueStr(O, ValName) << ">..."; } else if (O.getValueExpectedFlag() == ValueOptional) outs() << "[=<" << getValueStr(O, ValName) << ">]"; - else - outs() << "=<" << getValueStr(O, ValName) << '>'; + else { + outs() << (O.ArgStr.size() == 1 ? 
" <" : "=<") << getValueStr(O, ValName) + << '>'; + } } Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O)); diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp index 983a6348bbe4..21191972fb8b 100644 --- a/llvm/lib/Support/Compression.cpp +++ b/llvm/lib/Support/Compression.cpp @@ -22,11 +22,9 @@ #endif using namespace llvm; +using namespace llvm::compression; #if LLVM_ENABLE_ZLIB -static Error createError(StringRef Err) { - return make_error<StringError>(Err, inconvertibleErrorCode()); -} static StringRef convertZlibCodeToString(int Code) { switch (Code) { @@ -46,63 +44,59 @@ static StringRef convertZlibCodeToString(int Code) { bool zlib::isAvailable() { return true; } -void zlib::compress(StringRef InputBuffer, - SmallVectorImpl<char> &CompressedBuffer, int Level) { - unsigned long CompressedSize = ::compressBound(InputBuffer.size()); +void zlib::compress(ArrayRef<uint8_t> Input, + SmallVectorImpl<uint8_t> &CompressedBuffer, int Level) { + unsigned long CompressedSize = ::compressBound(Input.size()); CompressedBuffer.resize_for_overwrite(CompressedSize); - int Res = - ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize, - (const Bytef *)InputBuffer.data(), InputBuffer.size(), Level); + int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize, + (const Bytef *)Input.data(), Input.size(), Level); if (Res == Z_MEM_ERROR) report_bad_alloc_error("Allocation failed"); assert(Res == Z_OK); // Tell MemorySanitizer that zlib output buffer is fully initialized. // This avoids a false report when running LLVM with uninstrumented ZLib. 
__msan_unpoison(CompressedBuffer.data(), CompressedSize); - CompressedBuffer.truncate(CompressedSize); + if (CompressedSize < CompressedBuffer.size()) + CompressedBuffer.truncate(CompressedSize); } -Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, +Error zlib::uncompress(ArrayRef<uint8_t> Input, uint8_t *UncompressedBuffer, size_t &UncompressedSize) { int Res = ::uncompress((Bytef *)UncompressedBuffer, (uLongf *)&UncompressedSize, - (const Bytef *)InputBuffer.data(), InputBuffer.size()); + (const Bytef *)Input.data(), Input.size()); // Tell MemorySanitizer that zlib output buffer is fully initialized. // This avoids a false report when running LLVM with uninstrumented ZLib. __msan_unpoison(UncompressedBuffer, UncompressedSize); - return Res ? createError(convertZlibCodeToString(Res)) : Error::success(); + return Res ? make_error<StringError>(convertZlibCodeToString(Res), + inconvertibleErrorCode()) + : Error::success(); } -Error zlib::uncompress(StringRef InputBuffer, - SmallVectorImpl<char> &UncompressedBuffer, +Error zlib::uncompress(ArrayRef<uint8_t> Input, + SmallVectorImpl<uint8_t> &UncompressedBuffer, size_t UncompressedSize) { UncompressedBuffer.resize_for_overwrite(UncompressedSize); Error E = - uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize); - UncompressedBuffer.truncate(UncompressedSize); + zlib::uncompress(Input, UncompressedBuffer.data(), UncompressedSize); + if (UncompressedSize < UncompressedBuffer.size()) + UncompressedBuffer.truncate(UncompressedSize); return E; } -uint32_t zlib::crc32(StringRef Buffer) { - return ::crc32(0, (const Bytef *)Buffer.data(), Buffer.size()); -} - #else bool zlib::isAvailable() { return false; } -void zlib::compress(StringRef InputBuffer, - SmallVectorImpl<char> &CompressedBuffer, int Level) { +void zlib::compress(ArrayRef<uint8_t> Input, + SmallVectorImpl<uint8_t> &CompressedBuffer, int Level) { llvm_unreachable("zlib::compress is unavailable"); } -Error 
zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, +Error zlib::uncompress(ArrayRef<uint8_t> Input, uint8_t *UncompressedBuffer, size_t &UncompressedSize) { llvm_unreachable("zlib::uncompress is unavailable"); } -Error zlib::uncompress(StringRef InputBuffer, - SmallVectorImpl<char> &UncompressedBuffer, +Error zlib::uncompress(ArrayRef<uint8_t> Input, + SmallVectorImpl<uint8_t> &UncompressedBuffer, size_t UncompressedSize) { llvm_unreachable("zlib::uncompress is unavailable"); } -uint32_t zlib::crc32(StringRef Buffer) { - llvm_unreachable("zlib::crc32 is unavailable"); -} #endif diff --git a/llvm/lib/Support/ConvertUTF.cpp b/llvm/lib/Support/ConvertUTF.cpp index e24a918c5c89..5436f557b993 100644 --- a/llvm/lib/Support/ConvertUTF.cpp +++ b/llvm/lib/Support/ConvertUTF.cpp @@ -417,6 +417,16 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { return isLegalUTF8(source, length); } +/* + * Exported function to return the size of the first utf-8 code unit sequence, + * Or 0 if the sequence is not valid; + */ +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? 
length + : 0; +} + /* --------------------------------------------------------------------- */ static unsigned diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp index 8bfc8ee7a8cc..fbe86f2b59e1 100644 --- a/llvm/lib/Support/Error.cpp +++ b/llvm/lib/Support/Error.cpp @@ -9,7 +9,6 @@ #include "llvm/Support/Error.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" #include <system_error> using namespace llvm; @@ -46,7 +45,10 @@ namespace { } -static ManagedStatic<ErrorErrorCategory> ErrorErrorCat; +ErrorErrorCategory &getErrorErrorCat() { + static ErrorErrorCategory ErrorErrorCat; + return ErrorErrorCat; +} namespace llvm { @@ -71,19 +73,19 @@ void logAllUnhandledErrors(Error E, raw_ostream &OS, Twine ErrorBanner) { std::error_code ErrorList::convertToErrorCode() const { return std::error_code(static_cast<int>(ErrorErrorCode::MultipleErrors), - *ErrorErrorCat); + getErrorErrorCat()); } std::error_code inconvertibleErrorCode() { return std::error_code(static_cast<int>(ErrorErrorCode::InconvertibleError), - *ErrorErrorCat); + getErrorErrorCat()); } std::error_code FileError::convertToErrorCode() const { std::error_code NestedEC = Err->convertToErrorCode(); if (NestedEC == inconvertibleErrorCode()) return std::error_code(static_cast<int>(ErrorErrorCode::FileError), - *ErrorErrorCat); + getErrorErrorCat()); return NestedEC; } diff --git a/llvm/lib/Support/Process.cpp b/llvm/lib/Support/Process.cpp index cf3962ae927b..5476becc2945 100644 --- a/llvm/lib/Support/Process.cpp +++ b/llvm/lib/Support/Process.cpp @@ -47,7 +47,7 @@ Optional<std::string> Process::FindInEnvPath(StringRef EnvName, const char EnvPathSeparatorStr[] = {Separator, '\0'}; SmallVector<StringRef, 8> Dirs; - SplitString(OptPath.getValue(), Dirs, EnvPathSeparatorStr); + SplitString(OptPath.value(), Dirs, EnvPathSeparatorStr); for (StringRef Dir : Dirs) { if (Dir.empty()) diff --git a/llvm/lib/Support/Unicode.cpp 
b/llvm/lib/Support/Unicode.cpp index 103710303094..d4d7e75b739d 100644 --- a/llvm/lib/Support/Unicode.cpp +++ b/llvm/lib/Support/Unicode.cpp @@ -269,7 +269,7 @@ bool isPrintable(int UCS) { } /// Unicode code points of the Cf category are considered -/// fornatting characters. +/// formatting characters. bool isFormatting(int UCS) { // https://unicode.org/Public/14.0.0/ucdxml/ diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 3c2d118977c5..c1959b5cc2ae 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -14,7 +14,6 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" -#include "llvm/Support/ManagedStatic.h" #include <mutex> #if HAVE_FCNTL_H #include <fcntl.h> @@ -327,10 +326,6 @@ extern "C" int del_curterm(struct term *termp); extern "C" int tigetnum(char *capname); #endif -#ifdef LLVM_ENABLE_TERMINFO -static ManagedStatic<std::mutex> TermColorMutex; -#endif - bool checkTerminalEnvironmentForColors() { if (const char *TermStr = std::getenv("TERM")) { return StringSwitch<bool>(TermStr) @@ -351,7 +346,8 @@ bool checkTerminalEnvironmentForColors() { static bool terminalHasColors(int fd) { #ifdef LLVM_ENABLE_TERMINFO // First, acquire a global lock because these C routines are thread hostile. - std::lock_guard<std::mutex> G(*TermColorMutex); + static std::mutex TermColorMutex; + std::lock_guard<std::mutex> G(TermColorMutex); struct term *previous_term = set_curterm(nullptr); int errret = 0; diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 21f0c39bfd6e..97d63fff1069 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -2669,13 +2669,13 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries, " 'version': 0,\n"; if (IsCaseSensitive) OS << " 'case-sensitive': '" - << (IsCaseSensitive.getValue() ? "true" : "false") << "',\n"; + << (IsCaseSensitive.value() ? 
"true" : "false") << "',\n"; if (UseExternalNames) OS << " 'use-external-names': '" - << (UseExternalNames.getValue() ? "true" : "false") << "',\n"; + << (UseExternalNames.value() ? "true" : "false") << "',\n"; bool UseOverlayRelative = false; if (IsOverlayRelative) { - UseOverlayRelative = IsOverlayRelative.getValue(); + UseOverlayRelative = IsOverlayRelative.value(); OS << " 'overlay-relative': '" << (UseOverlayRelative ? "true" : "false") << "',\n"; } diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 32477de5184b..1621f4a54b79 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -731,6 +731,11 @@ static bool GetDumpType(HKEY Key, MINIDUMP_TYPE &ResultType) { /// otherwise. static std::error_code WINAPI WriteWindowsDumpFile(PMINIDUMP_EXCEPTION_INFORMATION ExceptionInfo) { + struct ScopedCriticalSection { + ScopedCriticalSection() { EnterCriticalSection(&CriticalSection); } + ~ScopedCriticalSection() { LeaveCriticalSection(&CriticalSection); } + } SCS; + using namespace llvm; using namespace llvm::sys; diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index 10f9692d217e..2567f3ed8034 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -285,8 +285,9 @@ constexpr FeatureBitset FeaturesZNVER1 = FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_1 | FeatureSSE4_2 | FeatureSSE4_A | FeatureXSAVE | FeatureXSAVEC | FeatureXSAVEOPT | FeatureXSAVES; -constexpr FeatureBitset FeaturesZNVER2 = - FeaturesZNVER1 | FeatureCLWB | FeatureRDPID | FeatureWBNOINVD; +constexpr FeatureBitset FeaturesZNVER2 = FeaturesZNVER1 | FeatureCLWB | + FeatureRDPID | FeatureRDPRU | + FeatureWBNOINVD; static constexpr FeatureBitset FeaturesZNVER3 = FeaturesZNVER2 | FeatureINVPCID | FeaturePKU | FeatureVAES | FeatureVPCLMULQDQ; @@ -490,6 +491,7 @@ constexpr FeatureBitset ImpliedFeaturesPREFETCHWT1 = {}; constexpr 
FeatureBitset ImpliedFeaturesPRFCHW = {}; constexpr FeatureBitset ImpliedFeaturesPTWRITE = {}; constexpr FeatureBitset ImpliedFeaturesRDPID = {}; +constexpr FeatureBitset ImpliedFeaturesRDPRU = {}; constexpr FeatureBitset ImpliedFeaturesRDRND = {}; constexpr FeatureBitset ImpliedFeaturesRDSEED = {}; constexpr FeatureBitset ImpliedFeaturesRTM = {}; diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 98ceea3c3c7a..651949ad5765 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -429,7 +429,7 @@ raw_ostream &raw_ostream::operator<<(const FormattedBytes &FB) { indent(FB.IndentLevel); if (FB.FirstByteOffset) { - uint64_t Offset = FB.FirstByteOffset.getValue(); + uint64_t Offset = FB.FirstByteOffset.value(); llvm::write_hex(*this, Offset + LineIndex, HPS, OffsetWidth); *this << ": "; } diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 6c205104d569..75a99e95541a 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -2601,7 +2601,7 @@ StringRef Record::getValueAsString(StringRef FieldName) const { if (!S) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); - return S.getValue(); + return S.value(); } llvm::Optional<StringRef> diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index f092c039b58e..b332e9dcb176 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -650,6 +650,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedNeoverseN2.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -1137,7 +1138,7 @@ def : ProcessorModel<"cortex-a78", CortexA57Model, ProcessorFeatures.A78, [TuneA78]>; def : ProcessorModel<"cortex-a78c", CortexA57Model, ProcessorFeatures.A78C, 
[TuneA78C]>; -def : ProcessorModel<"cortex-a710", CortexA57Model, ProcessorFeatures.A710, +def : ProcessorModel<"cortex-a710", NeoverseN2Model, ProcessorFeatures.A710, [TuneA710]>; def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82, [TuneR82]>; @@ -1145,17 +1146,17 @@ def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1, [TuneX1]>; def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C, [TuneX1]>; -def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2, +def : ProcessorModel<"cortex-x2", NeoverseN2Model, ProcessorFeatures.X2, [TuneX2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>; -def : ProcessorModel<"neoverse-n2", CortexA57Model, +def : ProcessorModel<"neoverse-n2", NeoverseN2Model, ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>; -def : ProcessorModel<"neoverse-512tvb", CortexA57Model, +def : ProcessorModel<"neoverse-512tvb", NeoverseN2Model, ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>; -def : ProcessorModel<"neoverse-v1", CortexA57Model, +def : ProcessorModel<"neoverse-v1", NeoverseN2Model, ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3, [TuneExynosM3]>; diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index ef4860979dd3..c568f73471e1 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1173,6 +1173,8 @@ void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { + AArch64_MC::verifyInstructionPredicates(MI->getOpcode(), STI->getFeatureBits()); + // Do any auto-generated pseudo lowerings. 
if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index abfe2d507111..447ad10ddf22 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -237,6 +237,39 @@ static bool isMergePassthruOpcode(unsigned Opc) { } } +// Returns true if inactive lanes are known to be zeroed by construction. +static bool isZeroingInactiveLanes(SDValue Op) { + switch (Op.getOpcode()) { + default: + // We guarantee i1 splat_vectors to zero the other lanes by + // implementing it with ptrue and possibly a punpklo for nxv1i1. + if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) + return true; + return false; + case AArch64ISD::PTRUE: + case AArch64ISD::SETCC_MERGE_ZERO: + return true; + case ISD::INTRINSIC_WO_CHAIN: + switch (Op.getConstantOperandVal(0)) { + default: + return false; + case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_pnext: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: + return true; + } + } +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1082,6 +1115,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); } + // FIXME: Move lowering for more nodes here if those are common between + // SVE and SME. 
+ if (Subtarget->hasSVE() || Subtarget->hasSME()) { + for (auto VT : + {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + } + if (Subtarget->hasSVE()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -1162,14 +1205,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); @@ -2429,6 +2470,23 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -2561,6 +2619,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( BB); case AArch64::ZERO_M_PSEUDO: return EmitZero(MI, BB); + case AArch64::ADDHA_MPPZ_PSEUDO_S: + return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB); + case AArch64::ADDVA_MPPZ_PSEUDO_S: + return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB); + case AArch64::ADDHA_MPPZ_PSEUDO_D: + return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB); + case AArch64::ADDVA_MPPZ_PSEUDO_D: + return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB); } } @@ -4329,55 +4395,49 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern) { + if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) + return DAG.getConstant(1, DL, MVT::nxv1i1); return DAG.getNode(AArch64ISD::PTRUE, DL, VT, DAG.getTargetConstant(Pattern, DL, MVT::i32)); } -static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { +// Returns a safe bitcast between two scalable vector predicates, where +// any newly created lanes from a widening bitcast are defined as zero. 
+static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); - EVT OutVT = Op.getValueType(); - SDValue InOp = Op.getOperand(1); - EVT InVT = InOp.getValueType(); + EVT InVT = Op.getValueType(); + + assert(InVT.getVectorElementType() == MVT::i1 && + VT.getVectorElementType() == MVT::i1 && + "Expected a predicate-to-predicate bitcast"); + assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + InVT.isScalableVector() && + DAG.getTargetLoweringInfo().isTypeLegal(InVT) && + "Only expect to cast between legal scalable predicate types!"); // Return the operand if the cast isn't changing type, - // i.e. <n x 16 x i1> -> <n x 16 x i1> - if (InVT == OutVT) - return InOp; + // e.g. <n x 16 x i1> -> <n x 16 x i1> + if (InVT == VT) + return Op; - SDValue Reinterpret = - DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp); + SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - // If the argument converted to an svbool is a ptrue or a comparison, the - // lanes introduced by the widening are zero by construction. - switch (InOp.getOpcode()) { - case AArch64ISD::SETCC_MERGE_ZERO: + // We only have to zero the lanes if new lanes are being defined, e.g. when + // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the + // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then + // we can return here. 
+ if (InVT.bitsGT(VT)) return Reinterpret; - case ISD::INTRINSIC_WO_CHAIN: - switch (InOp.getConstantOperandVal(0)) { - case Intrinsic::aarch64_sve_ptrue: - case Intrinsic::aarch64_sve_cmpeq_wide: - case Intrinsic::aarch64_sve_cmpne_wide: - case Intrinsic::aarch64_sve_cmpge_wide: - case Intrinsic::aarch64_sve_cmpgt_wide: - case Intrinsic::aarch64_sve_cmplt_wide: - case Intrinsic::aarch64_sve_cmple_wide: - case Intrinsic::aarch64_sve_cmphs_wide: - case Intrinsic::aarch64_sve_cmphi_wide: - case Intrinsic::aarch64_sve_cmplo_wide: - case Intrinsic::aarch64_sve_cmpls_wide: - return Reinterpret; - } - } - // Splat vectors of one will generate ptrue instructions - if (ISD::isConstantSplatVectorAllOnes(InOp.getNode())) + // Check if the other lanes are already known to be zeroed by + // construction. + if (isZeroingInactiveLanes(Op)) return Reinterpret; - // Otherwise, zero the newly introduced lanes. - SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); - SDValue MaskReinterpret = - DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask); - return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret); + // Zero the newly introduced lanes. 
+ SDValue Mask = DAG.getConstant(1, DL, InVT); + Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask); + return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); } SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, @@ -4546,10 +4606,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: - return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), - Op.getOperand(1)); + return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_convert_to_svbool: - return lowerConvertToSVBool(Op, DAG); + return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_fneg: return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); @@ -6393,9 +6452,8 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { if (SizeInBits < 8) return false; - APInt LowBits(SizeInBits, 0xFF); APInt RequredZero(SizeInBits, 0xFE); - KnownBits Bits = DAG.computeKnownBits(Arg, LowBits, 4); + KnownBits Bits = DAG.computeKnownBits(Arg, 4); bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero; return ZExtBool; } @@ -14814,16 +14872,6 @@ static SDValue performANDCombine(SDNode *N, if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - // Although NEON has no EORV instruction, when only the least significant bit - // is required the operation is synonymous with ADDV. 
- if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) && - LHS.getOperand(0).getValueType().isFixedLengthVector() && - LHS.hasOneUse()) { - SDLoc DL(N); - SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0)); - return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS); - } - if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); @@ -16126,12 +16174,24 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, assert(Op.getValueType().isScalableVector() && TLI.isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); + assert(Op.getValueType() == Pg.getValueType() && + "Expected same type for PTEST operands"); // Ensure target specific opcodes are using legal type. EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue TVal = DAG.getConstant(1, DL, OutVT); SDValue FVal = DAG.getConstant(0, DL, OutVT); + // Ensure operands have type nxv16i1. + if (Op.getValueType() != MVT::nxv16i1) { + if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) && + isZeroingInactiveLanes(Op)) + Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg); + else + Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG); + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); + } + // Set condition code (CC) flags. SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); @@ -18026,6 +18086,54 @@ static SDValue performCSELCombine(SDNode *N, return performCONDCombine(N, DCI, DAG, 2, 3); } +// Try to re-use an already extended operand of a vector SetCC feeding a +// extended select. Doing so avoids requiring another full extension of the +// SET_CC result when lowering the select. 
+static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { + EVT Op0MVT = Op->getOperand(0).getValueType(); + if (!Op0MVT.isVector() || Op->use_empty()) + return SDValue(); + + // Make sure that all uses of Op are VSELECTs with result matching types where + // the result type has a larger element type than the SetCC operand. + SDNode *FirstUse = *Op->use_begin(); + if (FirstUse->getOpcode() != ISD::VSELECT) + return SDValue(); + EVT UseMVT = FirstUse->getValueType(0); + if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) + return SDValue(); + if (any_of(Op->uses(), [&UseMVT](const SDNode *N) { + return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT; + })) + return SDValue(); + + APInt V; + if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V)) + return SDValue(); + + SDLoc DL(Op); + SDValue Op0ExtV; + SDValue Op1ExtV; + ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get(); + // Check if the first operand of the SET_CC is already extended. If it is, + // split the SET_CC and re-use the extended version of the operand. 
+ SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT), + Op->getOperand(0)); + SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT), + Op->getOperand(0)); + if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { + Op0ExtV = SDValue(Op0SExt, 0); + Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1)); + } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { + Op0ExtV = SDValue(Op0ZExt, 0); + Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1)); + } else + return SDValue(); + + return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1), + Op0ExtV, Op1ExtV, Op->getOperand(2)); +} + static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); SDValue LHS = N->getOperand(0); @@ -18034,6 +18142,9 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); + if (SDValue V = tryToWidenSetCCOperands(N, DAG)) + return V; + // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X if (Cond == ISD::SETNE && isOneConstant(RHS) && LHS->getOpcode() == AArch64ISD::CSEL && @@ -21045,7 +21156,7 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, default: return SDValue(); case ISD::VECREDUCE_OR: - if (isAllActivePredicate(DAG, Pg)) + if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1) // The predicate can be 'Op' because // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op). return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE); @@ -21058,6 +21169,11 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, case ISD::VECREDUCE_XOR: { SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); + if (OpVT == MVT::nxv1i1) { + // Emulate a CNTP on .Q using .D and a different governing predicate. 
+ Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg); + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op); + } SDValue Cntp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); return DAG.getAnyExtOrTrunc(Cntp, DL, VT); @@ -21464,22 +21580,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT InVT = Op.getValueType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - (void)TLI; - assert(VT.isScalableVector() && TLI.isTypeLegal(VT) && - InVT.isScalableVector() && TLI.isTypeLegal(InVT) && + assert(VT.isScalableVector() && isTypeLegal(VT) && + InVT.isScalableVector() && isTypeLegal(InVT) && "Only expect to cast between legal scalable vector types!"); - assert((VT.getVectorElementType() == MVT::i1) == - (InVT.getVectorElementType() == MVT::i1) && - "Cannot cast between data and predicate scalable vector types!"); + assert(VT.getVectorElementType() != MVT::i1 && + InVT.getVectorElementType() != MVT::i1 && + "For predicate bitcasts, use getSVEPredicateBitCast"); if (InVT == VT) return Op; - if (VT.getVectorElementType() == MVT::i1) - return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 06ea918ea32e..e02b5e56fd2e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -571,6 +571,9 @@ public: MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -1148,6 +1151,7 @@ private: // These 
can make "bitcasting" a multiphase process. REINTERPRET_CAST is used // to transition between unpacked and packed types of the same element type, // with BITCAST used otherwise. + // This function does not handle predicate bitcasts. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index c477a44b13b2..6839e73796a6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,21 +29,21 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>; // An atomic load operation that does not need either acquire or release // semantics. -class relaxed_load<PatFrag base> +class relaxed_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingAcquireOrStronger = 0; } // A atomic load operation that actually needs acquire semantics. -class acquiring_load<PatFrag base> +class acquiring_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingAcquire = 1; } // An atomic load operation that needs sequential consistency. 
-class seq_cst_load<PatFrag base> +class seq_cst_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingSequentiallyConsistent = 1; @@ -63,34 +63,34 @@ let Predicates = [HasLDAPR] in { } // 8-bit loads -def : Pat<(seq_cst_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, +def : Pat<(seq_cst_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(acquiring_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_az_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)), (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend8:$offset)), +def : Pat<(relaxed_load<atomic_load_az_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)), +def : Pat<(relaxed_load<atomic_load_az_8> (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> +def : Pat<(relaxed_load<atomic_load_az_8> (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (LDURBBi GPR64sp:$Rn, simm9:$offset)>; // 16-bit loads -def : Pat<(seq_cst_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, +def : Pat<(seq_cst_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(acquiring_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_az_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)), 
(LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend)), +def : Pat<(relaxed_load<atomic_load_az_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)), +def : Pat<(relaxed_load<atomic_load_az_16> (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_load<atomic_load_16> +def : Pat<(relaxed_load<atomic_load_az_16> (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), (LDURHHi GPR64sp:$Rn, simm9:$offset)>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 78bc1b8c6f02..02fa36a1df4b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1505,7 +1505,7 @@ class CRmSystemI<Operand crmtype, bits<3> opc, string asm, class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []> : SimpleSystemI<0, (ins), asm, "", pattern>, - Sched<[]> { + Sched<[WriteHint]> { bits<4> CRm; let CRm = 0b0011; let Inst{31-12} = 0b11010101000000110010; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3802a45ad6c1..d444223e4494 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4356,10 +4356,12 @@ defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. 
multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> { + let Predicates = [HasFullFP16] in { def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)), (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>; def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)), (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>; + } def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)), (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>; def : Pat<(v4i32 (to_int_sat v4f32:$Rn, i32)), diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 68ff1b78e84b..c66f9cfd9c22 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -778,7 +778,7 @@ let Predicates = [HasSVEorSME] in { defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; - def PTEST_PP : sve_int_ptest<0b010000, "ptest">; + def PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest>; defm PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; @@ -1531,6 +1531,14 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), (PUNPKHI_PP PPR:$Ps)>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), @@ -1539,7 +1547,6 @@ let Predicates = [HasSVEorSME] in { 
(PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; - def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), @@ -1549,6 +1556,23 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; + + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), @@ -1566,6 +1590,39 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 1))), + 
(PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 9))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 11))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 13))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 15))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + // Extract subvectors from FP SVE vectors def : Pat<(nxv2f16 
(extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_D ZPR:$Zs)>; @@ -2074,15 +2131,6 @@ let Predicates = [HasSVEorSME] in { def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } - def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - let AddedComplexity = 1 in { class LD1RPat<ValueType vt, SDPatternOperator operator, Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> : @@ -2347,6 +2395,9 @@ let Predicates = [HasSVEorSME] in { (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>; def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)), (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; + // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. 
+ def : Pat<(nxv1i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; // Add more complex addressing modes here as required multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load, diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index d18a05fda191..e378b043d37e 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -28,7 +28,8 @@ def CortexA53Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index c6b112d0d2f1..141cc6b79c8b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -29,7 +29,7 @@ def CortexA55Model : SchedMachineModel { let PostRAScheduler = 1; // Enable PostRA scheduler pass. let CompleteModel = 0; // Covers instructions applicable to Cortex-A55. - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = [HasSVE, HasMTE]; // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index a860aa907fd1..8ce229374000 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -33,7 +33,8 @@ def CortexA57Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// @@ -459,9 +460,9 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCM // ASIMD FP convert, long and narrow def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>; // ASIMD FP convert, other, D-form -def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; // ASIMD FP convert, other, Q-form -def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP divide, D-form, F32 def : InstRW<[A57Write_17cyc_1W], (instregex "FDIVv2f32")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 6b053f1969b4..4c65b6727d93 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -18,11 +18,11 @@ def A64FXModel : SchedMachineModel { // Determined via a mix of micro-arch details and experimentation. let LoopMicroOpBufferSize = 128; let PostRAScheduler = 1; // Using PostRA sched. 
- let CompleteModel = 1; + let CompleteModel = 0; list<Predicate> UnsupportedFeatures = [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth, - HasSVE2orSME]; + HasSVE2orSME, HasMTE, HasMatMulInt8, HasBF16]; let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td index 32f7299fbf87..b8d5a70d7ec6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -25,7 +25,9 @@ def Ampere1Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + PAUnsupported.F, + [HasMTE]); } let SchedModel = Ampere1Model in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 9fbb46919427..e2d916954060 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -20,7 +20,8 @@ def CycloneModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d66efb82fccc..f2863f5a8e3b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -26,7 +26,8 @@ def ExynosM3Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 94e70793e855..ab1e680f9e99 100644 
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -26,7 +26,8 @@ def ExynosM4Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 1db5f5322a64..ae0b2b3eaeb6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -26,7 +26,8 @@ def ExynosM5Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 7c9b0afdd169..a765cd1cdfe3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -25,7 +25,8 @@ def FalkorModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index cc568a2f2f17..3551066ee7c3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -29,7 +29,8 @@ def KryoModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td new file mode 100644 index 000000000000..eb5b971d66e5 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -0,0 +1,2279 @@ +//=- AArch64SchedNeoverseN2.td - NeoverseN2 Scheduling Defs --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for the Arm Neoverse N2 processors. +// +//===----------------------------------------------------------------------===// + +def NeoverseN2Model : SchedMachineModel { + let IssueWidth = 10; // Micro-ops dispatched at a time. + let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 10; // Extra cycles for mispredicted branch. + let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = SMEUnsupported.F; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Neoverse N2. +// Instructions are first fetched and then decoded into internal macro-ops +// (MOPs). From there, the MOPs proceed through register renaming and dispatch +// stages. A MOP can be split into two micro-ops further down the pipeline +// after the decode stage. Once dispatched, micro-ops wait for their operands +// and issue out-of-order to one of thirteen issue pipelines. Each issue +// pipeline can accept one micro-op per cycle. + +let SchedModel = NeoverseN2Model in { + +// Define the (13) issue ports. 
+def N2UnitB : ProcResource<2>; // Branch 0/1 +def N2UnitS : ProcResource<2>; // Integer single Cycle 0/1 +def N2UnitM0 : ProcResource<1>; // Integer multicycle 0 +def N2UnitM1 : ProcResource<1>; // Integer multicycle 1 +def N2UnitL01 : ProcResource<2>; // Load/Store 0/1 +def N2UnitL2 : ProcResource<1>; // Load 2 +def N2UnitD : ProcResource<2>; // Store data 0/1 +def N2UnitV0 : ProcResource<1>; // FP/ASIMD 0 +def N2UnitV1 : ProcResource<1>; // FP/ASIMD 1 + +def N2UnitV : ProcResGroup<[N2UnitV0, N2UnitV1]>; // FP/ASIMD 0/1 +def N2UnitM : ProcResGroup<[N2UnitM0, N2UnitM1]>; // Integer single/multicycle 0/1 +def N2UnitL : ProcResGroup<[N2UnitL01, N2UnitL2]>; // Load/Store 0/1 and Load 2 +def N2UnitI : ProcResGroup<[N2UnitS, N2UnitM0, N2UnitM1]>; // Integer single cycle 0/1 and single/multicycle 0/1 + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadST, 0>; +def : ReadAdvance<ReadVLD, 0>; + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Neoverse N2. 
+ +//===----------------------------------------------------------------------===// +// Define generic 1 micro-op types + +def N2Write_1cyc_1B : SchedWriteRes<[N2UnitB]> { let Latency = 1; } +def N2Write_1cyc_1I : SchedWriteRes<[N2UnitI]> { let Latency = 1; } +def N2Write_1cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 1; } +def N2Write_1cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 1; } +def N2Write_1cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 1; } +def N2Write_2cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 2; } +def N2Write_3cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 3; } +def N2Write_2cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 2; + let ResourceCycles = [2]; } +def N2Write_3cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 3; + let ResourceCycles = [3]; } +def N2Write_5cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 5; + let ResourceCycles = [5]; } +def N2Write_12cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 12; + let ResourceCycles = [12]; } +def N2Write_20cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 20; + let ResourceCycles = [20]; } +def N2Write_4cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 4; } +def N2Write_6cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 6; } +def N2Write_2cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 2; } +def N2Write_3cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 3; } +def N2Write_4cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 4; } +def N2Write_5cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 5; } +def N2Write_12cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 12; } +def N2Write_2cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 2; } +def N2Write_3cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 3; } +def N2Write_4cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 4; } +def N2Write_7cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 7; + let ResourceCycles = [7]; } +def N2Write_9cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 9; } +def 
N2Write_10cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 10; } +def N2Write_12cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 12; } +def N2Write_13cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 13; } +def N2Write_15cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 15; } +def N2Write_16cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 16; } +def N2Write_20cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 20; } +def N2Write_2cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 2; } +def N2Write_3cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 3; } +def N2Write_4cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 4; } +def N2Write_6cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 6; } +def N2Write_10cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 10; } +def N2Write_6cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 6; } + +//===----------------------------------------------------------------------===// +// Define generic 2 micro-op types + +def N2Write_1cyc_1B_1S : SchedWriteRes<[N2UnitB, N2UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1M0_1B : SchedWriteRes<[N2UnitM0, N2UnitB]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_9cyc_1M0_1L : SchedWriteRes<[N2UnitM0, N2UnitL]> { + let Latency = 9; + let NumMicroOps = 2; +} + +def N2Write_3cyc_1I_1M : SchedWriteRes<[N2UnitI, N2UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_7cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def N2Write_1cyc_1L01_1D : SchedWriteRes<[N2UnitL01, N2UnitD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1M0_1V 
: SchedWriteRes<[N2UnitM0, N2UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_4cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_10cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [5, 5]; +} + +def N2Write_13cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 13; + let NumMicroOps = 2; + let ResourceCycles = [6, 7]; +} + +def N2Write_15cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 15; + let NumMicroOps = 2; + let ResourceCycles = [7, 8]; +} + +def N2Write_16cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [8, 8]; +} + +def N2Write_4cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2L : SchedWriteRes<[N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_8cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_3cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_5cyc_2V0 : 
SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1V1_1M0 : SchedWriteRes<[N2UnitV1, N2UnitM0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_7cyc_1M0_1V0 : SchedWriteRes<[N2UnitM0, N2UnitV0]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1V_1V1 : SchedWriteRes<[N2UnitV, N2UnitV1]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1L_1M : SchedWriteRes<[N2UnitL, N2UnitM]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1L_1S : SchedWriteRes<[N2UnitL, N2UnitS]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_9cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} + +def N2Write_4cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define generic 3 micro-op types + +def N2Write_1cyc_1L01_1D_1I : SchedWriteRes<[N2UnitL01, N2UnitD, N2UnitI]> { + let Latency = 1; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_1V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitI]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_2V : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_7cyc_1M_1M0_1V : SchedWriteRes<[N2UnitM, N2UnitM0, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 3; +} + +def N2Write_8cyc_1M0_1V1_1V : SchedWriteRes<[N2UnitM0, N2UnitV1, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def N2Write_10cyc_1V_1L_1S : SchedWriteRes<[N2UnitV, N2UnitL, N2UnitL]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_1S_1V : SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_4cyc_1L01_1S_1V : 
SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def N2Write_6cyc_3L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def N2Write_8cyc_1L_2V : SchedWriteRes<[N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define generic 4 micro-op types + +def N2Write_2cyc_1L01_2V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV, + N2UnitI]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def N2Write_6cyc_4V0 : SchedWriteRes<[N2UnitV0, N2UnitV0, N2UnitV0, N2UnitV0]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_4cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_6cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_8cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_9cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} + +def N2Write_2cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def N2Write_4cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_5cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def N2Write_8cyc_2M0_2V0 : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitV0, + N2UnitV0]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_11cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def N2Write_9cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 9; + let 
NumMicroOps = 4; +} + +def N2Write_8cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_10cyc_2L_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 10; + let NumMicroOps = 4; +} + +def N2Write_10cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 10; + let NumMicroOps = 4; +} + +def N2Write_4cyc_2M0_2M : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitM, + N2UnitM]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_6cyc_2I_2L : SchedWriteRes<[N2UnitI, N2UnitI, N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_7cyc_4L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define generic 5 micro-op types + +def N2Write_2cyc_1L01_2V_2I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV, + N2UnitI, N2UnitI]> { + let Latency = 2; + let NumMicroOps = 5; +} + +def N2Write_8cyc_2L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 8; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define generic 6 micro-op types + +def N2Write_8cyc_3L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def N2Write_2cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 6; +} + +def N2Write_6cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def N2Write_4cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def N2Write_10cyc_2L_2V_2S : SchedWriteRes<[N2UnitL, N2UnitL, 
N2UnitV, N2UnitV, + N2UnitS, N2UnitS]> { + let Latency = 10; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define generic 7 micro-op types + +def N2Write_8cyc_3L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 7; +} + +//===----------------------------------------------------------------------===// +// Define generic 8 micro-op types + +def N2Write_6cyc_8V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 8; +} + +def N2Write_2cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitV, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 2; + let NumMicroOps = 8; +} + +def N2Write_5cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitV, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def N2Write_8cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def N2Write_9cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define generic 10 micro-op types + +def N2Write_7cyc_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define generic 12 micro-op types + +def N2Write_7cyc_6L01_6V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 12; +} + 
+//===----------------------------------------------------------------------===// +// Define generic 15 micro-op types + +def N2Write_7cyc_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 15; +} + +//===----------------------------------------------------------------------===// +// Define generic 18 micro-op types + +def N2Write_11cyc_9L01_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 11; + let NumMicroOps = 18; +} + +//===----------------------------------------------------------------------===// +// Define generic 27 micro-op types + +def N2Write_11cyc_9L01_9S_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 11; + let NumMicroOps = 27; +} + +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + +// Branch Instructions +// ----------------------------------------------------------------------------- + +// Branch, immed +// Compare and branch +def : SchedAlias<WriteBr, N2Write_1cyc_1B>; + +// Branch, register +def : SchedAlias<WriteBrReg, N2Write_1cyc_1B>; + +// Branch and link, immed +// Branch and link, register +def : InstRW<[N2Write_1cyc_1B_1S], (instrs BL, BLR)>; + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- + +// ALU, basic +// ALU, basic, flagset +def : SchedAlias<WriteI, 
N2Write_1cyc_1I>; + +// ALU, extend and shift +def : SchedAlias<WriteISReg, N2Write_2cyc_1M>; +def : SchedAlias<WriteIEReg, N2Write_2cyc_1M>; + +// Arithmetic, immediate to logical address tag +def : InstRW<[N2Write_2cyc_1M], (instrs ADDG, SUBG)>; + +// Convert floating-point condition flags +// Flag manipulation instructions +def : WriteRes<WriteSys, []> { let Latency = 1; } + +// Insert Random Tags +def : InstRW<[N2Write_2cyc_1M], (instrs IRG, IRGstack)>; + +// Insert Tag Mask +// Subtract Pointer +// Subtract Pointer, flagset +def : InstRW<[N2Write_1cyc_1I], (instrs GMI, SUBP, SUBPS)>; + +// Move and shift instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteImm, N2Write_1cyc_1I>; + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +// SDIV, UDIV +def : SchedAlias<WriteID32, N2Write_12cyc_1M0>; +def : SchedAlias<WriteID64, N2Write_20cyc_1M0>; + +def : WriteRes<WriteIM32, [N2UnitM]> { let Latency = 2; } +def : WriteRes<WriteIM64, [N2UnitM]> { let Latency = 2; } + +// Multiply high +def : InstRW<[N2Write_3cyc_1M], (instrs SMULHrr, UMULHrr)>; + +// Pointer Authentication Instructions (v8.3 PAC) +// ----------------------------------------------------------------------------- + +// Authenticate data address +// Authenticate instruction address +// Compute pointer authentication code for data address +// Compute pointer authentication code, using generic key +// Compute pointer authentication code for instruction address +def : InstRW<[N2Write_5cyc_1M0], (instregex "^AUT", "^PAC")>; + +// Branch and link, register, with pointer authentication +// Branch, register, with pointer authentication +// Branch, return, with pointer authentication +def : InstRW<[N2Write_6cyc_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA, + BRAAZ, BRAB, BRABZ, RETAA, RETAB, + ERETAA, ERETAB)>; + + +// Load register, with pointer authentication +def : 
InstRW<[N2Write_9cyc_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>; + +// Strip pointer authentication code +def : InstRW<[N2Write_2cyc_1M0], (instrs XPACD, XPACI, XPACLRI)>; + +// Miscellaneous data-processing instructions +// ----------------------------------------------------------------------------- + +// Bitfield extract, one reg +// Bitfield extract, two regs +// NOTE: We don't model the difference between EXTR where both operands are the +// same (one reg). +def : SchedAlias<WriteExtr, N2Write_3cyc_1I_1M>; +def : InstRW<[N2Write_3cyc_1I_1M], (instrs EXTRWrri, EXTRXrri)>; + +// Bitfield move, basic +def : SchedAlias<WriteIS, N2Write_1cyc_1I>; + +// Bitfield move, insert +def : InstRW<[N2Write_2cyc_1M], (instregex "^BFM[WX]ri$")>; + +// Load instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteLD, N2Write_4cyc_1L>; +def : SchedAlias<WriteLDIdx, N2Write_4cyc_1I_1L>; + +// Load pair, signed immed offset, signed words +def : InstRW<[N2Write_5cyc_1M0, WriteLDHi], (instrs LDPSWi)>; +// Load pair, immed post-index or immed pre-index, signed words +def : InstRW<[N2Write_5cyc_1M0, WriteLDHi, WriteAdr], + (instregex "^LDPSW(post|pre)$")>; + +// Store instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteST, N2Write_1cyc_1L01_1D>; +def : SchedAlias<WriteSTIdx, N2Write_1cyc_1L01_1D_1I>; +def : SchedAlias<WriteSTP, N2Write_1cyc_1L01_1D>; +def : SchedAlias<WriteAdr, N2Write_1cyc_1I>; // copied from A57. 
+ +// Tag load instructions +// ----------------------------------------------------------------------------- + +// Load allocation tag +// Load multiple allocation tags +def : InstRW<[N2Write_4cyc_1L], (instrs LDG, LDGM)>; + +// Tag store instructions +// ----------------------------------------------------------------------------- + +// Store allocation tags to one or two granules, post-index +// Store allocation tags to one or two granules, pre-index +// Store allocation tag to one or two granules, zeroing, post-index +// Store Allocation Tag to one or two granules, zeroing, pre-index +// Store allocation tag and reg pair to memory, post-Index +// Store allocation tag and reg pair to memory, pre-Index +def : InstRW<[N2Write_1cyc_1L01_1D_1I], (instrs STGPreIndex, STGPostIndex, + ST2GPreIndex, ST2GPostIndex, + STZGPreIndex, STZGPostIndex, + STZ2GPreIndex, STZ2GPostIndex, + STGPpre, STGPpost)>; + +// Store allocation tags to one or two granules, signed offset +// Store allocation tag to two granules, zeroing, signed offset +// Store allocation tag and reg pair to memory, signed offset +// Store multiple allocation tags +def : InstRW<[N2Write_1cyc_1L01_1D], (instrs STGOffset, ST2GOffset, STZGOffset, + STZ2GOffset, STGPi, STGM, STZGM)>; + +// FP data processing instructions +// ----------------------------------------------------------------------------- + +// FP absolute value +// FP arithmetic +// FP min/max +// FP negate +// FP select +def : SchedAlias<WriteF, N2Write_2cyc_1V>; + +// FP compare +def : SchedAlias<WriteFCmp, N2Write_2cyc_1V0>; + +// FP divide, square root +def : SchedAlias<WriteFDiv, N2Write_7cyc_1V0>; + +// FP divide, H-form +def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVHrr)>; +// FP divide, S-form +def : InstRW<[N2Write_10cyc_1V0], (instrs FDIVSrr)>; +// FP divide, D-form +def : InstRW<[N2Write_15cyc_1V0], (instrs FDIVDrr)>; + +// FP square root, H-form +def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTHr)>; +// FP square root, S-form +def : 
InstRW<[N2Write_9cyc_1V0], (instrs FSQRTSr)>; +// FP square root, D-form +def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRTDr)>; + +// FP multiply +def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; } + +// FP multiply accumulate +def : InstRW<[N2Write_4cyc_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; + +// FP round to integral +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$", + "^FRINT(32|64)[XZ][SD]r$")>; + +// FP miscellaneous instructions +// ----------------------------------------------------------------------------- + +// FP convert, from gen to vec reg +def : InstRW<[N2Write_3cyc_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>; + +// FP convert, from vec to gen reg +def : InstRW<[N2Write_3cyc_1V], (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]r$")>; + +// FP convert, Javascript from vec to gen reg +// FP convert, from vec to vec reg +def : SchedAlias<WriteFCvt, N2Write_3cyc_1V0>; + +// FP move, immed +// FP move, register +def : SchedAlias<WriteFImm, N2Write_2cyc_1V>; + +// FP transfer, from gen to low half of vec reg +def : InstRW<[N2Write_3cyc_1M0], (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr, + FMOVHWr, FMOVHXr, FMOVSWr, FMOVDXr)>; + +// FP transfer, from gen to high half of vec reg +def : InstRW<[N2Write_5cyc_1M0_1V], (instrs FMOVXDHighr)>; + +// FP transfer, from vec to gen reg +def : SchedAlias<WriteFCopy, N2Write_2cyc_1V>; + +// FP load instructions +// ----------------------------------------------------------------------------- + +// Load vector reg, literal, S/D/Q forms +// Load vector reg, unscaled immed +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[SDQ]l$", + "^LDUR[BHSDQ]i$")>; + +// Load vector reg, immed post-index +def : InstRW<[N2Write_6cyc_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>; +// Load vector reg, immed pre-index +def : InstRW<[N2Write_6cyc_1I_1L, WriteAdr], (instregex "^LDR[BHSDQ]pre$")>; + +// Load vector reg, unsigned immed +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>; + +// Load vector 
reg, register offset, basic +// Load vector reg, register offset, scale, S/D-form +// Load vector reg, register offset, extend +// Load vector reg, register offset, extend, scale, S/D-form +def : InstRW<[N2Write_6cyc_1L, ReadAdrBase], (instregex "^LDR[BSD]ro[WX]$")>; + +// Load vector reg, register offset, scale, H/Q-form +// Load vector reg, register offset, extend, scale, H/Q-form +def : InstRW<[N2Write_7cyc_1I_1L, ReadAdrBase], (instregex "^LDR[HQ]ro[WX]$")>; + +// Load vector pair, immed offset, S/D-form +def : InstRW<[N2Write_6cyc_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>; + +// Load vector pair, immed offset, Q-form +def : InstRW<[N2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>; + +// Load vector pair, immed post-index, S/D-form +// Load vector pair, immed pre-index, S/D-form +def : InstRW<[N2Write_6cyc_1I_1L, WriteLDHi, WriteAdr], + (instregex "^LDP[SD](pre|post)$")>; + +// Load vector pair, immed post-index, Q-form +// Load vector pair, immed pre-index, Q-form +def : InstRW<[N2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost, + LDPQpre)>; + +// FP store instructions +// ----------------------------------------------------------------------------- + +// Store vector reg, unscaled immed, B/H/S/D-form +// Store vector reg, unscaled immed, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STUR[BHSDQ]i$")>; + +// Store vector reg, immed post-index, B/H/S/D-form +// Store vector reg, immed post-index, Q-form +// Store vector reg, immed pre-index, B/H/S/D-form +// Store vector reg, immed pre-index, Q-form +def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I, ReadAdrBase], + (instregex "^STR[BHSDQ](pre|post)$")>; + +// Store vector reg, unsigned immed, B/H/S/D-form +// Store vector reg, unsigned immed, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STR[BHSDQ]ui$")>; + +// Store vector reg, register offset, basic, B/H/S/D-form +// Store vector reg, register offset, basic, Q-form +// Store vector reg, register offset, scale, S/D-form +// 
Store vector reg, register offset, extend, B/H/S/D-form +// Store vector reg, register offset, extend, Q-form +// Store vector reg, register offset, extend, scale, S/D-form +def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase], + (instregex "^STR[BSD]ro[WX]$")>; + +// Store vector reg, register offset, scale, H-form +// Store vector reg, register offset, scale, Q-form +// Store vector reg, register offset, extend, scale, H-form +// Store vector reg, register offset, extend, scale, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase], + (instregex "^STR[HQ]ro[WX]$")>; + +// Store vector pair, immed offset, S-form +// Store vector pair, immed offset, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STN?P[SD]i$")>; + +// Store vector pair, immed offset, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V], (instrs STPQi, STNPQi)>; + +// Store vector pair, immed post-index, S-form +// Store vector pair, immed post-index, D-form +// Store vector pair, immed pre-index, S-form +// Store vector pair, immed pre-index, D-form +def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I], + (instregex "^STP[SD](pre|post)$")>; + +// Store vector pair, immed post-index, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V_1I], (instrs STPQpost)>; + +// Store vector pair, immed pre-index, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V_2I], (instrs STPQpre)>; + +// ASIMD integer instructions +// ----------------------------------------------------------------------------- + +// ASIMD absolute diff +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD arith, pair-wise +// ASIMD compare +// ASIMD logical +// ASIMD max/min, basic and pair-wise +def : SchedAlias<WriteVd, N2Write_2cyc_1V>; +def : SchedAlias<WriteVq, N2Write_2cyc_1V>; + +// ASIMD absolute diff accum +// ASIMD absolute diff accum long +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>; + +// ASIMD arith, reduce, 4H/4S +def : InstRW<[N2Write_2cyc_1V1], (instregex 
"^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; + +// ASIMD arith, reduce, 8B/8H +def : InstRW<[N2Write_4cyc_1V1_1V], + (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>; + +// ASIMD arith, reduce, 16B +def : InstRW<[N2Write_4cyc_1V1], (instrs ADDVv16i8v, SADDLVv16i8v, + UADDLVv16i8v)>; + +// ASIMD dot product +// ASIMD dot product using signed and unsigned integers +def : InstRW<[N2Write_3cyc_1V], + (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; + +// ASIMD matrix multiply-accumulate +def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA, UMMLA, USMMLA)>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$", + "^[SU](MAX|MIN)Vv4i32v$")>; + +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[N2Write_4cyc_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", + "^[SU](MAX|MIN)Vv8i16v$")>; + +// ASIMD max/min, reduce, 16B +def : InstRW<[N2Write_4cyc_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>; + +// ASIMD multiply +def : InstRW<[N2Write_4cyc_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>; + +// ASIMD multiply accumulate +def : InstRW<[N2Write_4cyc_1V0], (instregex "^MLAv", "^MLSv")>; + +// ASIMD multiply accumulate high +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + +// ASIMD multiply accumulate long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + +// ASIMD multiply accumulate saturating long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>; + +// ASIMD multiply/multiply long (8x8) polynomial, D-form +// ASIMD multiply/multiply long (8x8) polynomial, Q-form +def : InstRW<[N2Write_3cyc_1V0], (instregex "^PMULL?(v8i8|v16i8)$")>; + +// ASIMD multiply long +def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]MULLv", "^SQDMULLv")>; + +// ASIMD pairwise add and accumulate long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALPv")>; + +// ASIMD shift accumulate +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + +// ASIMD shift by immed, basic +def : 
InstRW<[N2Write_2cyc_1V1], (instregex "^SHLv", "^SHLLv", "^SHRNv", + "^SSHLLv", "^SSHRv", "^USHLLv", + "^USHRv")>; + +// ASIMD shift by immed and insert, basic +def : InstRW<[N2Write_2cyc_1V1], (instregex "^SLIv", "^SRIv")>; + +// ASIMD shift by immed, complex +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^RSHRNv", "^SQRSHRNv", "^SQRSHRUNv", + "^(SQSHLU?|UQSHL)[bhsd]$", + "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", + "^SQSHRNv", "^SQSHRUNv", "^SRSHRv", "^UQRSHRNv", + "^UQSHRNv", "^URSHRv")>; + +// ASIMD shift by register, basic +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]SHLv")>; + +// ASIMD shift by register, complex +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^[SU]RSHLv", "^[SU]QRSHLv", + "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>; + +// ASIMD floating-point instructions +// ----------------------------------------------------------------------------- + +// ASIMD FP absolute value/difference +// ASIMD FP arith, normal +// ASIMD FP compare +// ASIMD FP complex add +// ASIMD FP max/min, normal +// ASIMD FP max/min, pairwise +// ASIMD FP negate +// Handled by SchedAlias<WriteV[dq], ...> + +// ASIMD FP complex multiply add +def : InstRW<[N2Write_4cyc_1V], (instregex "^FCMLAv")>; + +// ASIMD FP convert, long (F16 to F32) +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTL(v4|v8)i16")>; + +// ASIMD FP convert, long (F32 to F64) +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTL(v2|v4)i32")>; + +// ASIMD FP convert, narrow (F32 to F16) +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTN(v4|v8)i16")>; + +// ASIMD FP convert, narrow (F64 to F32) +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTN(v2|v4)i32", + "^FCVTXN(v2|v4)f32")>; + +// ASIMD FP convert, other, D-form F32 and Q-form F64 +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[FSU]CVT[AMNPZ][SU]v2f(32|64)$", + "^[SU]CVTFv2f(32|64)$")>; + +// ASIMD FP convert, other, D-form F16 and Q-form F32 +def : InstRW<[N2Write_4cyc_2V0], 
(instregex "^[FSU]CVT[AMNPZ][SU]v4f(16|32)$", + "^[SU]CVTFv4f(16|32)$")>; + +// ASIMD FP convert, other, Q-form F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex "^[FSU]CVT[AMNPZ][SU]v8f16$", + "^[SU]CVTFv8f16$")>; + +// ASIMD FP divide, D-form, F16 +def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVv4f16)>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F16 +def : InstRW<[N2Write_13cyc_2V0], (instrs FDIVv8f16)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[N2Write_15cyc_2V0], (instrs FDIVv2f64)>; + +// ASIMD FP max/min, reduce, F32 and D-form F16 +def : InstRW<[N2Write_4cyc_1V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>; + +// ASIMD FP max/min, reduce, Q-form F16 +def : InstRW<[N2Write_6cyc_2V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>; + +// ASIMD FP multiply +def : InstRW<[N2Write_3cyc_1V], (instregex "^FMULv", "^FMULXv")>; + +// ASIMD FP multiply accumulate +def : InstRW<[N2Write_4cyc_1V], (instregex "^FMLAv", "^FMLSv")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[N2Write_5cyc_1V], (instregex "^FMLALv", "^FMLSLv")>; + +// ASIMD FP round, D-form F32 and Q-form F64 +def : InstRW<[N2Write_3cyc_1V0], + (instregex "^FRINT[AIMNPXZ]v2f(32|64)$", + "^FRINT(32|64)[XZ]v2f(32|64)$")>; + +// ASIMD FP round, D-form F16 and Q-form F32 +def : InstRW<[N2Write_4cyc_2V0], + (instregex "^FRINT[AIMNPXZ]v4f(16|32)$", + "^FRINT(32|64)[XZ]v4f32$")>; + + +// ASIMD FP round, Q-form F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>; + +// ASIMD FP square root, D-form, F16 +def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTv4f16)>; + +// ASIMD FP square root, D-form, F32 +def : InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv2f32)>; + +// ASIMD FP square root, Q-form, F16 +def : InstRW<[N2Write_13cyc_2V0], (instrs FSQRTv8f16)>; + +// ASIMD FP square root, Q-form, F32 +def :
InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv4f32)>; + +// ASIMD FP square root, Q-form, F64 +def : InstRW<[N2Write_16cyc_2V0], (instrs FSQRTv2f64)>; + +// ASIMD BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// ASIMD convert, F32 to BF16 +def : InstRW<[N2Write_4cyc_1V0], (instrs BFCVTN, BFCVTN2)>; + +// ASIMD dot product +def : InstRW<[N2Write_4cyc_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>; + +// ASIMD matrix multiply accumulate +def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA)>; + +// ASIMD multiply accumulate long +def : InstRW<[N2Write_4cyc_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, + BFMLALTIdx)>; + +// Scalar convert, F32 to BF16 +def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT)>; + +// ASIMD miscellaneous instructions +// ----------------------------------------------------------------------------- + +// ASIMD bit reverse +// ASIMD bitwise insert +// ASIMD count +// ASIMD duplicate, element +// ASIMD extract +// ASIMD extract narrow +// ASIMD insert, element to element +// ASIMD move, FP immed +// ASIMD move, integer immed +// ASIMD reverse +// ASIMD table lookup, 1 or 2 table regs +// ASIMD table lookup extension, 1 table reg +// ASIMD transfer, element to gen reg +// ASIMD transpose +// ASIMD unzip/zip +// Handled by SchedAlias<WriteV[dq], ...> + +// ASIMD duplicate, gen reg +def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUPv.+gpr")>; + +// ASIMD extract narrow, saturating +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTNv", "^SQXTUNv")>; + +// ASIMD reciprocal and square root estimate, D-form U32 +def : InstRW<[N2Write_3cyc_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>; + +// ASIMD reciprocal and square root estimate, Q-form U32 +def : InstRW<[N2Write_4cyc_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>; + +// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms +def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPEv1f16, FRECPEv1i32, + FRECPEv1i64, FRECPEv2f32, + FRSQRTEv1f16, 
FRSQRTEv1i32, + FRSQRTEv1i64, FRSQRTEv2f32)>; + +// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32 +def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPEv4f16, FRECPEv4f32, + FRSQRTEv4f16, FRSQRTEv4f32)>; + +// ASIMD reciprocal and square root estimate, Q-form F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>; + +// ASIMD reciprocal exponent +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRECPXv")>; + +// ASIMD reciprocal step +def : InstRW<[N2Write_4cyc_1V], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD table lookup, 3 table regs +def : InstRW<[N2Write_4cyc_2V], (instrs TBLv8i8Three, TBLv16i8Three)>; + +// ASIMD table lookup, 4 table regs +def : InstRW<[N2Write_4cyc_4V], (instrs TBLv8i8Four, TBLv16i8Four)>; + +// ASIMD table lookup extension, 2 table reg +def : InstRW<[N2Write_4cyc_2V], (instrs TBXv8i8Two, TBXv16i8Two)>; + +// ASIMD table lookup extension, 3 table reg +def : InstRW<[N2Write_6cyc_4V], (instrs TBXv8i8Three, TBXv16i8Three)>; + +// ASIMD table lookup extension, 4 table reg +def : InstRW<[N2Write_6cyc_8V], (instrs TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, gen reg to element +def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^INSv")>; + +// ASIMD load instructions +// ----------------------------------------------------------------------------- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_1L, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_1L, WriteAdr], + (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_2L, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 
element, multiple, 2 reg, Q-form +def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_2L, WriteAdr], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_3L, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_3L, WriteAdr], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_7cyc_4L, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_7cyc_4L, WriteAdr], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; + +// ASIMD 
load, 2 element, multiple, Q-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, B/H/S +def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, 
D-form, D +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_9cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[N2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_4L_4V, WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store instructions +// ----------------------------------------------------------------------------- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, 
WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_3L01_3V, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_4L01_4V, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)$")>; +def : 
InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; + +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_4cyc_2L01_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[N2Write_5cyc_2L01_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; + +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, B/H/S +def : InstRW<[N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[N2Write_7cyc_6L01_6V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[N2Write_5cyc_4L01_4V, WriteAdr], (instregex 
"ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H/S +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>; + +// ASIMD store, 4 element, one lane, D +def : InstRW<[N2Write_4cyc_3L01_3V], (instregex "ST4i(64)$")>; +def : InstRW<[N2Write_4cyc_3L01_3V, WriteAdr], (instregex "ST4i(64)_POST$")>; + +// Cryptography extensions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[N2Write_2cyc_1V0], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 hash acceleration op +// Crypto SHA1 schedule acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA1(H|SU0|SU1)")>; + +// Crypto SHA1 hash acceleration ops +// Crypto SHA256 hash acceleration ops +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>; + +// Crypto SHA256 schedule acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA256SU[01]")>; + +// Crypto SHA512 hash acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>; + +// Crypto SHA3 ops +def : InstRW<[N2Write_2cyc_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; + +// Crypto SM3 ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$", + "^SM3TT[12][AB]$")>; + +// Crypto SM4 ops +def : InstRW<[N2Write_4cyc_1V0], (instrs SM4E, SM4ENCKEY)>; + +// CRC +// ----------------------------------------------------------------------------- + +def : InstRW<[N2Write_2cyc_1M0], (instregex "^CRC32")>; + +// SVE Predicate instructions +// ----------------------------------------------------------------------------- + +// Loop control, based on predicate +def : InstRW<[N2Write_2cyc_1M], (instrs BRKA_PPmP, BRKA_PPzP, + BRKB_PPmP, BRKB_PPzP)>; + +// Loop control, based on predicate and flag setting 
+def : InstRW<[N2Write_3cyc_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>; + +// Loop control, propagating +def : InstRW<[N2Write_2cyc_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; + +// Loop control, propagating and flag setting +def : InstRW<[N2Write_3cyc_1M0_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP, + BRKPBS_PPzPP)>; + +// Loop control, based on GPR +def : InstRW<[N2Write_3cyc_1M], + (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]$")>; + +def : InstRW<[N2Write_3cyc_1M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]$")>; + +// Loop terminate +def : InstRW<[N2Write_1cyc_1M], (instregex "^CTERM(EQ|NE)_(WW|XX)$")>; + +// Predicate counting scalar +def : InstRW<[N2Write_2cyc_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; +def : InstRW<[N2Write_2cyc_1M], + (instregex "^(CNT|DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI$", + "^SQ(DEC|INC)[BHWD]_XPiWdI$", + "^(UQDEC|UQINC)[BHWD]_WPiI$")>; + +// Predicate counting scalar, active predicate +def : InstRW<[N2Write_2cyc_1M], + (instregex "^CNTP_XPP_[BHSD]$", + "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]$", + "^(UQDEC|UQINC)P_WP_[BHSD]$", + "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]$")>; + +// Predicate counting vector, active predicate +def : InstRW<[N2Write_7cyc_1M_1M0_1V], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]$")>; + +// Predicate logical +def : InstRW<[N2Write_1cyc_1M0], + (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP$")>; + +// Predicate logical, flag setting +def : InstRW<[N2Write_2cyc_1M0_1M], + (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP$")>; + +// Predicate reverse +def : InstRW<[N2Write_2cyc_1M], (instregex "^REV_PP_[BHSD]$")>; + +// Predicate select +def : InstRW<[N2Write_1cyc_1M0], (instrs SEL_PPPP)>; + +// Predicate set +def : InstRW<[N2Write_2cyc_1M], (instregex "^PFALSE$", "^PTRUE_[BHSD]$")>; + +// Predicate set/initialize, set flags +def : InstRW<[N2Write_3cyc_1M], (instregex "^PTRUES_[BHSD]$")>; + +// Predicate find first/next +def : InstRW<[N2Write_3cyc_1M], (instregex 
"^PFIRST_B$", "^PNEXT_[BHSD]$")>; + +// Predicate test +def : InstRW<[N2Write_1cyc_1M], (instrs PTEST_PP)>; + +// Predicate transpose +def : InstRW<[N2Write_2cyc_1M], (instregex "^TRN[12]_PPP_[BHSDQ]$")>; + +// Predicate unpack and widen +def : InstRW<[N2Write_2cyc_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>; + +// Predicate zip/unzip +def : InstRW<[N2Write_2cyc_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]$")>; + +// SVE integer instructions +// ----------------------------------------------------------------------------- + +// Arithmetic, absolute diff +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]$")>; + +// Arithmetic, absolute diff accum +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>; + +// Arithmetic, absolute diff accum long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>; + +// Arithmetic, absolute diff long +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>; + +// Arithmetic, basic +def : InstRW<[N2Write_2cyc_1V], + (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]$", + "^(ADD|SUB)_ZZZ_[BHSD]$", + "^(ADD|SUB|SUBR)_ZI_[BHSD]$", + "^ADR_[SU]XTW_ZZZ_D_[0123]$", + "^ADR_LSL_ZZZ_[SD]_[0123]$", + "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]$", + "^SADDLBT_ZZZ_[HSD]$", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]$", + "^SSUBL(BT|TB)_ZZZ_[HSD]$")>; + +// Arithmetic, complex +def : InstRW<[N2Write_2cyc_1V], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]$", + "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]$", + "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]$", + "^[SU]Q(ADD|SUB)_ZI_[BHSD]$", + "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]$", + "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]$")>; + +// Arithmetic, large integer +def : InstRW<[N2Write_2cyc_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>; + +// Arithmetic, pairwise add +def : InstRW<[N2Write_2cyc_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>; + +// Arithmetic, pairwise add and accum long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>; + +// Arithmetic, shift +def : 
InstRW<[N2Write_2cyc_1V1], + (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]$", + "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]$", + "^(ASR|LSL|LSR)_ZPmI_[BHSD]$", + "^(ASR|LSL|LSR)_ZPmZ_[BHSD]$", + "^(ASR|LSL|LSR)_ZZI_[BHSD]$", + "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]$")>; + +// Arithmetic, shift and accumulate +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>; + +// Arithmetic, shift by immediate +// Arithmetic, shift by immediate and insert +def : InstRW<[N2Write_2cyc_1V1], + (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]$")>; + +// Arithmetic, shift complex +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]$", + "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]$", + "^(SQSHL|SQSHLU|UQSHL)_ZPmI_[BHSD]$", + "^SQSHRU?N[BT]_ZZI_[BHS]$", + "^UQR?SHRN[BT]_ZZI_[BHS]$")>; + +// Arithmetic, shift right for divide +def : InstRW<[N2Write_4cyc_1V1], (instregex "^ASRD_ZPmI_[BHSD]$")>; + +// Arithmetic, shift rounding +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SRSHL|SRSHLR|URSHL|URSHLR)_ZPmZ_[BHSD]$", + "^[SU]RSHR_ZPmI_[BHSD]$")>; + +// Bit manipulation +def : InstRW<[N2Write_6cyc_2V1], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]$")>; + +// Bitwise select +def : InstRW<[N2Write_2cyc_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ$")>; + +// Count/reverse bits +def : InstRW<[N2Write_2cyc_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]$")>; + +// Broadcast logical bitmask immediate to vector +def : InstRW<[N2Write_2cyc_1V], (instrs DUPM_ZI)>; + +// Compare and set flags +def : InstRW<[N2Write_4cyc_1V0_1M], + (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]$", + "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]$")>; + +// Complex add +def : InstRW<[N2Write_2cyc_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>; + +// Complex dot product 8-bit element +def : InstRW<[N2Write_3cyc_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; + +// Complex dot product 16-bit element +def : 
InstRW<[N2Write_4cyc_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; + +// Complex multiply-add B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^CMLA_ZZZ_[BHS]$", + "^CMLA_ZZZI_[HS]$")>; + +// Complex multiply-add D element size +def : InstRW<[N2Write_5cyc_2V0], (instrs CMLA_ZZZ_D)>; + +// Conditional extract operations, scalar form +def : InstRW<[N2Write_8cyc_1M0_1V1_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>; + +// Conditional extract operations, SIMD&FP scalar and vector forms +def : InstRW<[N2Write_3cyc_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]$", + "^COMPACT_ZPZ_[SD]$", + "^SPLICE_ZPZZ?_[BHSD]$")>; + +// Convert to floating point, 64b to float or convert to double +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]$")>; + +// Convert to floating point, 64b to half +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_DtoH$")>; + +// Convert to floating point, 32b to single or half +def : InstRW<[N2Write_4cyc_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]$")>; + +// Convert to floating point, 32b to double +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_StoD$")>; + +// Convert to floating point, 16b to half +def : InstRW<[N2Write_6cyc_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH$")>; + +// Copy, scalar +def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]$")>; + +// Copy, scalar SIMD&FP or imm +def : InstRW<[N2Write_2cyc_1V], (instregex "^CPY_ZPm[IV]_[BHSD]$", + "^CPY_ZPzI_[BHSD]$")>; + +// Divides, 32 bit +def : InstRW<[N2Write_12cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_S$")>; + +// Divides, 64 bit +def : InstRW<[N2Write_20cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_D$")>; + +// Dot product, 8 bit +def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]DOT_ZZZI?_S$")>; + +// Dot product, 8 bit, using signed and unsigned integers +def : InstRW<[N2Write_3cyc_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; + +// Dot product, 16 bit +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; + +// Duplicate, immediate 
and indexed form +def : InstRW<[N2Write_2cyc_1V], (instregex "^DUP_ZI_[BHSD]$", + "^DUP_ZZI_[BHSDQ]$")>; + +// Duplicate, scalar form +def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUP_ZR_[BHSD]$")>; + +// Extend, sign or zero +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]$", + "^[SU]XTH_ZPmZ_[SD]$", + "^[SU]XTW_ZPmZ_[D]$")>; + +// Extract +def : InstRW<[N2Write_2cyc_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; + +// Extract narrow saturating +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", + "^SQXTUN[BT]_ZZ_[BHS]$")>; + +// Extract/insert operation, SIMD and FP scalar form +def : InstRW<[N2Write_3cyc_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$", + "^INSR_ZV_[BHSD]$")>; + +// Extract/insert operation, scalar +def : InstRW<[N2Write_5cyc_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]$", + "^INSR_ZR_[BHSD]$")>; + +// Histogram operations +def : InstRW<[N2Write_2cyc_1V], (instregex "^HISTCNT_ZPzZZ_[SD]$", + "^HISTSEG_ZZZ$")>; + +// Horizontal operations, B, H, S form, immediate operands only +def : InstRW<[N2Write_4cyc_1V0], (instregex "^INDEX_II_[BHS]$")>; + +// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar +// operands only / immediate, scalar operands +def : InstRW<[N2Write_7cyc_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]$")>; + +// Horizontal operations, D form, immediate operands only +def : InstRW<[N2Write_5cyc_2V0], (instrs INDEX_II_D)>; + +// Horizontal operations, D form, scalar, immediate operands)/ scalar operands +// only / immediate, scalar operands +def : InstRW<[N2Write_8cyc_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D$")>; + +// Logical +def : InstRW<[N2Write_2cyc_1V], + (instregex "^(AND|EOR|ORR)_ZI$", + "^(AND|BIC|EOR|EOR(BT|TB)?|ORR)_ZZZ$", + "^EOR(BT|TB)_ZZZ_[BHSD]$", + "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]$")>; + +// Max/min, basic and pairwise +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]$", + "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]$")>; + +// Matching operations +def : 
InstRW<[N2Write_2cyc_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]$")>; + +// Matrix multiply-accumulate +def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; + +// Move prefix +def : InstRW<[N2Write_2cyc_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", + "^MOVPRFX_ZZ$")>; + +// Multiply, B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]$", + "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]$")>; + +// Multiply, D element size +def : InstRW<[N2Write_5cyc_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D$", + "^[SU]MULH_(ZPmZ|ZZZ)_D$")>; + +// Multiply long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$", + "^[SU]MULL[BT]_ZZZ_[HSD]$")>; + +// Multiply accumulate, B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$", + "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]$")>; + +// Multiply accumulate, D element size +def : InstRW<[N2Write_5cyc_2V0], (instregex "^ML[AS]_ZZZI_D$", + "^(ML[AS]|MAD|MSB)_ZPmZZ_D$")>; + +// Multiply accumulate long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$", + "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>; + +// Multiply accumulate saturating doubling long regular +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$", + "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>; + +// Multiply saturating doubling high, B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$", + "^SQDMULH_ZZZI_[HS]$")>; + +// Multiply saturating doubling high, D element size +def : InstRW<[N2Write_5cyc_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>; + +// Multiply saturating doubling long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$", + "^SQDMULL[BT]_ZZZI_[SD]$")>; + +// Multiply saturating rounding doubling regular/complex accumulate, B, H, S +// element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$", + "^SQRDCMLAH_ZZZ_[BHS]$", + "^SQRDML[AS]H_ZZZI_[HS]$", + "^SQRDCMLAH_ZZZI_[HS]$")>; + 
+// Multiply saturating rounding doubling regular/complex accumulate, D element +// size +def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$", + "^SQRDCMLAH_ZZZ_D$")>; + +// Multiply saturating rounding doubling regular/complex, B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]$", + "^SQRDMULH_ZZZI_[HS]$")>; + +// Multiply saturating rounding doubling regular/complex, D element size +def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDMULH_ZZZI?_D$")>; + +// Multiply/multiply long, (8x8) polynomial +def : InstRW<[N2Write_2cyc_1V0], (instregex "^PMUL_ZZZ_B$", + "^PMULL[BT]_ZZZ_[HDQ]$")>; + +// Predicate counting vector +def : InstRW<[N2Write_2cyc_1V0], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI$")>; + +// Reciprocal estimate +def : InstRW<[N2Write_4cyc_2V0], (instrs URECPE_ZPmZ_S, URSQRTE_ZPmZ_S)>; + +// Reduction, arithmetic, B form +def : InstRW<[N2Write_11cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; + +// Reduction, arithmetic, H form +def : InstRW<[N2Write_9cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; + +// Reduction, arithmetic, S form +def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; + +// Reduction, arithmetic, D form +def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; + +// Reduction, logical +def : InstRW<[N2Write_6cyc_1V_1V1], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]$")>; + +// Reverse, vector +def : InstRW<[N2Write_2cyc_1V], (instregex "^REV_ZZ_[BHSD]$", + "^REVB_ZPmZ_[HSD]$", + "^REVH_ZPmZ_[SD]$", + "^REVW_ZPmZ_D$")>; + +// Select, vector form +def : InstRW<[N2Write_2cyc_1V], (instregex "^SEL_ZPZZ_[BHSD]$")>; + +// Table lookup +def : InstRW<[N2Write_2cyc_1V], (instregex "^TBL_ZZZZ?_[BHSD]$")>; + +// Table lookup extension +def : InstRW<[N2Write_2cyc_1V], (instregex "^TBX_ZZZ_[BHSD]$")>; + +// Transpose, vector form +def : InstRW<[N2Write_2cyc_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]$")>; + +// Unpack and extend +def 
: InstRW<[N2Write_2cyc_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]$")>; + +// Zip/unzip +def : InstRW<[N2Write_2cyc_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]$")>; + +// SVE floating-point instructions +// ----------------------------------------------------------------------------- + +// Floating point absolute value/difference +def : InstRW<[N2Write_2cyc_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]$")>; + +// Floating point arithmetic +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]$", + "^FADDP_ZPmZZ_[HSD]$", + "^FNEG_ZPmZ_[HSD]$", + "^FSUBR_ZPm[IZ]_[HSD]$")>; + +// Floating point associative add, F16 +def : InstRW<[N2Write_10cyc_1V1], (instrs FADDA_VPZ_H)>; + +// Floating point associative add, F32 +def : InstRW<[N2Write_6cyc_1V1], (instrs FADDA_VPZ_S)>; + +// Floating point associative add, F64 +def : InstRW<[N2Write_4cyc_1V], (instrs FADDA_VPZ_D)>; + +// Floating point compare +def : InstRW<[N2Write_2cyc_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]$", + "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]$", + "^FCM(LE|LT)_PPzZ0_[HSD]$", + "^FCMUO_PPzZZ_[HSD]$")>; + +// Floating point complex add +def : InstRW<[N2Write_3cyc_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>; + +// Floating point complex multiply add +def : InstRW<[N2Write_5cyc_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$", + "^FCMLA_ZZZI_[HS]$")>; + +// Floating point convert, long or narrow (F16 to F32 or F32 to F16) +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)$", + "^FCVTLT_ZPmZ_HtoS$", + "^FCVTNT_ZPmZ_StoH$")>; + +// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 +// or F64 to F16) +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)$", + "^FCVTLT_ZPmZ_StoD$", + "^FCVTNT_ZPmZ_DtoS$")>; + +// Floating point convert, round to odd +def : InstRW<[N2Write_3cyc_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>; + +// Floating point base2 log, F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FLOGB_ZPmZ_H)>; + +// Floating point base2 log, F32 
+def : InstRW<[N2Write_4cyc_2V0], (instrs FLOGB_ZPmZ_S)>; + +// Floating point base2 log, F64 +def : InstRW<[N2Write_3cyc_1V0], (instrs FLOGB_ZPmZ_D)>; + +// Floating point convert to integer, F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH$")>; + +// Floating point convert to integer, F32 +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)$")>; + +// Floating point convert to integer, F64 +def : InstRW<[N2Write_3cyc_1V0], + (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)$")>; + +// Floating point copy +def : InstRW<[N2Write_2cyc_1V], (instregex "^FCPY_ZPmI_[HSD]$", + "^FDUP_ZI_[HSD]$")>; + +// Floating point divide, F16 +def : InstRW<[N2Write_13cyc_1V0], (instregex "^FDIVR?_ZPmZ_H$")>; + +// Floating point divide, F32 +def : InstRW<[N2Write_10cyc_1V0], (instregex "^FDIVR?_ZPmZ_S$")>; + +// Floating point divide, F64 +def : InstRW<[N2Write_15cyc_1V0], (instregex "^FDIVR?_ZPmZ_D$")>; + +// Floating point min/max pairwise +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]$")>; + +// Floating point min/max +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]$")>; + +// Floating point multiply +def : InstRW<[N2Write_3cyc_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]$", + "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]$")>; + +// Floating point multiply accumulate +def : InstRW<[N2Write_4cyc_1V], + (instregex "^FML[AS]_(ZPmZZ|ZZZI)_[HSD]$", + "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_ZPmZZ_[HSD]$")>; + +// Floating point multiply add/sub accumulate long +def : InstRW<[N2Write_4cyc_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>; + +// Floating point reciprocal estimate, F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPE_ZZ_H, FRECPX_ZPmZ_H, + FRSQRTE_ZZ_H)>; + +// Floating point reciprocal estimate, F32 +def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPE_ZZ_S, FRECPX_ZPmZ_S, + FRSQRTE_ZZ_S)>; + +// Floating point reciprocal estimate, F64 +def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPE_ZZ_D, FRECPX_ZPmZ_D, + 
FRSQRTE_ZZ_D)>; + +// Floating point reciprocal step +def : InstRW<[N2Write_4cyc_1V0], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; + +// Floating point reduction, F16 +def : InstRW<[N2Write_6cyc_2V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H$")>; + +// Floating point reduction, F32 +def : InstRW<[N2Write_4cyc_1V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S$")>; + +// Floating point reduction, F64 +def : InstRW<[N2Write_2cyc_1V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D$")>; + +// Floating point round to integral, F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H$")>; + +// Floating point round to integral, F32 +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S$")>; + +// Floating point round to integral, F64 +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D$")>; + +// Floating point square root, F16 +def : InstRW<[N2Write_13cyc_1V0], (instrs FSQRT_ZPmZ_H)>; + +// Floating point square root, F32 +def : InstRW<[N2Write_10cyc_1V0], (instrs FSQRT_ZPmZ_S)>; + +// Floating point square root, F64 +def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRT_ZPmZ_D)>; + +// Floating point trigonometric exponentiation +def : InstRW<[N2Write_3cyc_1V1], (instregex "^FEXPA_ZZ_[HSD]$")>; + +// Floating point trigonometric multiply add +def : InstRW<[N2Write_4cyc_1V], (instregex "^FTMAD_ZZI_[HSD]$")>; + +// Floating point trigonometric, miscellaneous +def : InstRW<[N2Write_3cyc_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>; + +// SVE BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// Convert, F32 to BF16 +def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; + +// Dot product +def : InstRW<[N2Write_4cyc_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; + +// Matrix multiply accumulate +def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA_ZZZ)>; + +// Multiply accumulate long +def : InstRW<[N2Write_4cyc_1V], (instregex 
"^BFMLAL[BT]_ZZ[ZI]$")>; + +// SVE Load instructions +// ----------------------------------------------------------------------------- + +// Load vector +def : InstRW<[N2Write_6cyc_1L], (instrs LDR_ZXI)>; + +// Load predicate +def : InstRW<[N2Write_6cyc_1L_1M], (instrs LDR_PXI)>; + +// Contiguous load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM_REAL$", + "^LD1S?B_[HSD]_IMM_REAL$", + "^LD1S?H_[SD]_IMM_REAL$", + "^LD1S?W_D_IMM_REAL$" )>; +// Contiguous load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L01], (instregex "^LD1[BHWD]$", + "^LD1S?B_[HSD]$", + "^LD1S?H_[SD]$", + "^LD1S?W_D$" )>; + +// Contiguous load broadcast, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1R[BHWD]_IMM$", + "^LD1RSW_IMM$", + "^LD1RS?B_[HSD]_IMM$", + "^LD1RS?H_[SD]_IMM$", + "^LD1RS?W_D_IMM$", + "^LD1RQ_[BHWD]_IMM$")>; + +// Contiguous load broadcast, scalar + scalar +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1RQ_[BHWD]$")>; + +// Non temporal load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNT1[BHWD]_ZRI$")>; + +// Non temporal load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDNT1[BHWD]_ZRR$")>; + +// Non temporal gather load, vector + scalar 32-bit element size +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LDNT1[BHW]_ZZR_S_REAL$", + "^LDNT1S[BH]_ZZR_S_REAL$")>; + +// Non temporal gather load, vector + scalar 64-bit element size +def : InstRW<[N2Write_10cyc_2L_2V1], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>; +def : InstRW<[N2Write_10cyc_2L_2V1], (instrs LDNT1D_ZZR_D_REAL)>; + +// Contiguous first faulting load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDFF1[BHWD]_REAL$", + "^LDFF1S?B_[HSD]_REAL$", + "^LDFF1S?H_[SD]_REAL$", + "^LDFF1S?W_D_REAL$")>; + +// Contiguous non faulting load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNF1[BHWD]_IMM_REAL$", + "^LDNF1S?B_[HSD]_IMM_REAL$", + "^LDNF1S?H_[SD]_IMM_REAL$", + "^LDNF1S?W_D_IMM_REAL$")>; + +// Contiguous Load 
two structures to two vectors, scalar + imm +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "^LD2[BHWD]_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + scalar +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD2[BHWD]$")>; + +// Contiguous Load three structures to three vectors, scalar + imm +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD3[BHWD]_IMM$")>; + +// Contiguous Load three structures to three vectors, scalar + scalar +def : InstRW<[N2Write_10cyc_1V_1L_1S], (instregex "^LD3[BHWD]$")>; + +// Contiguous Load four structures to four vectors, scalar + imm +def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^LD4[BHWD]_IMM$")>; + +// Contiguous Load four structures to four vectors, scalar + scalar +def : InstRW<[N2Write_10cyc_2L_2V_2S], (instregex "^LD4[BHWD]$")>; + +// Gather load, vector + imm, 32-bit element size +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$", + "^GLD(FF)?1W_IMM_REAL$")>; + +// Gather load, vector + imm, 64-bit element size +def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$", + "^GLD(FF)?1D_IMM_REAL$")>; + +// Gather load, 64-bit element size +def : InstRW<[N2Write_9cyc_2L_2V], + (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW_(SCALED_)?REAL$", + "^GLD(FF)?1S?[BHW]_D_(SCALED_)?REAL$", + "^GLD(FF)?1D_[SU]XTW_(SCALED_)?REAL$", + "^GLD(FF)?1D_(SCALED_)?REAL$")>; + +// Gather load, 32-bit scaled offset +def : InstRW<[N2Write_10cyc_2L_2V], + (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED_REAL$", + "^GLD(FF)?1W_[SU]XTW_SCALED_REAL")>; + +// Gather load, 32-bit unpacked unscaled offset +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$", + "^GLD(FF)?1W_[SU]XTW_REAL$")>; + +// SVE Store instructions +// ----------------------------------------------------------------------------- + +// Store from predicate reg +def : InstRW<[N2Write_1cyc_1L01], (instrs STR_PXI)>; + +// Store from vector reg +def : InstRW<[N2Write_2cyc_1L01_1V], (instrs STR_ZXI)>; + +// 
Contiguous store, scalar + imm +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BHWD]_IMM$", + "^ST1B_[HSD]_IMM$", + "^ST1H_[SD]_IMM$", + "^ST1W_D_IMM$")>; + +// Contiguous store, scalar + scalar +def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instregex "^ST1H(_[SD])?$")>; +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BWD]$", + "^ST1B_[HSD]$", + "^ST1W_D$")>; + +// Contiguous store two structures from two vectors, scalar + imm +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BHWD]_IMM$")>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[N2Write_4cyc_1L01_1S_1V], (instrs ST2H)>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BWD]$")>; + +// Contiguous store three structures from three vectors, scalar + imm +def : InstRW<[N2Write_7cyc_5L01_5V], (instregex "^ST3[BHWD]_IMM$")>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instrs ST3H)>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instregex "^ST3[BWD]$")>; + +// Contiguous store four structures from four vectors, scalar + imm +def : InstRW<[N2Write_11cyc_9L01_9V], (instregex "^ST4[BHWD]_IMM$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instrs ST4H)>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instregex "^ST4[BWD]$")>; + +// Non temporal store, scalar + imm +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; + +// Non temporal store, scalar + scalar +def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instrs STNT1H_ZRR)>; +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; + +// Scatter non temporal store, vector + scalar 32-bit element size +def : 
InstRW<[N2Write_4cyc_2L01_2V], (instregex "^STNT1[BHW]_ZZR_S")>; + +// Scatter non temporal store, vector + scalar 64-bit element size +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZZR_D")>; + +// Scatter store vector + imm 32-bit element size +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_IMM$", + "^SST1W_IMM$")>; + +// Scatter store vector + imm 64-bit element size +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_IMM$", + "^SST1D_IMM$")>; + +// Scatter store, 32-bit scaled offset +def : InstRW<[N2Write_4cyc_2L01_2V], + (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unpacked unscaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_[SU]XTW$", + "^SST1D_[SU]XTW$")>; + +// Scatter store, 32-bit unpacked scaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", + "^SST1D_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unscaled offset +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_[SU]XTW$", + "^SST1W_[SU]XTW$")>; + +// Scatter store, 64-bit scaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_SCALED$", + "^SST1D_SCALED$")>; + +// Scatter store, 64-bit unscaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D$", + "^SST1D$")>; + +// SVE Miscellaneous instructions +// ----------------------------------------------------------------------------- + +// Read first fault register, unpredicated +def : InstRW<[N2Write_2cyc_1M0], (instrs RDFFR_P_REAL)>; + +// Read first fault register, predicated +def : InstRW<[N2Write_3cyc_1M0_1M], (instrs RDFFR_PPz_REAL)>; + +// Read first fault register and set flags +def : InstRW<[N2Write_4cyc_2M0_2M], (instrs RDFFRS_PPz)>; + +// Set first fault register +// Write to first fault register +def : InstRW<[N2Write_2cyc_1M0], (instrs SETFFR, WRFFR)>; + +// Prefetch +def : InstRW<[N2Write_4cyc_1L], (instregex "^PRF[BHWD]")>; + +// SVE Cryptographic 
instructions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]_ZZZ_B$", + "^AESI?MC_ZZ_B$")>; + +// Crypto SHA3 ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^(BCAX|EOR3)_ZZZZ$", + "^RAX1_ZZZ_D$", + "^XAR_ZZZI_[BHSD]$")>; + +// Crypto SM4 ops +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>; + +} diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 6ecfc97a4273..9c1bf3231a55 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -26,7 +26,8 @@ def TSV110Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index ff34c0ce9a0c..8b380ae0e8f3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -27,7 +27,8 @@ def ThunderXT8XModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index ffa0a5e7d91a..cdafa33da054 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -27,7 +27,8 @@ def ThunderX2T99Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 46a1c217f984..5b1e9b5bcf23 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -25,7 +25,8 @@ def ThunderX3T110Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - PAUnsupported.F); + PAUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 41c7a8c5042f..274a025e82a0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -796,6 +796,50 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, return IC.replaceInstUsesWith(II, Extract); } +static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, + IntrinsicInst &II) { + // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar + // integer variant across a variety of micro-architectures. Replace scalar + // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. 
A simple + // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more + // depending on the micro-architecture, but has been observed as generally + // being faster, particularly when the CLAST[AB] op is a loop-carried + // dependency. + IRBuilder<> Builder(II.getContext()); + Builder.SetInsertPoint(&II); + Value *Pg = II.getArgOperand(0); + Value *Fallback = II.getArgOperand(1); + Value *Vec = II.getArgOperand(2); + Type *Ty = II.getType(); + + if (!Ty->isIntegerTy()) + return None; + + Type *FPTy; + switch (cast<IntegerType>(Ty)->getBitWidth()) { + default: + return None; + case 16: + FPTy = Builder.getHalfTy(); + break; + case 32: + FPTy = Builder.getFloatTy(); + break; + case 64: + FPTy = Builder.getDoubleTy(); + break; + } + + Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); + auto *FPVTy = VectorType::get( + FPTy, cast<VectorType>(Vec->getType())->getElementCount()); + Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); + auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, + {Pg, FPFallBack, FPVec}); + Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); + return IC.replaceInstUsesWith(II, FPIItoInt); +} + static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II) { LLVMContext &Ctx = II.getContext(); @@ -1294,6 +1338,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_lasta: case Intrinsic::aarch64_sve_lastb: return instCombineSVELast(IC, II); + case Intrinsic::aarch64_sve_clasta_n: + case Intrinsic::aarch64_sve_clastb_n: + return instCombineSVECondLast(IC, II); case Intrinsic::aarch64_sve_cntd: return instCombineSVECntElts(IC, II, 2); case Intrinsic::aarch64_sve_cntw: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d0aacb457a39..59ec91843266 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -334,8 +334,10 @@ public: return 2; } - bool emitGetActiveLaneMask() const { - return ST->hasSVE(); + PredicationStyle emitGetActiveLaneMask() const { + if (ST->hasSVE()) + return PredicationStyle::DataAndControlFlow; + return PredicationStyle::None; } bool supportsScalableVectors() const { return ST->hasSVE(); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 89e1d85a6085..aaef363e9b8d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" @@ -354,7 +355,9 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, "Return value without a vreg"); bool Success = true; - if (!VRegs.empty()) { + if (!FLI.CanLowerReturn) { + insertSRetStores(MIRBuilder, Val->getType(), VRegs, FLI.DemoteRegister); + } else if (!VRegs.empty()) { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -464,6 +467,18 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return Success; } +bool AArch64CallLowering::canLowerReturn(MachineFunction &MF, + CallingConv::ID CallConv, + SmallVectorImpl<BaseArgInfo> &Outs, + bool IsVarArg) const { + SmallVector<CCValAssign, 16> ArgLocs; + const auto &TLI = *getTLI<AArch64TargetLowering>(); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, + MF.getFunction().getContext()); + + return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv)); +} + /// Helper function to compute forwarded registers for 
musttail calls. Computes /// the forwarded registers, sets MBB liveness, and emits COPY instructions that /// can be used to save + restore registers later. @@ -533,6 +548,12 @@ bool AArch64CallLowering::lowerFormalArguments( SmallVector<ArgInfo, 8> SplitArgs; SmallVector<std::pair<Register, Register>> BoolArgs; + + // Insert the hidden sret parameter if the return value won't fit in the + // return registers. + if (!FLI.CanLowerReturn) + insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL); + unsigned i = 0; for (auto &Arg : F.args()) { if (DL.getTypeStoreSize(Arg.getType()).isZero()) @@ -1194,7 +1215,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (!Info.OrigRet.Ty->isVoidTy()) { + if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) { CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); CallReturnHandler Handler(MIRBuilder, MRI, MIB); bool UsingReturnedArg = @@ -1226,6 +1247,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, .addImm(Assigner.StackOffset) .addImm(CalleePopBytes); + if (!Info.CanLowerReturn) { + insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, + Info.DemoteRegister, Info.DemoteStackIndex); + } return true; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h index aafb1d19640a..cbdf77f69a63 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -35,6 +35,10 @@ public: ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI, Register SwiftErrorVReg) const override; + bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, + SmallVectorImpl<BaseArgInfo> &Outs, + bool IsVarArg) const override; + bool fallBackToDAGISel(const 
MachineFunction &MF) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 9a65687735fe..eb8d0552173d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1710,11 +1710,6 @@ bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { Register CondReg = I.getOperand(0).getReg(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); - if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) { - CondReg = CCMI->getOperand(1).getReg(); - CCMI = MRI.getVRegDef(CondReg); - } - // Try to select the G_BRCOND using whatever is feeding the condition if // possible. unsigned CCMIOpc = CCMI->getOpcode(); @@ -3346,12 +3341,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SELECT: { auto &Sel = cast<GSelect>(I); - if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { - LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty - << ", expected: " << LLT::scalar(1) << '\n'); - return false; - } - const Register CondReg = Sel.getCondReg(); const Register TReg = Sel.getTrueReg(); const Register FReg = Sel.getFalseReg(); @@ -4777,12 +4766,6 @@ static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, return false; MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. 
- Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } if (isa<GAnyCmp>(ValDef)) { CanNegate = true; MustBeFirst = false; @@ -4870,12 +4853,6 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec( auto &MRI = *MIB.getMRI(); MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. - Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { Register LHS = Cmp->getLHSReg(); Register RHS = Cmp->getRHSReg(); @@ -5026,31 +5003,17 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { // First, check if the condition is defined by a compare. MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); - while (CondDef) { - // We can only fold if all of the defs have one use. - Register CondDefReg = CondDef->getOperand(0).getReg(); - if (!MRI.hasOneNonDBGUse(CondDefReg)) { - // Unless it's another select. - for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { - if (CondDef == &UI) - continue; - if (UI.getOpcode() != TargetOpcode::G_SELECT) - return false; - } - } - - // We can skip over G_TRUNC since the condition is 1-bit. - // Truncating/extending can have no impact on the value. - unsigned Opc = CondDef->getOpcode(); - if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) - break; - - // Can't see past copies from physregs. - if (Opc == TargetOpcode::COPY && - Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) - return false; - CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); + // We can only fold if all of the defs have one use. + Register CondDefReg = CondDef->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDefReg)) { + // Unless it's another select. 
+ for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { + if (CondDef == &UI) + continue; + if (UI.getOpcode() != TargetOpcode::G_SELECT) + return false; + } } // Is the condition defined by a compare? diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 74ec9373ce9e..d3617b87a851 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -42,7 +42,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) : ST(&ST) { using namespace TargetOpcode; const LLT p0 = LLT::pointer(0, 64); - const LLT s1 = LLT::scalar(1); const LLT s8 = LLT::scalar(8); const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); @@ -80,7 +79,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT &MinFPScalar = HasFP16 ? s16 : s32; getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64}) + .legalFor({p0, s8, s16, s32, s64}) .legalFor(PackedVectorAllTypeList) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64) @@ -198,8 +197,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder( {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) - .legalFor({{s32, s1}, {s64, s1}}) + .legalFor({{s32, s32}, {s64, s32}}) .clampScalar(0, s32, s64) + .clampScalar(1, s32, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) @@ -241,7 +241,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_INSERT) .legalIf(all(typeInSet(0, {s32, s64, p0}), - typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0))) + typeInSet(1, {s8, s16, s32}), smallerThan(1, 0))) .widenScalarToNextPow2(0) .clampScalar(0, s32, s64) .widenScalarToNextPow2(1) @@ -260,8 +260,15 @@ 
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32) .maxScalarIf(typeInSet(1, {s128}), 0, s64); - getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) + + for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) { + auto &Actions = getActionDefinitionsBuilder(Op); + + if (Op == G_SEXTLOAD) + Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)); + + // Atomics have zero extending behavior. + Actions .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8}, @@ -278,6 +285,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .unsupportedIfMemSizeNotPow2() // Lower anything left over into G_*EXT and G_LOAD .lower(); + } auto IsPtrVecPred = [=](const LegalityQuery &Query) { const LLT &ValTy = Query.Types[0]; @@ -425,10 +433,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT &SrcTy = Query.Types[1]; - // Special case for s1. - if (SrcTy == s1) - return true; - // Make sure we fit in a register otherwise. Don't bother checking that // the source type is below 128 bits. We shouldn't be allowing anything // through which is wider than the destination in the first place. 
@@ -481,13 +485,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0); // Control-flow - getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32}); + getActionDefinitionsBuilder(G_BRCOND) + .legalFor({s32}) + .clampScalar(0, s32, s32); getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); getActionDefinitionsBuilder(G_SELECT) - .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) + .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) .widenScalarToNextPow2(0) .clampScalar(0, s32, s64) + .clampScalar(1, s32, s32) .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) .lowerIf(isVector(0)); @@ -500,7 +507,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); getActionDefinitionsBuilder(G_PTRTOINT) - .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) + .legalForCartesianProduct({s8, s16, s32, s64}, {p0}) .legalFor({{v2s64, v2p0}}) .maxScalar(0, s64) .widenScalarToNextPow2(0, /*Min*/ 8); @@ -517,7 +524,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // FIXME: This is wrong since G_BITCAST is not allowed to change the // number of bits but it's what the previous code described and fixing // it breaks tests. 
- .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, + .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, v2p0}); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 2901e5c0fe4d..bd0a497fa441 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -43,11 +43,9 @@ namespace { class AArch64MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; - const MCInstrInfo &MCII; public: - AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : Ctx(ctx), MCII(mcii) {} + AArch64MCCodeEmitter(const MCInstrInfo &, MCContext &ctx) : Ctx(ctx) {} AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete; void operator=(const AArch64MCCodeEmitter &) = delete; ~AArch64MCCodeEmitter() override = default; @@ -193,12 +191,6 @@ public: uint32_t encodeMatrixIndexGPR32(const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -618,9 +610,6 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - if (MI.getOpcode() == AArch64::TLSDESCCALL) { // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the // following (BLR) instruction. 
It doesn't emit any code itself so it @@ -674,7 +663,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( return EncodedValue; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenMCCodeEmitter.inc" MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 34e3b2cf58e4..f129bfe11e4d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -34,6 +34,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC #define GET_INSTRINFO_MC_HELPERS +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 049c49796dc6..7d1de3e53c0c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -33,6 +33,7 @@ class MCSubtargetInfo; class MCTargetOptions; class MCTargetStreamer; class Target; +class FeatureBitset; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 2744e81f99f1..cb36aa26e839 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -227,6 +227,40 @@ class sme_add_vector_to_tile_u64<bit V, string mnemonic> let Inst{2-0} = ZAda; } +class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty> + : Pseudo<(outs), + (ins i64imm:$tile, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + +def ADDHA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>; +def 
ADDVA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>; + +def : Pat<(int_aarch64_sme_addha + imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4i32 ZPR32:$zn)), + (ADDHA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>; +def : Pat<(int_aarch64_sme_addva + imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4i32 ZPR32:$zn)), + (ADDVA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>; + +let Predicates = [HasSMEI64] in { +def ADDHA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>; +def ADDVA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>; + +def : Pat<(int_aarch64_sme_addha + imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2i64 ZPR64:$zn)), + (ADDHA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>; +def : Pat<(int_aarch64_sme_addva + imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2i64 ZPR64:$zn)), + (ADDVA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>; +} + //===----------------------------------------------------------------------===// // SME Contiguous Loads //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3631536a32b9..7cdd4c4af95e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -650,11 +650,11 @@ multiclass sve_int_pfalse<bits<6> opc, string asm> { def : Pat<(nxv1i1 immAllZerosV), (!cast<Instruction>(NAME))>; } -class sve_int_ptest<bits<6> opc, string asm> +class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op> : I<(outs), (ins PPRAny:$Pg, PPR8:$Pn), asm, "\t$Pg, $Pn", "", - []>, Sched<[]> { + [(op (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn))]>, Sched<[]> { bits<4> Pg; bits<4> Pn; let Inst{31-24} = 0b00100101; @@ -1691,6 +1691,9 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op, !cast<Instruction>(NAME), PTRUE_S>; def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, 
nxv2i1, !cast<Instruction>(NAME), PTRUE_D>; + // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. + def : SVE_2_Op_AllActive_Pat<nxv1i1, op_nopred, nxv1i1, nxv1i1, + !cast<Instruction>(NAME), PTRUE_D>; } // An instance of sve_int_pred_log_and but uses op_nopred's first operand as the @@ -1706,6 +1709,9 @@ multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op, (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; + // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. + def : Pat<(nxv1i1 (op_nopred nxv1i1:$Op1, nxv1i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 71303611265c..cf8891cff1b3 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -343,7 +343,8 @@ struct SysAlias { : Name(N), Encoding(E), FeaturesRequired(F) {} bool haveFeatures(FeatureBitset ActiveFeatures) const { - return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + return ActiveFeatures[llvm::AArch64::FeatureAll] || + (FeaturesRequired & ActiveFeatures) == FeaturesRequired; } FeatureBitset getRequiredFeatures() const { return FeaturesRequired; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index c4680cbedadf..91dc611fb265 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -317,6 +317,9 @@ extern char &SIFormMemoryClausesID; void initializeSIPostRABundlerPass(PassRegistry&); extern char &SIPostRABundlerID; +void initializeGCNCreateVOPDPass(PassRegistry &); +extern char &GCNCreateVOPDID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char 
&AMDGPUUnifyDivergentExitNodesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 94d7844e8a32..a8108b1d637b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -626,13 +626,13 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { Constant *FoldedT = SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); - if (isa<ConstantExpr>(FoldedT)) + if (!FoldedT || isa<ConstantExpr>(FoldedT)) return false; Constant *FoldedF = SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); - if (isa<ConstantExpr>(FoldedF)) + if (!FoldedF || isa<ConstantExpr>(FoldedF)) return false; IRBuilder<> Builder(&BO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b00df27f5fd3..589992c7a7ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1883,20 +1883,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// Match an immediate (if Imm is true) or an SGPR (if Imm is false) +// offset. If Imm32Only is true, match only 32-bit immediate offsets +// available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { + if (Imm) + return false; if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode; - Imm = false; return true; } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode.getOperand(0); - Imm = false; return true; } } @@ -1908,9 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, int64_t ByteOffset = C->getSExtValue(); Optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); - if (EncodedOffset) { + if (EncodedOffset && Imm && !Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); - Imm = true; return true; } @@ -1919,7 +1922,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); - if (EncodedOffset) { + if (EncodedOffset && Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } @@ -1927,11 +1930,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) return false; - SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); - Offset = SDValue( - CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + if (!Imm) { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + return true; + } - return true; + return false; } SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { @@ -1959,8 +1965,12 @@ SDValue 
AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { Ops), 0); } +// Match a base and an immediate (if Imm is true) or an SGPR +// (if Imm is false) offset. If Imm32Only is true, match only 32-bit +// immediate offsets available on CI. bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { SDLoc SL(Addr); // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -1977,41 +1987,34 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, assert(N0 && N1 && isa<ConstantSDNode>(N1)); } if (N0 && N1) { - if (SelectSMRDOffset(N1, Offset, Imm)) { + if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) { SBase = Expand32BitAddress(N0); return true; } } + return false; } + if (!Imm) + return false; SBase = Expand32BitAddress(Addr); Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); - Imm = true; return true; } bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - - bool Imm = false; - if (!SelectSMRD(Addr, SBase, Offset, Imm)) - return false; - - return !Imm && isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && - !isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ false); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 
862be9dc5568..7894b8eb5b67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -193,11 +193,11 @@ private: bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, - bool &Imm) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm, + bool Imm32Only) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, - bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm, + bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ef7929012597..bf520a560404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4803,6 +4803,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: return AtomicExpansionKind::CmpXChg; default: return AtomicExpansionKind::None; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3f242fdb6d8e..70fae9d784a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1180,7 +1180,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); if (Arg) { - const int64_t Value = Arg.getValue().Value.getSExtValue(); + 
const int64_t Value = Arg.value().Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); @@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; + return Register(); if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); @@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, - // then we can select all ptr + 32-bit offsets not just immediate offsets. - if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + // then we can select all ptr + 32-bit offsets. + if (AddrInfo.empty()) return None; const GEPInfo &GEPInfo = AddrInfo[0]; + Register PtrReg = GEPInfo.SgprParts[0]; + // SGPR offset is unsigned. - if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) - return None; + if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) && + GEPInfo.Imm != 0) { + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. 
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } - // If we make it this far we have a load with an 32-bit immediate offset. - // It is OK to select this using a sgpr offset, because we have already - // failed trying to select this load into one of the _IMM variants since - // the _IMM Patterns are considered before the _SGPR patterns. - Register PtrReg = GEPInfo.SgprParts[0]; - Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } - }}; + if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) { + if (Register OffsetReg = + matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) { + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } + } + + return None; } std::pair<Register, int> @@ -4231,7 +4240,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }, [=](MachineInstrBuilder &MIB) { // vaddr if (FI) - MIB.addFrameIndex(FI.getValue()); + MIB.addFrameIndex(FI.value()); else MIB.addReg(VAddr); }, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 31012915457b..26e6b9a10688 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -542,63 +542,37 @@ def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), } } // End foreach as -// TODO: Add GISelPredicateCode for the ret and noret PatFrags once -// GlobalISelEmitter allows pattern matches 
where src and dst def count -// mismatch. - -multiclass ret_noret_op { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - def "_ret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } - - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - def "_noret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } +multiclass noret_op { + let HasNoUse = true in + def "_noret" : PatFrag<(ops node:$ptr, node:$data), + (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; } -defm int_amdgcn_flat_atomic_fadd : ret_noret_op; -defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_flat_atomic_fmin : ret_noret_op; -defm int_amdgcn_flat_atomic_fmax : ret_noret_op; -defm int_amdgcn_global_atomic_fadd : ret_noret_op; -defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_global_atomic_fmin : ret_noret_op; -defm int_amdgcn_global_atomic_fmax : ret_noret_op; -defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op; - -multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - defm "_noret" : binary_atomic_op<atomic_op, IsInt>; - } +defm int_amdgcn_flat_atomic_fadd : noret_op; +defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_flat_atomic_fmin : noret_op; +defm int_amdgcn_flat_atomic_fmax : noret_op; +defm int_amdgcn_global_atomic_fadd : noret_op; +defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_global_atomic_fmin : noret_op; +defm int_amdgcn_global_atomic_fmax : noret_op; +defm int_amdgcn_ds_fadd_v2bf16 : noret_op; - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - defm "_ret" : 
binary_atomic_op<atomic_op, IsInt>; - } +multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + let HasNoUse = true in + defm "_noret" : binary_atomic_op<atomic_op, IsInt>; } -multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> { - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - defm "_noret" : ternary_atomic_op<atomic_op>; - } - - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - defm "_ret" : ternary_atomic_op<atomic_op>; - } +multiclass noret_ternary_atomic_op<SDNode atomic_op> { + let HasNoUse = true in + defm "_noret" : ternary_atomic_op<atomic_op>; } multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> { foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { defm "_"#as : binary_atomic_op<atomic_op, IsInt>; - defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>; + defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>; } } } @@ -640,13 +614,15 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr), let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_local : noret_ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_local_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { -defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_region : noret_ternary_atomic_op<atomic_cmp_swap>; 
+defm atomic_cmp_swap_region_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ed6ddbf426fd..38e04dedd9fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -171,6 +171,10 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { } void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the test pass. + // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1b513c456307..745734aac2b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -131,8 +131,8 @@ public: bool IsAOneAddressSpace = isOneAddressSpace(A); bool IsBOneAddressSpace = isOneAddressSpace(B); - return AIO.getValue() >= BIO.getValue() && - (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); + return AIO.value() >= BIO.value() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 77816a783630..6bd906439ee8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -40,9 +40,9 @@ using namespace llvm; #include "AMDGPUGenSubtargetInfo.inc" #undef AMDGPUSubtarget -static cl::opt<bool> DisablePowerSched( - "amdgpu-disable-power-sched", - cl::desc("Disable scheduling to minimize mAI power bursts"), +static cl::opt<bool> 
EnablePowerSched( + "amdgpu-enable-power-sched", + cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false)); static cl::opt<bool> EnableVGPRIndexMode( @@ -916,7 +916,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAGInstrs) override { const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasMAIInsts() || DisablePowerSched) + if (!ST.hasMAIInsts()) return; DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); @@ -966,7 +966,8 @@ void GCNSubtarget::getPostRAMutations( std::unique_ptr<ScheduleDAGMutation> GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return std::make_unique<FillMFMAShadowMutation>(&InstrInfo); + return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) + : nullptr; } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1c6b9d35695a..971e44723758 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,11 +22,13 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "GCNVOPDUtils.h" #include "R600.h" #include "R600TargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -278,6 +280,12 @@ static cl::opt<bool> cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable GFX11+ VOPD +static cl::opt<bool> + EnableVOPD("amdgpu-enable-vopd", + cl::desc("Enable VOPD, dual issue of VALU in wave32"), + cl::init(true), cl::Hidden); + // Option is 
used in lit tests to prevent deadcoding of patterns inspected. static cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", @@ -383,6 +391,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeSIPostRABundlerPass(*PR); + initializeGCNCreateVOPDPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -920,6 +929,8 @@ public: DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation(createIGroupLPDAGMutation()); DAG->addMutation(createSchedBarrierDAGMutation()); + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + DAG->addMutation(createVOPDPairingMutation()); return DAG; } @@ -1399,6 +1410,8 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a087323e5de7..04dd3e938a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1412,10 +1412,12 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret") # !if(isIntr, "", "_" # vt.Size)); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) 
getVregSrcForVT<vt>.ret:$vdata_in, @@ -1428,6 +1430,7 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1439,10 +1442,12 @@ multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode + defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" + # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt.Size); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); @@ -1465,6 +1470,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> !if(!eq(vt, i32), sub0, sub0_sub1)), Addr64ResDag) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1495,13 +1501,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { foreach RtnMode = RtnModes in { - defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"), - OpPrefix, OpPrefix # "_" # RtnMode)); - defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), - "_RTN", ""); - defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret")); + + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); + let 
AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), @@ -1534,6 +1541,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1551,7 +1559,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">; -defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>; +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["ret"]>; defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">; @@ -1643,7 +1651,8 @@ let SubtargetPredicate = isGFX90APlus in { foreach RtnMode = ["ret", "noret"] in { -defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode); +defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap + # !if(!eq(RtnMode, "ret"), "", "_noret")); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 27b723875aa4..d8387bf6f1ae 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -950,10 +950,11 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -class 
DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) ->; +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -965,75 +966,88 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; } multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; } def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; def : DSAtomicRetPat<noRetInst, 
vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. -class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), + /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), + /* complexity */ 1>; } - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : 
DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 let SubtargetPredicate = isGFX11Plus in { // The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. -class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX11Plus @@ -1090,17 +1104,20 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp } // End SubtargetPredicate = isGFX11Plus let 
SubtargetPredicate = isGFX90APlus in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>; +def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; } let SubtargetPredicate = isGFX940Plus in { -def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>; +def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)), + (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) >; +let AddedComplexity = 1 in def : GCNPat < (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index cb2822818549..c634e15945ad 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1015,31 +1015,35 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt multiclass FlatAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size); + defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size); defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass FlatSignedAtomicPat <string inst, 
string node, ValueType vt, - ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + ValueType data_vt = vt, int complexity = 0, + bit isIntr = 0> { + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + let AddedComplexity = complexity in def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = !add(complexity, 1) in def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>; + defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>; } class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1260,17 +1264,16 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); - let AddedComplexity = 10 in { - defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>; - } + defm : FlatSignedAtomicPat <inst, node, vt, data_vt, /* complexity */ 10, isIntr>; - let AddedComplexity = 11 in { - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; - def : 
GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; - } + let AddedComplexity = 13 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; + + let AddedComplexity = 12 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt, diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp new file mode 100644 index 000000000000..83dc3bebf4d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -0,0 +1,175 @@ +//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Combine VALU pairs into VOPD instructions +/// Only works on wave32 +/// Has register requirements, we reject creating VOPD if the requirements are +/// not met. 
+/// shouldCombineVOPD mutator in postRA machine scheduler puts candidate +/// instructions for VOPD back-to-back +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "GCNVOPDUtils.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include <utility> + +#define DEBUG_TYPE "gcn-create-vopd" +STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created."); + +using namespace llvm; + +namespace { + +class GCNCreateVOPD : public MachineFunctionPass { +private: +public: + static char ID; + const GCNSubtarget *ST = nullptr; + + GCNCreateVOPD() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "GCN Create VOPD Instructions"; + } + + bool doReplace(const SIInstrInfo *SII, + std::pair<MachineInstr *, MachineInstr *> &Pair) { + auto *FirstMI = Pair.first; + auto *SecondMI = Pair.second; + unsigned Opc1 = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2)); + assert(NewOpcode != -1 && + "Should have previously determined this as a possible VOPD\n"); + + auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI, + FirstMI->getDebugLoc(), SII->get(NewOpcode)) + .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); + VOPDInst.add(FirstMI->getOperand(0)) + .add(SecondMI->getOperand(0)) + .add(FirstMI->getOperand(1)); + + switch (Opc1) { + case AMDGPU::V_MOV_B32_e32: + 
break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(FirstMI->getOperand(2)); + VOPDInst.add(FirstMI->getOperand(3)); + break; + default: + VOPDInst.add(FirstMI->getOperand(2)); + break; + } + + VOPDInst.add(SecondMI->getOperand(1)); + + switch (Opc2) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(SecondMI->getOperand(2)); + VOPDInst.add(SecondMI->getOperand(3)); + break; + default: + VOPDInst.add(SecondMI->getOperand(2)); + break; + } + + VOPDInst.copyImplicitOps(*FirstMI); + VOPDInst.copyImplicitOps(*SecondMI); + + LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " + << *Pair.first << "\tY: " << *Pair.second << "\n"); + FirstMI->eraseFromParent(); + SecondMI->eraseFromParent(); + ++NumVOPDCreated; + return true; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) + return false; + LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); + + const SIInstrInfo *SII = ST->getInstrInfo(); + bool Changed = false; + + SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates; + + for (auto &MBB : MF) { + auto MII = MBB.begin(), E = MBB.end(); + while (MII != E) { + auto *FirstMI = &*MII; + MII = next_nodbg(MII, MBB.end()); + if (MII == MBB.end()) + break; + if (FirstMI->isDebugInstr()) + continue; + auto *SecondMI = &*MII; + unsigned Opc = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + std::pair<MachineInstr *, MachineInstr *> Pair; + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + Pair = {FirstMI, SecondMI}; + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + Pair = {SecondMI, FirstMI}; + else + continue; + // checkVOPDRegConstraints cares about program 
order, but doReplace + // cares about X-Y order in the constituted VOPD + if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + ReplaceCandidates.push_back(Pair); + ++MII; + } + } + } + for (auto &Pair : ReplaceCandidates) { + Changed |= doReplace(SII, Pair); + } + + return Changed; + } +}; + +} // namespace + +char GCNCreateVOPD::ID = 0; + +char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID; + +INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions", + false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 1cd880eaa48e..5d254518c67a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { } int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { - auto DPP32 = AMDGPU::getDPPOp32(Op); + int DPP32 = AMDGPU::getDPPOp32(Op); if (IsShrinkable) { assert(DPP32 == -1); - auto E32 = AMDGPU::getVOPe32(Op); + int E32 = AMDGPU::getVOPe32(Op); DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32); } - return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? 
-1 : DPP32; + if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1) + return DPP32; + int DPP64 = -1; + if (ST->hasVOP3DPP()) + DPP64 = AMDGPU::getDPPOp64(Op); + if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1) + return DPP64; + return -1; } // tracks the register operand definition and returns: @@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + bool HasVOP3DPP = ST->hasVOP3DPP(); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp, IsShrinkable); if (DPPOp == -1) { @@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool Fail = false; do { - auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); - assert(Dst); - DPPInst.add(*Dst); - int NumOperands = 1; + int NumOperands = 0; + if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) { + DPPInst.add(*Dst); + ++NumOperands; + } + if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { + if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) { + DPPInst.add(*SDst); + ++NumOperands; + } + // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst + } const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { @@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::OpName::src0_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); - assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::OpName::src1_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, 
AMDGPU::OpName::src1_modifiers)); - assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.addImm(0); ++NumOperands; } - if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (Src1) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; @@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - - if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (auto *Mod2 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + assert(NumOperands == + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); + assert(HasVOP3DPP || + (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); + DPPInst.addImm(Mod2->getImm()); + ++NumOperands; + } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); + if (Src2) { if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); @@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } DPPInst.add(*Src2); + ++NumOperands; + } + if (HasVOP3DPP) { + auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); + if (ClampOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) { + DPPInst.addImm(ClampOpr->getImm()); + } + auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in); + if (VdstInOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) 
!= -1) { + DPPInst.add(*VdstInOpr); + } + auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod); + if (OmodOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) { + DPPInst.addImm(OmodOpr->getImm()); + } + // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to + // all 1. + if (auto *OpSelOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { + auto OpSel = OpSelOpr->getImm(); + if (OpSel != 0) { + LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1) + DPPInst.addImm(OpSel); + } + if (auto *OpSelHiOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { + auto OpSelHi = OpSelHiOpr->getImm(); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check + // the bitmask for 3 op_sel_hi bits set + assert(Src2 && "Expected vop3p with 3 operands"); + if (OpSelHi != 7) { + LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1) + DPPInst.addImm(OpSelHi); + } + auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo); + if (NegOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) { + DPPInst.addImm(NegOpr->getImm()); + } + auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi); + if (NegHiOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) { + DPPInst.addImm(NegHiOpr->getImm()); + } } - DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); @@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } bool IsShrinkable = isShrinkable(OrigMI); - if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { - LLVM_DEBUG(dbgs() << " 
failed: not VOP1/2/3\n"); + if (!(IsShrinkable || + ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) || + TII->isVOP3(OrigOp)) && + ST->hasVOP3DPP()) || + TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n"); + break; + } + if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) { + LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n"); break; } @@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); assert(Src0 && "Src1 without Src0?"); - if (Src1 && Src1->isIdenticalTo(*Src0)) { - assert(Src1->isReg()); + if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) || + (Src2 && Src2->isIdenticalTo(*Src0)))) || + (Use == Src1 && (Src1->isIdenticalTo(*Src0) || + (Src2 && Src2->isIdenticalTo(*Src1))))) { LLVM_DEBUG( dbgs() << " " << OrigMI diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp new file mode 100644 index 000000000000..a5008e39d91a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -0,0 +1,212 @@ +//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. 
It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#include "GCNVOPDUtils.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-vopd-utils" + +bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI) { + const MachineFunction *MF = FirstMI.getMF(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo()); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned NumVGPRBanks = 4; + // Literals also count against scalar bus limit + SmallVector<const MachineOperand *> UniqueLiterals; + auto addLiteral = [&](const MachineOperand &Op) { + for (auto &Literal : UniqueLiterals) { + if (Literal->isIdenticalTo(Op)) + return; + } + UniqueLiterals.push_back(&Op); + }; + SmallVector<Register> UniqueScalarRegs; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); + MII != FirstMI.getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + // Cannot pair dependent instructions + for (const auto &Use : SecondMI.uses()) + if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg())) + return false; + + struct ComponentInfo { + 
ComponentInfo(const MachineInstr &MI) : MI(MI) {} + Register Dst, Reg0, Reg1, Reg2; + const MachineInstr &MI; + }; + ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)}; + + for (ComponentInfo &Comp : CInfo) { + switch (Comp.MI.getOpcode()) { + case AMDGPU::V_FMAMK_F32: + // cannot inline the fixed literal in fmamk + addLiteral(Comp.MI.getOperand(2)); + Comp.Reg2 = Comp.MI.getOperand(3).getReg(); + break; + case AMDGPU::V_FMAAK_F32: + // cannot inline the fixed literal in fmaak + addLiteral(Comp.MI.getOperand(3)); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_DOT2_F32_F16: + case AMDGPU::V_DOT2_F32_BF16: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + Comp.Reg2 = Comp.MI.getOperand(0).getReg(); + break; + case AMDGPU::V_CNDMASK_B32_e32: + UniqueScalarRegs.push_back(AMDGPU::VCC_LO); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_MOV_B32_e32: + break; + default: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + } + + Comp.Dst = Comp.MI.getOperand(0).getReg(); + + const MachineOperand &Op0 = Comp.MI.getOperand(1); + if (Op0.isReg()) { + if (!TRI->isVectorRegister(MRI, Op0.getReg())) { + if (!is_contained(UniqueScalarRegs, Op0.getReg())) + UniqueScalarRegs.push_back(Op0.getReg()); + } else + Comp.Reg0 = Op0.getReg(); + } else { + if (!TII.isInlineConstant(Comp.MI, 1)) + addLiteral(Op0); + } + } + + if (UniqueLiterals.size() > 1) + return false; + if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) + return false; + + // check port 0 + if (CInfo[0].Reg0 && CInfo[1].Reg0 && + CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks) + return false; + // check port 1 + if (CInfo[0].Reg1 && CInfo[1].Reg1 && + CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks) + return false; + // check port 2 + if (CInfo[0].Reg2 && CInfo[1].Reg2 && + !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1)) + return false; + if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1)) + 
return false; + + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI + << "\n\tY: " << SecondMI << "\n"); + return true; +} + +/// Check if the instr pair, FirstMI and SecondMI, should be scheduled +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII); + unsigned Opc2 = SecondMI.getOpcode(); + auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + + // One instruction case + if (!FirstMI) + return SecondCanBeVOPD.Y; + + unsigned Opc = FirstMI->getOpcode(); + auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + + if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || + (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) + return false; + + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI); +} + +/// Adapts design from MacroFusion +/// Puts valid candidate instructions back-to-back so they can easily +/// be turned into VOPD instructions +/// Greedily pairs instruction candidates. O(n^2) algorithm. 
+struct VOPDPairingMutation : ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer + + VOPDPairingMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer + : shouldScheduleAdjacent(shouldScheduleAdjacent) {} + + void apply(ScheduleDAGInstrs *DAG) override { + const TargetInstrInfo &TII = *DAG->TII; + const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) { + LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n"); + return; + } + + std::vector<SUnit>::iterator ISUI, JSUI; + for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) { + const MachineInstr *IMI = ISUI->getInstr(); + if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI)) + continue; + if (!hasLessThanNumFused(*ISUI, 2)) + continue; + + for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) { + if (JSUI->isBoundaryNode()) + continue; + const MachineInstr *JMI = JSUI->getInstr(); + if (!hasLessThanNumFused(*JSUI, 2) || + !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) + continue; + if (fuseInstructionPair(*DAG, *ISUI, *JSUI)) + break; + } + } + LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n"); + } +}; + +std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() { + return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent); +} diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h new file mode 100644 index 000000000000..22361b9a1a07 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h @@ -0,0 +1,32 @@ +//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIInstrInfo; + +bool checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI); + +std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 02c213f90f89..228963ff2a20 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -62,12 +62,6 @@ public: virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const = 0; - -protected: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 11fe3f9ef058..fba4b1a3db66 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -36,6 +36,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include 
"AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 060d4b660632..c2e2563c3989 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -50,6 +50,7 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 78eb304fe84f..3d926e52c368 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -58,11 +58,6 @@ private: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; - }; } // end anonymous namespace @@ -90,11 +85,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, } void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (MI.getOpcode() == R600::RETURN || MI.getOpcode() == R600::FETCH_CLAUSE || @@ -187,5 +179,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, return MO.getImm(); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenMCCodeEmitter.inc" diff --git 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index 269209a12175..b9ff195e0ddc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -13,10 +13,12 @@ #include "R600MCTargetDesc.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/SubtargetFeature.h" using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenInstrInfo.inc" MCInstrInfo *llvm::createR600MCInstrInfo() { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index 605ae851378d..b4ce748532f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -35,6 +35,7 @@ MCInstrInfo *createR600MCInstrInfo(); #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "R600GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 5e67fb5ec876..e093d78b2cc6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -310,11 +310,8 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { int Opcode = MI.getOpcode(); APInt Encoding, Scratch; getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); @@ -574,5 +571,4 @@ void SIMCCodeEmitter::getMachineOpValueCommon( 
llvm_unreachable("Encoding of this operand type is not supported yet."); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bf52f7830ad7..5199a37a0519 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1623,7 +1623,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, NewBldVec); } -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, const SDLoc &DL) const { // Old -> New swizzle values diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 1e75a0432ec3..e7706fa0ef5c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -74,8 +74,8 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - const SDLoc &DL) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], + SelectionDAG &DAG, const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 8f7807a2b472..f81f5122bbc9 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -13,6 +13,7 @@ // #include "AMDGPUMCInstLower.h" +#include "MCTargetDesc/R600MCTargetDesc.h" #include "R600AsmPrinter.h" #include "R600Subtarget.h" #include "llvm/CodeGen/MachineOperand.h" @@ -42,6 +43,9 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const 
{ } void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { + R600_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); R600MCInstLower MCInstLowering(OutContext, STI, *this); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 094d5cd58673..d16da2a8b86b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -352,7 +352,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, + MVT::v4i16, MVT::v4f16}, Custom); // Deal with vec3 vector operations when widened to vec4. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 814a7c446889..799d34e32d27 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3335,15 +3335,18 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&DefMI, &MBB, this]() -> void { + const auto killDef = [&]() -> void { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // The only user is the instruction which will be killed. - if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + Register DefReg = DefMI->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(DefReg)) return; // We cannot just remove the DefMI here, calling pass will crash. 
DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) DefMI->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); }; int64_t Imm; @@ -3982,6 +3985,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + int Src3Idx = -1; + if (Src0Idx == -1) { + // VOPD V_DUAL_* instructions use different operand names. + Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); + Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); + Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); + Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); + } // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); @@ -4255,9 +4266,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. 
- for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { if (OpIdx == -1) - break; + continue; const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 311f9f68e675..1b411eb83eb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1242,6 +1242,9 @@ namespace AMDGPU { int getDPPOp32(uint16_t Opcode); LLVM_READONLY + int getDPPOp64(uint16_t Opcode); + + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 29ee9f12b12d..23afd6556bc9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -193,43 +193,32 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; -multiclass SDBufferAtomicRetNoRet { - def "_ret" : PatFrag< - (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, - node:$offset, node:$cachepolicy, node:$idxen), - (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; - } - +multiclass SDBufferAtomicNoRet { def "_noret" : PatFrag< (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; 
- let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } } -defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_swap : SDBufferAtomicNoRet; +defm SIbuffer_atomic_add : SDBufferAtomicNoRet; +defm SIbuffer_atomic_sub : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_and : SDBufferAtomicNoRet; +defm SIbuffer_atomic_or : SDBufferAtomicNoRet; +defm SIbuffer_atomic_xor : SDBufferAtomicNoRet; +defm SIbuffer_atomic_inc : SDBufferAtomicNoRet; +defm SIbuffer_atomic_dec : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -246,24 +235,13 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -def SIbuffer_atomic_cmpswap_ret : PatFrag< - (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, - node:$soffset, node:$offset, 
node:$cachepolicy, node:$idxen), - (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; -} - def SIbuffer_atomic_cmpswap_noret : PatFrag< (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; - let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, @@ -774,13 +752,13 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, let AddressSpaces = StoreAddress_local.AddrSpaces in { defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } } @@ -2194,21 +2172,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp, "$sdst", "$vdst"), ""); // use $sdst for VOPC - string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); - string isrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1", - " $src1,")); - string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1nomods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " 
$src1", + " $src1,")); + string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", ""); - string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string fsrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1mods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - string src0 = !if(Src0HasMods, fsrc0, isrc0); - string src1 = !if(Src1HasMods, fsrc1, isrc1); - string src2 = !if(Src2HasMods, fsrc2, isrc2); + string src0 = !if(Src0HasMods, src0mods, src0nomods); + string src1 = !if(Src1HasMods, src1mods, src1nomods); + string src2 = !if(Src2HasMods, src2mods, src2nomods); string opsel = !if(HasOpSel, "$op_sel", ""); string 3PMods = !if(IsVOP3P, !if(HasOpSel, "$op_sel_hi", "") @@ -2559,8 +2537,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret; field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp, - HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods, - HasSrc2FloatMods, DstVT >.ret; + HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers, + HasModifiers, DstVT>.ret; field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret; @@ -2800,6 +2778,14 @@ def getDPPOp32 : InstrMapping { let ValueCols = [["DPP"]]; } +def getDPPOp64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["VOP3"]; + let ValueCols = 
[["VOP3_DPP"]]; +} + // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -2961,6 +2947,27 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +def VOPDComponentTable : GenericTable { + let FilterClass = "VOPD_Component"; + let CppTypeName = "VOPDComponentInfo"; + let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"]; + let PrimaryKey = ["BaseVOP"]; + let PrimaryKeyName = "getVOPDComponentHelper"; +} + +def VOPDPairs : GenericTable { + let FilterClass = "VOPD_Base"; + let CppTypeName = "VOPDInfo"; + let Fields = ["Opcode", "OpX", "OpY"]; + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getVOPDOpcodeHelper"; +} + +def getVOPDInfoFromComponentOpcodes : SearchIndex { + let Table = VOPDPairs; + let Key = ["OpX", "OpY"]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 829669157893..ce8c03bb8d64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1449,6 +1449,14 @@ def : BitConvert <v8i32, v16f16, VReg_256>; def : BitConvert <v8i32, v16i16, VReg_256>; def : BitConvert <v8f32, v16f16, VReg_256>; def : BitConvert <v8f32, v16i16, VReg_256>; +def : BitConvert <v16f16, v4i64, VReg_256>; +def : BitConvert <v16i16, v4i64, VReg_256>; +def : BitConvert <v16f16, v4f64, VReg_256>; +def : BitConvert <v16i16, v4f64, VReg_256>; +def : BitConvert <v4i64, v16f16, VReg_256>; +def : BitConvert <v4i64, v16i16, VReg_256>; +def : BitConvert <v4f64, v16f16, VReg_256>; +def : BitConvert <v4f64, v16i16, VReg_256>; // 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; @@ -3012,6 +3020,35 @@ multiclass Int16Med3Pat<Instruction med3Inst, def : FPMed3Pat<f32, V_MED3_F32_e64>; +class +IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : AMDGPUPat < + (DivergentBinFrag<min_or_max> 
(max_or_min_oneuse i32:$src0, i32:$src1), + i32:$src2), + (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + +class +FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : GCNPat < + (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods))), + (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +let OtherPredicates = [isGFX11Plus] in { +def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>; +def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>; +def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; +def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; +def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + let OtherPredicates = [isGFX9Plus] in { def : FP16Med3Pat<f16, V_MED3_F16_e64>; defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 607383ab8cde..67077a2eaa6b 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -148,6 +148,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable<LiveIntervals>(); // Should preserve the same set that TwoAddressInstructions does. 
AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<SlotIndexes>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index dd881ec42d53..786b6b61cb23 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -72,7 +72,7 @@ INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; -/// Insert restore code for the callee-saved registers used in the function. +/// Insert spill code for the callee-saved registers used in the function. static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index e426e938b856..ff5587fbb0ca 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1883,7 +1883,13 @@ void SIScheduleDAGMI::schedule() LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); + postprocessDAG(); + LLVM_DEBUG(dump()); + if (PrintDAGs) + dump(); + if (ViewMISchedDAGs) + viewGraph(); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 8a66213931ff..6b93769949bc 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2329,13 +2329,13 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { continue; if (const auto &MOI = MOA.getLoadInfo(MI)) - Changed |= expandLoad(MOI.getValue(), MI); + Changed |= expandLoad(MOI.value(), MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) - Changed |= expandStore(MOI.getValue(), MI); + Changed |= expandStore(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) - Changed |= expandAtomicFence(MOI.getValue(), MI); + Changed |= 
expandAtomicFence(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) - Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); + Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 5215397d5936..66bc46aaefea 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -9,6 +9,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -20,10 +21,40 @@ using namespace llvm; namespace { class SIOptimizeExecMasking : public MachineFunctionPass { + MachineFunction *MF = nullptr; + const GCNSubtarget *ST = nullptr; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + const MachineRegisterInfo *MRI = nullptr; + + Register isCopyFromExec(const MachineInstr &MI) const; + Register isCopyToExec(const MachineInstr &MI) const; + bool removeTerminatorBit(MachineInstr &MI) const; + MachineBasicBlock::reverse_iterator + fixTerminators(MachineBasicBlock &MBB) const; + MachineBasicBlock::reverse_iterator + findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const; + + bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, + MCRegister Reg, bool UseLiveOuts = false, + bool IgnoreStart = false) const; + bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const; + MachineInstr *findInstrBackwards(MachineInstr &Origin, + std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, + unsigned MaxInstructions = 20) const; + MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec, + MCRegister Exec) const; + bool optimizeExecSequence() const; + bool 
optimizeVCmpxAndSaveexecSequence() const; + bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr, + MachineInstr &VCmp, + MCRegister Exec) const; + public: static char ID; -public: SIOptimizeExecMasking() : MachineFunctionPass(ID) { initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); } @@ -53,7 +84,7 @@ char SIOptimizeExecMasking::ID = 0; char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; /// If \p MI is a copy from exec, return the register copied to. -static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: @@ -61,8 +92,7 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && - Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)) + if (Src.isReg() && Src.getReg() == TRI->getExec()) return MI.getOperand(0).getReg(); } } @@ -71,14 +101,13 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { } /// If \p MI is a copy to exec, return the register copied from. -static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - Dst.getReg() == (ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC) && + if (Dst.isReg() && Dst.getReg() == TRI->getExec() && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; @@ -173,64 +202,64 @@ static unsigned getSaveExecOp(unsigned Opc) { // These are only terminators to get correct spill code placement during // register allocation, so turn them back into normal instructions. -static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { +bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::S_MOV_B32_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); return true; } case AMDGPU::S_MOV_B64_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); return true; } case AMDGPU::S_XOR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B64)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B64)); return true; } case AMDGPU::S_XOR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B32)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B32)); return true; } case AMDGPU::S_OR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B64)); + MI.setDesc(TII->get(AMDGPU::S_OR_B64)); return true; } case AMDGPU::S_OR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. 
- MI.setDesc(TII.get(AMDGPU::S_OR_B32)); + MI.setDesc(TII->get(AMDGPU::S_OR_B32)); return true; } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64)); return true; } case AMDGPU::S_ANDN2_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32)); return true; } case AMDGPU::S_AND_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + MI.setDesc(TII->get(AMDGPU::S_AND_B64)); return true; } case AMDGPU::S_AND_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + MI.setDesc(TII->get(AMDGPU::S_AND_B32)); return true; } default: @@ -241,9 +270,8 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { // Turn all pseudoterminators in the block into their equivalent non-terminator // instructions. Returns the reverse iterator to the first non-terminator // instruction in the block. -static MachineBasicBlock::reverse_iterator fixTerminators( - const SIInstrInfo &TII, - MachineBasicBlock &MBB) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); bool Seen = false; @@ -252,7 +280,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators( if (!I->isTerminator()) return Seen ? 
FirstNonTerm : I; - if (removeTerminatorBit(TII, *I)) { + if (removeTerminatorBit(*I)) { if (!Seen) { FirstNonTerm = I; Seen = true; @@ -263,17 +291,15 @@ static MachineBasicBlock::reverse_iterator fixTerminators( return FirstNonTerm; } -static MachineBasicBlock::reverse_iterator findExecCopy( - const SIInstrInfo &TII, - const GCNSubtarget &ST, - MachineBasicBlock &MBB, - MachineBasicBlock::reverse_iterator I, - unsigned CopyToExec) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB, + MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const { const unsigned InstLimit = 25; auto E = MBB.rend(); for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { - Register CopyFromExec = isCopyFromExec(*I, ST); + Register CopyFromExec = isCopyFromExec(*I); if (CopyFromExec.isValid()) return I; } @@ -298,11 +324,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { // an arbitrary condition based on the current MachineInstr, for instance an // target instruction. Breaks prematurely by returning nullptr if one of the // registers given in NonModifiableRegs is modified by the current instruction. 
-static MachineInstr * -findInstrBackwards(MachineInstr &Origin, - std::function<bool(MachineInstr *)> Pred, - ArrayRef<MCRegister> NonModifiableRegs, - const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) { +MachineInstr *SIOptimizeExecMasking::findInstrBackwards( + MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const { MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(), E = Origin.getParent()->rend(); unsigned CurrentIteration = 0; @@ -310,7 +334,7 @@ findInstrBackwards(MachineInstr &Origin, for (++A; CurrentIteration < MaxInstructions && A != E; ++A) { if (A->isDebugInstr()) continue; - + if (Pred(&*A)) return &*A; @@ -318,209 +342,64 @@ findInstrBackwards(MachineInstr &Origin, if (A->modifiesRegister(Reg, TRI)) return nullptr; } - + ++CurrentIteration; } return nullptr; } - // Determine if a register Reg is not re-defined and still in use // in the range (Stop..Start]. // It does so by backwards calculating liveness from the end of the BB until // either Stop or the beginning of the BB is reached. // After liveness is calculated, we can determine if Reg is still in use and not // defined inbetween the instructions. 
-static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, - MCRegister Reg, const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI, - bool useLiveOuts = false, - bool ignoreStart = false) { +bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop, + MachineInstr &Start, + MCRegister Reg, + bool UseLiveOuts, + bool IgnoreStart) const { LivePhysRegs LR(*TRI); - if (useLiveOuts) + if (UseLiveOuts) LR.addLiveOuts(*Stop.getParent()); MachineBasicBlock::reverse_iterator A(Start); MachineBasicBlock::reverse_iterator E(Stop); - if (ignoreStart) + if (IgnoreStart) ++A; for (; A != Stop.getParent()->rend() && A != Stop; ++A) { LR.stepBackward(*A); } - return !LR.available(MRI, Reg); + return !LR.available(*MRI, Reg); } // Determine if a register Reg is not re-defined and still in use // in the range (Stop..BB.end]. -static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI, - MRI, true); -} - -// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence -// by looking at an instance of a s_and_saveexec instruction. Returns a pointer -// to the v_cmp instruction if it is safe to replace the sequence (see the -// conditions in the function body). This is after register allocation, so some -// checks on operand dependencies need to be considered. 
-static MachineInstr *findPossibleVCMPVCMPXOptimization( - MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI, - const SIInstrInfo *TII, MachineRegisterInfo &MRI) { - - MachineInstr *VCmp = nullptr; - - Register SaveExecDest = SaveExec.getOperand(0).getReg(); - if (!TRI->isSGPRReg(MRI, SaveExecDest)) - return nullptr; - - MachineOperand *SaveExecSrc0 = - TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); - if (!SaveExecSrc0->isReg()) - return nullptr; - - // Try to find the last v_cmp instruction that defs the saveexec input - // operand without any write to Exec or the saveexec input operand inbetween. - VCmp = findInstrBackwards( - SaveExec, - [&](MachineInstr *Check) { - return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && - Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); - }, - {Exec, SaveExecSrc0->getReg()}, TRI); - - if (!VCmp) - return nullptr; - - MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); - assert(VCmpDest && "Should have an sdst operand!"); - - // Check if any of the v_cmp source operands is written by the saveexec. - MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); - if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) && - SaveExec.modifiesRegister(Src0->getReg(), TRI)) - return nullptr; - - MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); - if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) && - SaveExec.modifiesRegister(Src1->getReg(), TRI)) - return nullptr; - - // Don't do the transformation if the destination operand is included in - // it's MBB Live-outs, meaning it's used in any of it's successors, leading - // to incorrect code if the v_cmp and therefore the def of - // the dest operand is removed. - if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) - return nullptr; - - // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the - // s_and_saveexec, skip the optimization. 
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI, - false, true) || - isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI)) - return nullptr; - - // Try to determine if there is a write to any of the VCmp - // operands between the saveexec and the vcmp. - // If yes, additional VGPR spilling might need to be inserted. In this case, - // it's not worth replacing the instruction sequence. - SmallVector<MCRegister, 2> NonDefRegs; - if (Src0->isReg()) - NonDefRegs.push_back(Src0->getReg()); - - if (Src1->isReg()) - NonDefRegs.push_back(Src1->getReg()); - - if (!findInstrBackwards( - SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, - NonDefRegs, TRI)) - return nullptr; - - return VCmp; -} - -// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the -// operands extracted from a v_cmp ..., s_and_saveexec pattern. -static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, MCRegister Exec, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); - - if (NewOpcode == -1) - return false; - - MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); - MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); - - Register MoveDest = SaveExecInstr.getOperand(0).getReg(); - - MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); - if (!SaveExecInstr.uses().empty()) { - bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32; - unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(*SaveExecInstr.getParent(), InsertPosIt, - SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) - .addReg(Exec); - } - - // Omit dst as V_CMPX is implicitly writing to EXEC. - // Add dummy src and clamp modifiers, if needed. 
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), - VCmp.getDebugLoc(), TII->get(NewOpcode)); - - auto TryAddImmediateValueFromNamedOperand = - [&](unsigned OperandName) -> void { - if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) - Builder.addImm(Mod->getImm()); - }; - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); - Builder.add(*Src0); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); - Builder.add(*Src1); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); - - // The kill flags may no longer be correct. - if (Src0->isReg()) - MRI.clearKillFlags(Src0->getReg()); - if (Src1->isReg()) - MRI.clearKillFlags(Src1->getReg()); - - return true; +bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop, + MCRegister Reg) const { + return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true); } -bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - - // Optimize sequences emitted for control flow lowering. They are originally - // emitted as the separate operations because spill code may need to be - // inserted for the saved copy of exec. - // - // x = copy exec - // z = s_<op>_b64 x, y - // exec = copy z - // => - // x = s_<op>_saveexec_b64 y - // +// Optimize sequences emitted for control flow lowering. They are originally +// emitted as the separate operations because spill code may need to be +// inserted for the saved copy of exec. 
+// +// x = copy exec +// z = s_<op>_b64 x, y +// exec = copy z +// => +// x = s_<op>_saveexec_b64 y +// +bool SIOptimizeExecMasking::optimizeExecSequence() const { + MCRegister Exec = TRI->getExec(); bool Changed = false; - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB); + for (MachineBasicBlock &MBB : *MF) { + MachineBasicBlock::reverse_iterator I = fixTerminators(MBB); MachineBasicBlock::reverse_iterator E = MBB.rend(); if (I == E) continue; @@ -532,7 +411,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { unsigned SearchCount = 0; const unsigned SearchLimit = 5; while (I != E && SearchCount++ < SearchLimit) { - CopyToExec = isCopyToExec(*I, ST); + CopyToExec = isCopyToExec(*I); if (CopyToExec) break; ++I; @@ -542,8 +421,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; // Scan backwards to find the def. - auto CopyToExecInst = &*I; - auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec); + auto *CopyToExecInst = &*I; + auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec); if (CopyFromExecInst == E) { auto PrepareExecInst = std::next(I); if (PrepareExecInst == E) @@ -574,8 +453,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { MachineInstr *SaveExecInst = nullptr; SmallVector<MachineInstr *, 4> OtherUseInsts; - for (MachineBasicBlock::iterator J - = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); + for (MachineBasicBlock::iterator + J = std::next(CopyFromExecInst->getIterator()), + JE = I->getIterator(); J != JE; ++J) { if (SaveExecInst && J->readsRegister(Exec, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); @@ -655,58 +535,210 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())), CopyFromExec) - .addReg(OtherOp->getReg()); + .addReg(OtherOp->getReg()); 
SaveExecInst->eraseFromParent(); CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, Exec, - AMDGPU::NoSubRegister, *TRI); + OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, + *TRI); } Changed = true; } - // After all s_op_saveexec instructions are inserted, - // replace (on GFX10.3 and later) - // v_cmp_* SGPR, IMM, VGPR - // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR - // with - // s_mov_b32 EXEC_SGPR_DEST, exec_lo - // v_cmpx_* IMM, VGPR - // to reduce pipeline stalls. - if (ST.hasGFX10_3Insts()) { - DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; - const unsigned AndSaveExecOpcode = - ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + return Changed; +} - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - // Record relevant v_cmp / s_and_saveexec instruction pairs for - // replacement. - if (MI.getOpcode() != AndSaveExecOpcode) - continue; +// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence +// by looking at an instance of a s_and_saveexec instruction. Returns a pointer +// to the v_cmp instruction if it is safe to replace the sequence (see the +// conditions in the function body). This is after register allocation, so some +// checks on operand dependencies need to be considered. 
+MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization( + MachineInstr &SaveExec, MCRegister Exec) const { - if (MachineInstr *VCmp = - findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI)) - SaveExecVCmpMapping[&MI] = VCmp; - } + MachineInstr *VCmp = nullptr; + + Register SaveExecDest = SaveExec.getOperand(0).getReg(); + if (!TRI->isSGPRReg(*MRI, SaveExecDest)) + return nullptr; + + MachineOperand *SaveExecSrc0 = + TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); + if (!SaveExecSrc0->isReg()) + return nullptr; + + // Try to find the last v_cmp instruction that defs the saveexec input + // operand without any write to Exec or the saveexec input operand inbetween. + VCmp = findInstrBackwards( + SaveExec, + [&](MachineInstr *Check) { + return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && + Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); + }, + {Exec, SaveExecSrc0->getReg()}); + + if (!VCmp) + return nullptr; + + MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); + assert(VCmpDest && "Should have an sdst operand!"); + + // Check if any of the v_cmp source operands is written by the saveexec. + MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); + if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && + SaveExec.modifiesRegister(Src0->getReg(), TRI)) + return nullptr; + + MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); + if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && + SaveExec.modifiesRegister(Src1->getReg(), TRI)) + return nullptr; + + // Don't do the transformation if the destination operand is included in + // it's MBB Live-outs, meaning it's used in any of it's successors, leading + // to incorrect code if the v_cmp and therefore the def of + // the dest operand is removed. 
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) + return nullptr; + + // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the + // s_and_saveexec, skip the optimization. + if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false, + true) || + isRegisterInUseAfter(SaveExec, VCmpDest->getReg())) + return nullptr; + + // Try to determine if there is a write to any of the VCmp + // operands between the saveexec and the vcmp. + // If yes, additional VGPR spilling might need to be inserted. In this case, + // it's not worth replacing the instruction sequence. + SmallVector<MCRegister, 2> NonDefRegs; + if (Src0->isReg()) + NonDefRegs.push_back(Src0->getReg()); + + if (Src1->isReg()) + NonDefRegs.push_back(Src1->getReg()); + + if (!findInstrBackwards( + SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, + NonDefRegs)) + return nullptr; + + return VCmp; +} + +// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the +// operands extracted from a v_cmp ..., s_and_saveexec pattern. +bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence( + MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { + const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); + + if (NewOpcode == -1) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); + + Register MoveDest = SaveExecInstr.getOperand(0).getReg(); + + MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); + if (!SaveExecInstr.uses().empty()) { + bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32; + unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(*SaveExecInstr.getParent(), InsertPosIt, + SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) + .addReg(Exec); + } + + // Omit dst as V_CMPX is implicitly writing to EXEC. 
+ // Add dummy src and clamp modifiers, if needed. + auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), + VCmp.getDebugLoc(), TII->get(NewOpcode)); + + auto TryAddImmediateValueFromNamedOperand = + [&](unsigned OperandName) -> void { + if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) + Builder.addImm(Mod->getImm()); + }; + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); + Builder.add(*Src0); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); + Builder.add(*Src1); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); + + // The kill flags may no longer be correct. + if (Src0->isReg()) + MRI->clearKillFlags(Src0->getReg()); + if (Src1->isReg()) + MRI->clearKillFlags(Src1->getReg()); + + return true; +} + +// After all s_op_saveexec instructions are inserted, +// replace (on GFX10.3 and later) +// v_cmp_* SGPR, IMM, VGPR +// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR +// with +// s_mov_b32 EXEC_SGPR_DEST, exec_lo +// v_cmpx_* IMM, VGPR +// to reduce pipeline stalls. +bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const { + if (!ST->hasGFX10_3Insts()) + return false; + + bool Changed = false; + + DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; + MCRegister Exec = TRI->getExec(); + const unsigned AndSaveExecOpcode = + ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // Record relevant v_cmp / s_and_saveexec instruction pairs for + // replacement. 
+ if (MI.getOpcode() != AndSaveExecOpcode) + continue; + + if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec)) + SaveExecVCmpMapping[&MI] = VCmp; } + } - for (const auto &Entry : SaveExecVCmpMapping) { - MachineInstr *SaveExecInstr = Entry.getFirst(); - MachineInstr *VCmpInstr = Entry.getSecond(); + for (const auto &Entry : SaveExecVCmpMapping) { + MachineInstr *SaveExecInstr = Entry.getFirst(); + MachineInstr *VCmpInstr = Entry.getSecond(); - if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, - TRI, *MRI)) { - SaveExecInstr->eraseFromParent(); - VCmpInstr->eraseFromParent(); + if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) { + SaveExecInstr->eraseFromParent(); + VCmpInstr->eraseFromParent(); - Changed = true; - } + Changed = true; } } return Changed; } + +bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + this->MF = &MF; + ST = &MF.getSubtarget<GCNSubtarget>(); + TRI = ST->getRegisterInfo(); + TII = ST->getInstrInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = optimizeExecSequence(); + Changed |= optimizeVCmpxAndSaveexecSequence(); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index e5e65a8dbbf1..57dbad468de8 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -159,6 +159,9 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { return false; Register SelReg = Op1->getReg(); + if (SelReg.isPhysical()) + return false; + auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return false; @@ -264,13 +267,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { // Try to remove v_cndmask_b32. 
if (SelLI) { - bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill(); - if (!CanRemoveSel) { - // Try to shrink the live interval and check for dead def instead. - LIS->shrinkToUses(SelLI, nullptr); - CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); - } - if (CanRemoveSel) { + // Kill status must be checked before shrinking the live range. + bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill(); + LIS->shrinkToUses(SelLI); + bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); + if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ad1455ed20fd..b32d5bb04d5b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2933,6 +2933,10 @@ MCRegister SIRegisterInfo::getVCC() const { return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } +MCRegister SIRegisterInfo::getExec() const { + return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; +} + const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { // VGPR tuples have an alignment requirement on gfx90a variants. return ST.needsAlignedVGPRs() ? 
&AMDGPU::VReg_64_Align2RegClass diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 9bfbc253410b..6024158be181 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -344,6 +344,8 @@ public: MCRegister getVCC() const; + MCRegister getExec() const; + const TargetRegisterClass *getRegClass(unsigned RCID) const; // Find reaching register definition diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e4ab72f1095b..2f334e211181 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -277,6 +277,18 @@ struct VOPC64DPPInfo { uint16_t Opcode; }; +struct VOPDComponentInfo { + uint16_t BaseVOP; + uint16_t VOPDOp; + bool CanBeVOPDX; +}; + +struct VOPDInfo { + uint16_t Opcode; + uint16_t OpX; + uint16_t OpY; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -293,6 +305,10 @@ struct VOPC64DPPInfo { #define GET_VOPC64DPPTable_IMPL #define GET_VOPC64DPP8Table_DECL #define GET_VOPC64DPP8Table_IMPL +#define GET_VOPDComponentTable_DECL +#define GET_VOPDComponentTable_IMPL +#define GET_VOPDPairs_DECL +#define GET_VOPDPairs_IMPL #define GET_WMMAOpcode2AddrMappingTable_DECL #define GET_WMMAOpcode2AddrMappingTable_IMPL #define GET_WMMAOpcode3AddrMappingTable_DECL @@ -398,6 +414,19 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? Info->is_gfx940_xdl : false; } +CanBeVOPD getCanBeVOPD(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + if (Info) + return {Info->CanBeVOPDX, 1}; + else + return {0, 0}; +} + +unsigned getVOPDOpcode(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + return Info ? 
Info->VOPDOp : ~0u; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? Info->Opcode3Addr : ~0u; @@ -415,6 +444,11 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } +int getVOPDFull(unsigned OpX, unsigned OpY) { + const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY); + return Info ? Info->Opcode : -1; +} + namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index dffeec10a14a..51cf1678207c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -470,6 +470,14 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +struct CanBeVOPD { + bool X; + bool Y; +}; + +LLVM_READONLY +CanBeVOPD getCanBeVOPD(unsigned Opc); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -483,6 +491,12 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); LLVM_READONLY +unsigned getVOPDOpcode(unsigned Opc); + +LLVM_READONLY +int getVOPDFull(unsigned OpX, unsigned OpY); + +LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 1485a1e63129..b24857edb59a 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -495,9 +495,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, - Src0DPP:$src0, - Src1DPP:$src1, - dpp8:$dpp8, FI:$fi); + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; 
let HasExtDPP = 1; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index eb6c54a45263..33d3441e94c2 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1108,7 +1108,6 @@ class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P> // Inst{87-84} ignored by hw let Inst{91-88} = bank_mask; let Inst{95-92} = row_mask; - } class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> @@ -1148,7 +1147,6 @@ class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P> let Inst{40-32} = fi; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{95-72} = dpp8{23-0}; - } class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8cd3d2fe2c47..187485ffa3ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); } class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> { @@ -1414,7 +1416,7 @@ multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName, VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>; multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName, - string asmName> : + string asmName> : VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>; multiclass VOP3be_Realtriple_gfx11< diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp index 0390c01eecb1..cee2fc7d2bf0 100644 
--- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp +++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp @@ -49,6 +49,9 @@ public: } // end anonymous namespace void ARCAsmPrinter::emitInstruction(const MachineInstr *MI) { + ARC_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + SmallString<128> Str; raw_svector_ostream O(Str); diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp index d4f74fa77fc4..36b00af2c0b4 100644 --- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp +++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp @@ -26,6 +26,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "ARCGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h index ab06ce46d99f..5f83b48b36af 100644 --- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h +++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h @@ -28,6 +28,7 @@ class Target; // Defines symbolic names for the ARC instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "ARCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 48559a89a30a..73970b9c74c5 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -378,13 +378,13 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; -def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1", +def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4", "Model MVE instructions as a 1 beat per tick architecture">; def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2", "Model MVE instructions as a 2 beats per tick architecture">; -def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4", +def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "1", "Model MVE instructions as a 4 beats per tick architecture">; /// Some instructions update CPSR partially, which can add false dependency for @@ -1450,6 +1450,13 @@ def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline, HasMVEFloatOps, FeatureFixCMSE_CVE_2021_35465]>; +def : ProcessorModel<"cortex-m85", CortexM7Model, [ARMv81mMainline, + FeatureDSP, + FeatureFPARMv8_D16, + FeaturePACBTI, + FeatureUseMISched, + HasMVEFloatOps]>; + def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 4aa28bc5d28d..57cbd7a3b2b8 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1337,6 +1337,10 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { #include "ARMGenMCPseudoLowering.inc" void 
ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { + // TODOD FIXME: Enable feature predicate checks once all the test pass. + // ARM_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + const DataLayout &DL = getDataLayout(); MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 85e32c08c74c..e6be93e6480a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -450,6 +450,14 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + + if (!HasMVEFP) { + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + } } setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); @@ -13350,14 +13358,14 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { // to make better use of vaddva style instructions. 
if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && - !isa<ConstantSDNode>(N0)) { + !isa<ConstantSDNode>(N0) && N1->hasOneUse()) { SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); } // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> // add(add(add(A, C), reduce(B)), reduce(D)) if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && - N1.getOpcode() == ISD::ADD) { + N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) { unsigned N0RedOp = 0; if (!IsVecReduce(N0.getOperand(N0RedOp))) { N0RedOp = 1; @@ -13424,7 +13432,7 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { }; SDValue X; - if (N0.getOpcode() == ISD::ADD) { + if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) { if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), N0.getOperand(1).getOperand(0)); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 3a9946ee810b..ba1d806c8d81 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2247,15 +2247,15 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return canTailPredicateLoop(L, LI, SE, DL, LAI); } -bool ARMTTIImpl::emitGetActiveLaneMask() const { +PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return false; + return PredicationStyle::None; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to setup the tail-predicated // loop. 
- return true; + return PredicationStyle::Data; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index d7a2bdb3db15..dcf82e703a7f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -298,7 +298,7 @@ public: TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - bool emitGetActiveLaneMask() const; + PredicationStyle emitGetActiveLaneMask() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 3f1379f135d1..9f85d72cc810 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -133,6 +133,7 @@ static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, } #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "ARMGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index e0c992f4fae2..3066d9ba6783 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -139,6 +139,7 @@ bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI); // Defines symbolic names for the ARM instructions. 
// #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "ARMGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index 30785340ef12..296801094fbe 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -351,13 +351,13 @@ Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) { if (!Op0 || !Op1) return Optional<int64_t>{}; if (I->getOpcode() == Instruction::Add) - return Optional<int64_t>{Op0.getValue() + Op1.getValue()}; + return Optional<int64_t>{Op0.value() + Op1.value()}; if (I->getOpcode() == Instruction::Mul) - return Optional<int64_t>{Op0.getValue() * Op1.getValue()}; + return Optional<int64_t>{Op0.value() * Op1.value()}; if (I->getOpcode() == Instruction::Shl) - return Optional<int64_t>{Op0.getValue() << Op1.getValue()}; + return Optional<int64_t>{Op0.value() << Op1.value()}; if (I->getOpcode() == Instruction::Or) - return Optional<int64_t>{Op0.getValue() | Op1.getValue()}; + return Optional<int64_t>{Op0.value() | Op1.value()}; } return Optional<int64_t>{}; } diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index 0001e520b1fb..70fc90bf9eb5 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -180,6 +180,10 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void AVRAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the test pass. 
+ // AVR_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + AVRMCInstLower MCInstLowering(OutContext, *this); MCInst I; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp index cdfe4a21105d..ba370261e284 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AVRGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h index aaf236d82016..e83d674f87cc 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h @@ -49,6 +49,7 @@ std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI); #include "AVRGenRegisterInfo.inc" #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AVRGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 349cdd92ae62..9aad9375d913 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -149,6 +149,13 @@ private: // The base call is not an input of any other preserve_* // intrinsics. 
std::map<CallInst *, CallInfo> BaseAICalls; + // A map to hold <AnonRecord, TypeDef> relationships + std::map<DICompositeType *, DIDerivedType *> AnonRecords; + + void CheckAnonRecordType(DIDerivedType *ParentTy, DIType *Ty); + void CheckCompositeType(DIDerivedType *ParentTy, DICompositeType *CTy); + void CheckDerivedType(DIDerivedType *ParentTy, DIDerivedType *DTy); + void ResetMetadata(struct CallInfo &CInfo); bool doTransformation(Function &F); @@ -221,10 +228,80 @@ bool BPFAbstractMemberAccess::run(Function &F) { if (M->debug_compile_units().empty()) return false; + // For each argument/return/local_variable type, trace the type + // pattern like '[derived_type]* [composite_type]' to check + // and remember (anon record -> typedef) relations where the + // anon record is defined as + // typedef [const/volatile/restrict]* [anon record] + DISubprogram *SP = F.getSubprogram(); + if (SP && SP->isDefinition()) { + for (DIType *Ty: SP->getType()->getTypeArray()) + CheckAnonRecordType(nullptr, Ty); + for (const DINode *DN : SP->getRetainedNodes()) { + if (const auto *DV = dyn_cast<DILocalVariable>(DN)) + CheckAnonRecordType(nullptr, DV->getType()); + } + } + DL = &M->getDataLayout(); return doTransformation(F); } +void BPFAbstractMemberAccess::ResetMetadata(struct CallInfo &CInfo) { + if (auto Ty = dyn_cast<DICompositeType>(CInfo.Metadata)) { + if (AnonRecords.find(Ty) != AnonRecords.end()) { + if (AnonRecords[Ty] != nullptr) + CInfo.Metadata = AnonRecords[Ty]; + } + } +} + +void BPFAbstractMemberAccess::CheckCompositeType(DIDerivedType *ParentTy, + DICompositeType *CTy) { + if (!CTy->getName().empty() || !ParentTy || + ParentTy->getTag() != dwarf::DW_TAG_typedef) + return; + + if (AnonRecords.find(CTy) == AnonRecords.end()) { + AnonRecords[CTy] = ParentTy; + return; + } + + // Two or more typedef's may point to the same anon record. + // If this is the case, set the typedef DIType to be nullptr + // to indicate the duplication case. 
+ DIDerivedType *CurrTy = AnonRecords[CTy]; + if (CurrTy == ParentTy) + return; + AnonRecords[CTy] = nullptr; +} + +void BPFAbstractMemberAccess::CheckDerivedType(DIDerivedType *ParentTy, + DIDerivedType *DTy) { + DIType *BaseType = DTy->getBaseType(); + if (!BaseType) + return; + + unsigned Tag = DTy->getTag(); + if (Tag == dwarf::DW_TAG_pointer_type) + CheckAnonRecordType(nullptr, BaseType); + else if (Tag == dwarf::DW_TAG_typedef) + CheckAnonRecordType(DTy, BaseType); + else + CheckAnonRecordType(ParentTy, BaseType); +} + +void BPFAbstractMemberAccess::CheckAnonRecordType(DIDerivedType *ParentTy, + DIType *Ty) { + if (!Ty) + return; + + if (auto *CTy = dyn_cast<DICompositeType>(Ty)) + return CheckCompositeType(ParentTy, CTy); + else if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) + return CheckDerivedType(ParentTy, DTy); +} + static bool SkipDIDerivedTag(unsigned Tag, bool skipTypedef) { if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && @@ -298,6 +375,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); + ResetMetadata(CInfo); CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); CInfo.Base = Call->getArgOperand(0); return true; @@ -307,6 +385,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); + ResetMetadata(CInfo); CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); CInfo.Base = Call->getArgOperand(0); CInfo.RecordAlignment = DL->getABITypeAlign(getBaseElementType(Call)); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 
d6145f53c170..c8849bd50464 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -138,6 +138,9 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst TmpInst; if (!BTF || !BTF->InstLower(MI, TmpInst)) { diff --git a/llvm/lib/Target/BPF/BTF.h b/llvm/lib/Target/BPF/BTF.h index 4540054aaf34..89852be4a8c8 100644 --- a/llvm/lib/Target/BPF/BTF.h +++ b/llvm/lib/Target/BPF/BTF.h @@ -48,6 +48,8 @@ #ifndef LLVM_LIB_TARGET_BPF_BTF_H #define LLVM_LIB_TARGET_BPF_BTF_H +#include <cstdint> + namespace llvm { namespace BTF { diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index a98d001097bc..cb321906db03 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -31,14 +31,13 @@ using namespace llvm; namespace { class BPFMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; const MCRegisterInfo &MRI; bool IsLittleEndian; public: - BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + BPFMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri, bool IsLittleEndian) - : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {} + : MRI(mri), IsLittleEndian(IsLittleEndian) { } BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; void operator=(const BPFMCCodeEmitter &) = delete; ~BPFMCCodeEmitter() override = default; @@ -62,12 +61,6 @@ public: void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous 
namespace @@ -117,9 +110,6 @@ static uint8_t SwapBits(uint8_t Val) void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - unsigned Opcode = MI.getOpcode(); support::endian::Writer OSE(OS, IsLittleEndian ? support::little : support::big); @@ -174,5 +164,4 @@ uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op, return Encoding; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "BPFGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 5a1e251cd29c..77db5f99225e 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Host.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "BPFGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index fc190504581c..ea30e714a5b7 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -54,6 +54,7 @@ std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the BPF instructions. 
// #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "BPFGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp index 0236b22ad379..ea5b4555757e 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp @@ -141,6 +141,9 @@ void CSKYAsmPrinter::emitEndOfAsmFile(Module &M) { } void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) { + CSKY_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index 300ecceae906..8d3835b22bb0 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -153,7 +153,7 @@ def CSKYSymbol : AsmOperandClass { let ParserMethod = "parseCSKYSymbol"; } -def br_symbol : Operand<iPTR> { +def br_symbol : Operand<OtherVT> { let EncoderMethod = "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm16_scale2>"; let ParserMatchClass = CSKYSymbol; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index 3be1ca8b7998..2d7fb85e89fa 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -24,7 +24,7 @@ def CSKY_NIR : SDNode<"CSKYISD::NIR", SDTNone, // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// -def br_symbol_16bit : Operand<iPTR> { +def br_symbol_16bit : Operand<OtherVT> { let EncoderMethod = "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm10_scale2>"; let ParserMatchClass = CSKYSymbol; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp index 1a69dc8acde0..64f01cd1c9fa 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "CSKYGenInstrInfo.inc" #define GET_REGINFO_MC_DESC diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h index 4b8c45e95b74..1137b4d6e9b1 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h @@ -41,6 +41,7 @@ MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); #include "CSKYGenRegisterInfo.inc" #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "CSKYGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 4d6e1a9d3166..709279889653 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -116,7 +116,7 @@ def ThreadId :dxil_op< "ThreadId", 93, ThreadIdClass, ComputeID, "reads the thr dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read (x,y,z)"> ]>, - dxil_map_intrinsic<int_dxil_thread_id>; + dxil_map_intrinsic<int_dx_thread_id>; def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group ID (SV_GroupID)", "i32;", "rn", [ @@ -124,7 +124,7 @@ def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group 
dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read"> ]>, - dxil_map_intrinsic<int_dxil_group_id>; + dxil_map_intrinsic<int_dx_group_id>; def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeID, "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn", @@ -133,7 +133,7 @@ def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, Comp dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read (x,y,z)"> ]>, - dxil_map_intrinsic<int_dxil_thread_id_in_group>; + dxil_map_intrinsic<int_dx_thread_id_in_group>; def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeID, "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn", @@ -141,4 +141,4 @@ def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, Flattene dxil_param<0, "i32", "", "result">, dxil_param<1, "i32", "opcode", "DXIL opcode"> ]>, - dxil_map_intrinsic<int_dxil_flattened_thread_id_in_group>; + dxil_map_intrinsic<int_dx_flattened_thread_id_in_group>; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 494a71e51a89..3e09270a66d0 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -595,6 +595,10 @@ unsigned DXILBitcodeWriter::getEncodedRMWOperation(AtomicRMWInst::BinOp Op) { return bitc::RMW_FADD; case AtomicRMWInst::FSub: return bitc::RMW_FSUB; + case AtomicRMWInst::FMax: + return bitc::RMW_FMAX; + case AtomicRMWInst::FMin: + return bitc::RMW_FMIN; } } diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 48d339234e9e..1064296b0991 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ 
b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -743,6 +743,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, /// Print out a single Hexagon MI to the current output stream. void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) { + Hexagon_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst MCB; MCB.setOpcode(Hexagon::BUNDLE); MCB.addOperand(MCOperand::createImm(0)); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0b4a95bc9ce5..01501109f3b1 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1024,7 +1024,7 @@ void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { for (auto &B : MF) { auto At = findCFILocation(B); if (At) - insertCFIInstructionsAt(B, At.getValue()); + insertCFIInstructionsAt(B, At.value()); } } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index ed2856eb1fe9..9c235776c160 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -376,11 +376,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, State.Bundle = &MI; State.Index = 0; size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1; - FeatureBitset Features = computeAvailableFeatures(STI.getFeatureBits()); for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) { MCInst &HMI = const_cast<MCInst &>(*I.getInst()); - verifyInstructionPredicates(HMI, Features); EncodeSingleInstruction(HMI, OS, Fixups, STI, parseBits(Last, HMB, HMI)); State.Extended = HexagonMCInstrInfo::isImmext(HMI); @@ -793,5 +791,4 @@ MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII, return new HexagonMCCodeEmitter(MII, MCT); } -#define 
ENABLE_INSTR_PREDICATE_VERIFIER #include "HexagonGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index 9e86dc8e4989..151964bf818b 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -81,11 +81,6 @@ private: // Return parse bits for instruction `MCI' inside bundle `MCB' uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const; - - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index d068baf05998..f2d1173cd503 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -46,6 +46,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "HexagonGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index d717e710f3c0..3932077c08f1 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -110,6 +110,7 @@ unsigned HexagonConvertUnits(unsigned ItinUnits, unsigned *Lanes); // #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "HexagonGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index d715ba901a2b..33e7068622f1 100644 --- 
a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -705,14 +705,14 @@ LanaiAsmParser::parseRegister(bool RestoreOnFailure) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) { if (PercentTok && RestoreOnFailure) - Lexer.UnLex(PercentTok.getValue()); + Lexer.UnLex(PercentTok.value()); return nullptr; } Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } if (PercentTok && RestoreOnFailure) - Lexer.UnLex(PercentTok.getValue()); + Lexer.UnLex(PercentTok.value()); return nullptr; } diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp index c0b7fd3fdd5d..d142fd3a414f 100644 --- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -195,6 +195,9 @@ void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) { } void LanaiAsmPrinter::emitInstruction(const MachineInstr *MI) { + Lanai_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index eb6bf8d3836c..c43450869832 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -28,6 +28,7 @@ #include <string> #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "LanaiGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h index e8da1bc88142..93fe1a4609d8 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h @@ 
-43,6 +43,7 @@ std::unique_ptr<MCObjectTargetWriter> createLanaiELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the Lanai instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "LanaiGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index dd61bb2df077..1467d1757ff0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -27,6 +27,9 @@ using namespace llvm; #include "LoongArchGenMCPseudoLowering.inc" void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) { + LoongArch_MC::verifyInstructionPredicates( + MI->getOpcode(), getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h index 7e5aa49f227c..b51c19188051 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h @@ -39,6 +39,10 @@ public: // tblgen'erated function. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); + // Wrapper needed for tblgenned pseudo lowering. 
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this); + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 5b117d40e0a9..20448492a558 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -11,6 +11,22 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// LoongArch specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_LoongArchMOVGR2FR_W_LA64 + : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>; +def SDT_LoongArchMOVFR2GR_S_LA64 + : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; +def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; + +def loongarch_movgr2fr_w_la64 + : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; +def loongarch_movfr2gr_s_la64 + : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>; +def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>; + +//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -149,6 +165,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_S, FPR32>; def : PatFPSetcc<SETULE, FCMP_CULE_S, FPR32>; def : PatFPSetcc<SETUNE, FCMP_CUNE_S, FPR32>; def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>; +def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>; // TODO: Match signaling comparison strict_fsetccs with FCMP_S*_S instructions. 
@@ -174,4 +191,39 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_S, FSEL_S, FPR32>; def : PatFPSelectcc<SETUNE, FCMP_CUNE_S, FSEL_S, FPR32>; def : PatFPSelectcc<SETUO, FCMP_CUN_S, FSEL_S, FPR32>; +/// Loads + +defm : LdPat<load, FLD_S, f32>; + +/// Stores + +defm : StPat<store, FST_S, FPR32, f32>; + +/// Floating point constants + +def : Pat<(f32 fpimm0), (MOVGR2FR_W R0)>; +def : Pat<(f32 fpimm0neg), (FNEG_S (MOVGR2FR_W R0))>; +def : Pat<(f32 fpimm1), (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1)))>; + +// FP Conversion +def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>; } // Predicates = [HasBasicF] + +let Predicates = [HasBasicF, IsLA64] in { +// GPR -> FPR +def : Pat<(loongarch_movgr2fr_w_la64 GPR:$src), (MOVGR2FR_W GPR:$src)>; +// FPR -> GPR +def : Pat<(loongarch_movfr2gr_s_la64 FPR32:$src), + (MOVFR2GR_S FPR32:$src)>; +// int -> f32 +def : Pat<(f32 (sint_to_fp GPR:$src)), (FFINT_S_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicF, IsLA64] + +let Predicates = [HasBasicF, IsLA32] in { +// GPR -> FPR +def : Pat<(bitconvert (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>; +// FPR -> GPR +def : Pat<(i32 (bitconvert FPR32:$src)), (MOVFR2GR_S FPR32:$src)>; +// int -> f32 +def : Pat<(f32 (sint_to_fp (i32 GPR:$src))), (FFINT_S_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicF, IsLA64] diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 07fa61f4c361..bb50cec9f4c0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -131,6 +131,11 @@ def MOVGR2FR_D : FP_MOV<0b0000000100010100101010, "movgr2fr.d", FPR64, GPR>; def MOVFR2GR_D : FP_MOV<0b0000000100010100101110, "movfr2gr.d", GPR, FPR64>; } // Predicates = [HasBasicD, IsLA64] +// Instructions only available on LA32 +let Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1 in { +def MOVGR2FR_W_64 : FP_MOV<0b0000000100010100101001, "movgr2fr.w", FPR64, GPR>; +} // 
Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1 + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// @@ -164,6 +169,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_D, FPR64>; def : PatFPSetcc<SETULE, FCMP_CULE_D, FPR64>; def : PatFPSetcc<SETUNE, FCMP_CUNE_D, FPR64>; def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>; +def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>; // TODO: Match signaling comparison strict_fsetccs with FCMP_S*_D instructions. @@ -185,4 +191,52 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_D, FSEL_D, FPR64>; def : PatFPSelectcc<SETUNE, FCMP_CUNE_D, FSEL_D, FPR64>; def : PatFPSelectcc<SETUO, FCMP_CUN_D, FSEL_D, FPR64>; +/// Loads + +defm : LdPat<load, FLD_D, f64>; + +/// Stores + +defm : StPat<store, FST_D, FPR64, f64>; + +/// FP conversion operations + +def : Pat<(loongarch_ftint FPR64:$src), (FTINTRZ_W_D FPR64:$src)>; +def : Pat<(f64 (loongarch_ftint FPR64:$src)), (FTINTRZ_L_D FPR64:$src)>; +def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_L_S FPR32:$src)>; + +// f64 -> f32 +def : Pat<(f32 (fpround FPR64:$src)), (FCVT_S_D FPR64:$src)>; +// f32 -> f64 +def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>; } // Predicates = [HasBasicD] + +/// Floating point constants + +let Predicates = [HasBasicD, IsLA64] in { +def : Pat<(f64 fpimm0), (MOVGR2FR_D R0)>; +def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FR_D R0))>; +def : Pat<(f64 fpimm1), (FFINT_D_L (MOVGR2FR_D (ADDI_D R0, 1)))>; + +// Convert int to FP +def : Pat<(f64 (sint_to_fp (i64 (sexti32 (i64 GPR:$src))))), + (FFINT_D_W (MOVGR2FR_W GPR:$src))>; +def : Pat<(f64 (sint_to_fp GPR:$src)), (FFINT_D_L (MOVGR2FR_D GPR:$src))>; + +def : Pat<(f64 (uint_to_fp (i64 (zexti32 (i64 GPR:$src))))), + (FFINT_D_W (MOVGR2FR_W GPR:$src))>; + +def : Pat<(bitconvert GPR:$src), (MOVGR2FR_D GPR:$src)>; + +// Convert FP to int +def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D 
FPR64:$src)>; +} // Predicates = [HasBasicD, IsLA64] + +let Predicates = [HasBasicD, IsLA32] in { +def : Pat<(f64 fpimm0), (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0)>; +def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0))>; +def : Pat<(f64 fpimm1), (FCVT_D_S (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1))))>; + +// Convert int to FP +def : Pat<(f64 (sint_to_fp (i32 GPR:$src))), (FFINT_D_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicD, IsLA32] diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 7182d55ca3cf..0d9ec9e2eaaa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -11,7 +11,9 @@ //===----------------------------------------------------------------------===// #include "LoongArchFrameLowering.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchSubtarget.h" +#include "MCTargetDesc/LoongArchBaseInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -44,12 +46,178 @@ bool LoongArchFrameLowering::hasBP(const MachineFunction &MF) const { return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF); } +void LoongArchFrameLowering::adjustReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, + Register SrcReg, int64_t Val, + MachineInstr::MIFlag Flag) const { + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + bool IsLA64 = STI.is64Bit(); + + if (DestReg == SrcReg && Val == 0) + return; + + if (isInt<12>(Val)) { + // addi.w/d $DstReg, $SrcReg, Val + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W), DestReg) + .addReg(SrcReg) + .addImm(Val) + .setMIFlag(Flag); + return; + } + + report_fatal_error("adjustReg cannot yet handle adjustments >12 bits"); +} + +// Determine the size of the frame and maximum call frame size. 
+void LoongArchFrameLowering::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t FrameSize = MFI.getStackSize(); + + // Make sure the frame is aligned. + FrameSize = alignTo(FrameSize, getStackAlign()); + + // Update frame info. + MFI.setStackSize(FrameSize); +} + void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // TODO: Implement this when we have function calls + MachineFrameInfo &MFI = MF.getFrameInfo(); + const LoongArchRegisterInfo *RI = STI.getRegisterInfo(); + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + + Register SPReg = LoongArch::R3; + Register FPReg = LoongArch::R22; + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // Determine the correct frame layout + determineFrameLayout(MF); + + // First, compute final stack size. + uint64_t StackSize = MFI.getStackSize(); + + // Early exit if there is no need to allocate space in the stack. + if (StackSize == 0 && !MFI.adjustsStack()) + return; + + // Adjust stack. + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize". + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + + const auto &CSI = MFI.getCalleeSavedInfo(); + + // The frame pointer is callee-saved, and code has been generated for us to + // save it to the stack. We need to skip over the storing of callee-saved + // registers as the frame pointer must be modified after it has been saved + // to the stack, not before. 
+ std::advance(MBBI, CSI.size()); + + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + for (const auto &Entry : CSI) { + int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, RI->getDwarfRegNum(Entry.getReg(), true), Offset)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Generate new FP. + if (hasFP(MF)) { + adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize, MachineInstr::FrameSetup); + + // Emit ".cfi_def_cfa $fp, 0" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, RI->getDwarfRegNum(FPReg, true), 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } } void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // TODO: Implement this when we have function calls + const LoongArchRegisterInfo *RI = STI.getRegisterInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + Register SPReg = LoongArch::R3; + + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + const auto &CSI = MFI.getCalleeSavedInfo(); + // Skip to before the restores of callee-saved registers. + auto LastFrameDestroy = MBBI; + if (!CSI.empty()) + LastFrameDestroy = std::prev(MBBI, CSI.size()); + + // Get the number of bytes from FrameInfo. + uint64_t StackSize = MFI.getStackSize(); + + // Restore the stack pointer. 
+ if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects()) { + assert(hasFP(MF) && "frame pointer should not have been eliminated"); + adjustReg(MBB, LastFrameDestroy, DL, SPReg, LoongArch::R22, -StackSize, + MachineInstr::FrameDestroy); + } + + // Deallocate stack + adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); +} + +void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + // Unconditionally spill RA and FP only if the function uses a frame + // pointer. + if (hasFP(MF)) { + SavedRegs.set(LoongArch::R1); + SavedRegs.set(LoongArch::R22); + } + // Mark BP as used if function has dedicated base pointer. + if (hasBP(MF)) + SavedRegs.set(LoongArchABI::getBPReg()); +} + +StackOffset LoongArchFrameLowering::getFrameIndexReference( + const MachineFunction &MF, int FI, Register &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Callee-saved registers should be referenced relative to the stack + // pointer (positive offset), otherwise use the frame pointer (negative + // offset). 
+ const auto &CSI = MFI.getCalleeSavedInfo(); + int MinCSFI = 0; + int MaxCSFI = -1; + StackOffset Offset = + StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + + MFI.getOffsetAdjustment()); + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } + + FrameReg = RI->getFrameRegister(MF); + if ((FI >= MinCSFI && FI <= MaxCSFI) || !hasFP(MF)) { + FrameReg = LoongArch::R3; + Offset += StackOffset::getFixed(MFI.getStackSize()); + } + + return Offset; } diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index 25c53efc10f1..014b666de711 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -31,8 +31,26 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override { + return MBB.erase(MI); + } + + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; + bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; + +private: + void determineFrameLayout(MachineFunction &MF) const; + void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, Register SrcReg, + int64_t Val, MachineInstr::MIFlag Flag) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index cc9ea0255d98..bb40ff817574 100644 --- 
a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -33,13 +33,14 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { unsigned Opcode = Node->getOpcode(); MVT GRLenVT = Subtarget->getGRLenVT(); SDLoc DL(Node); + MVT VT = Node->getSimpleValueType(0); switch (Opcode) { default: break; case ISD::Constant: { int64_t Imm = cast<ConstantSDNode>(Node)->getSExtValue(); - if (Imm == 0 && Node->getSimpleValueType(0) == GRLenVT) { + if (Imm == 0 && VT == GRLenVT) { SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, LoongArch::R0, GRLenVT); ReplaceNode(Node, New.getNode()); @@ -60,6 +61,15 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Result); return; } + case ISD::FrameIndex: { + SDValue Imm = CurDAG->getTargetConstant(0, DL, GRLenVT); + int FI = cast<FrameIndexSDNode>(Node)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); + unsigned ADDIOp = + Subtarget->is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; + ReplaceNode(Node, CurDAG->getMachineNode(ADDIOp, DL, VT, TFI, Imm)); + return; + } // TODO: Add selection nodes needed later. } @@ -67,6 +77,17 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { SelectCode(Node); } +bool LoongArchDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) { + // If this is FrameIndex, select it directly. Otherwise just let it get + // selected to a register independently. 
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) + Base = + CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getGRLenVT()); + else + Base = Addr; + return true; +} + bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt) { // Shift instructions on LoongArch only read the lower 5 or 6 bits of the @@ -125,6 +146,39 @@ bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, return true; } +bool LoongArchDAGToDAGISel::selectSExti32(SDValue N, SDValue &Val) { + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { + Val = N.getOperand(0); + return true; + } + MVT VT = N.getSimpleValueType(); + if (CurDAG->ComputeNumSignBits(N) > (VT.getSizeInBits() - 32)) { + Val = N; + return true; + } + + return false; +} + +bool LoongArchDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) { + if (N.getOpcode() == ISD::AND) { + auto *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (C && C->getZExtValue() == UINT64_C(0xFFFFFFFF)) { + Val = N.getOperand(0); + return true; + } + } + MVT VT = N.getSimpleValueType(); + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), 32); + if (CurDAG->MaskedValueIsZero(N, Mask)) { + Val = N; + return true; + } + + return false; +} + // This pass converts a legalized DAG into a LoongArch-specific DAG, ready // for instruction scheduling. 
FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h index f477129d933c..7ad329a64424 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -38,6 +38,8 @@ public: void Select(SDNode *Node) override; + bool SelectBaseAddr(SDValue Addr, SDValue &Base); + bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); bool selectShiftMaskGRLen(SDValue N, SDValue &ShAmt) { return selectShiftMask(N, Subtarget->getGRLen(), ShAmt); @@ -46,6 +48,9 @@ public: return selectShiftMask(N, 32, ShAmt); } + bool selectSExti32(SDValue N, SDValue &Val); + bool selectZExti32(SDValue N, SDValue &Val); + // Include the pieces autogenerated from the target description. #include "LoongArchGenDAGISel.inc" }; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index d5a469216859..4acf90bd9788 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -17,14 +17,21 @@ #include "LoongArchRegisterInfo.h" #include "LoongArchSubtarget.h" #include "LoongArchTargetMachine.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; #define DEBUG_TYPE "loongarch-isel-lowering" +static cl::opt<bool> ZeroDivCheck( + "loongarch-check-zero-division", cl::Hidden, + cl::desc("Trap on integer division by zero."), + cl::init(false)); + LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, const LoongArchSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -37,15 +44,25 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicD()) 
addRegisterClass(MVT::f64, &LoongArch::FPR64RegClass); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT, + MVT::i1, Promote); + // TODO: add necessary setOperationAction calls later. setOperationAction(ISD::SHL_PARTS, GRLenVT, Custom); setOperationAction(ISD::SRA_PARTS, GRLenVT, Custom); setOperationAction(ISD::SRL_PARTS, GRLenVT, Custom); + setOperationAction(ISD::FP_TO_SINT, GRLenVT, Custom); + + setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, GRLenVT, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::BITCAST, MVT::i32, Custom); + if (Subtarget.hasBasicF() && !Subtarget.hasBasicD()) + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); } static const ISD::CondCode FPCCToExpand[] = {ISD::SETOGT, ISD::SETOGE, @@ -58,10 +75,19 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicD()) { setCondCodeAction(FPCCToExpand, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); } + setOperationAction(ISD::BR_CC, GRLenVT, Expand); setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); + if (!Subtarget.is64Bit()) + setLibcallName(RTLIB::MUL_I128, nullptr); + + setOperationAction(ISD::FP_TO_UINT, GRLenVT, Custom); + setOperationAction(ISD::UINT_TO_FP, GRLenVT, Custom); // Compute derived properties from the register classes. 
computeRegisterProperties(STI.getRegisterInfo()); @@ -70,11 +96,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); + setMaxAtomicSizeInBitsSupported(Subtarget.getGRLen()); + // Function alignments. const Align FunctionAlignment(4); setMinFunctionAlignment(FunctionAlignment); setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::SRL); } @@ -83,6 +112,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, switch (Op.getOpcode()) { default: report_fatal_error("unimplemented operand"); + case ISD::GlobalAddress: + return lowerGlobalAddress(Op, DAG); case ISD::SHL_PARTS: return lowerShiftLeftParts(Op, DAG); case ISD::SRA_PARTS: @@ -96,7 +127,105 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); return SDValue(); + case ISD::ConstantPool: + return lowerConstantPool(Op, DAG); + case ISD::FP_TO_SINT: + return lowerFP_TO_SINT(Op, DAG); + case ISD::BITCAST: + return lowerBITCAST(Op, DAG); + case ISD::FP_TO_UINT: + return SDValue(); + case ISD::UINT_TO_FP: + return lowerUINT_TO_FP(Op, DAG); + } +} + +SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + auto &TLI = DAG.getTargetLoweringInfo(); + SDValue Tmp1, Tmp2; + SDValue Op1 = Op.getOperand(0); + if (Op1->getOpcode() == ISD::AssertZext || + Op1->getOpcode() == ISD::AssertSext) + return Op; + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op.getOperand(0)); + SDValue Res = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Trunc); + SDNode *N = Res.getNode(); + TLI.expandUINT_TO_FP(N, Tmp1, Tmp2, DAG); + return Tmp1; +} + +SDValue LoongArchTargetLowering::lowerBITCAST(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + + if (Op.getValueType() == MVT::f32 && Op0.getValueType() == 
MVT::i32 && + Subtarget.is64Bit() && Subtarget.hasBasicF()) { + SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); + return DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, NewOp0); + } + return Op; +} + +SDValue LoongArchTargetLowering::lowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + + if (Op.getValueSizeInBits() > 32 && Subtarget.hasBasicF() && + !Subtarget.hasBasicD()) { + SDValue Dst = + DAG.getNode(LoongArchISD::FTINT, DL, MVT::f32, Op.getOperand(0)); + return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Dst); + } + + EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits()); + SDValue Trunc = DAG.getNode(LoongArchISD::FTINT, DL, FPTy, Op.getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Trunc); +} + +SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = Op.getValueType(); + ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op); + + // FIXME: Only support PC-relative addressing to access the symbol. + // Target flags will be added later. + if (!isPositionIndependent()) { + SDValue ConstantN = DAG.getTargetConstantPool( + N->getConstVal(), Ty, N->getAlign(), N->getOffset()); + SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, ConstantN), + 0); + SDValue Addr(DAG.getMachineNode(Subtarget.is64Bit() ? LoongArch::ADDI_D + : LoongArch::ADDI_W, + DL, Ty, AddrHi, ConstantN), + 0); + return Addr; } + report_fatal_error("Unable to lower ConstantPool"); +} + +SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + unsigned ADDIOp = Subtarget.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; + + // FIXME: Only support PC-relative addressing to access the symbol. + // TODO: Add target flags. 
+ if (!isPositionIndependent()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty); + SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0); + SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0); + return Addr; + } + report_fatal_error("Unable to lowerGlobalAddress"); } SDValue LoongArchTargetLowering::lowerShiftLeftParts(SDValue Op, @@ -238,6 +367,36 @@ void LoongArchTargetLowering::ReplaceNodeResults( break; } break; + case ISD::FP_TO_SINT: { + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + SDValue Src = N->getOperand(0); + EVT VT = EVT::getFloatingPointVT(N->getValueSizeInBits(0)); + SDValue Dst = DAG.getNode(LoongArchISD::FTINT, DL, VT, Src); + Results.push_back(DAG.getNode(ISD::BITCAST, DL, N->getValueType(0), Dst)); + break; + } + case ISD::BITCAST: { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + if (VT == MVT::i32 && SrcVT == MVT::f32 && Subtarget.is64Bit() && + Subtarget.hasBasicF()) { + SDValue Dst = + DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Src); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Dst)); + } + break; + } + case ISD::FP_TO_UINT: { + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + auto &TLI = DAG.getTargetLoweringInfo(); + SDValue Tmp1, Tmp2; + TLI.expandFP_TO_UINT(N, Tmp1, Tmp2, DAG); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Tmp1)); + break; + } } } @@ -345,6 +504,224 @@ static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + MVT GRLenVT = Subtarget.getGRLenVT(); + EVT ValTy = N->getValueType(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *CN0, *CN1; + SDLoc DL(N); + unsigned ValBits = 
ValTy.getSizeInBits(); + unsigned MaskIdx0, MaskLen0, MaskIdx1, MaskLen1; + unsigned Shamt; + bool SwapAndRetried = false; + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (ValBits != 32 && ValBits != 64) + return SDValue(); + +Retry: + // 1st pattern to match BSTRINS: + // R = or (and X, mask0), (and (shl Y, lsb), mask1) + // where mask1 = (2**size - 1) << lsb, mask0 = ~mask1 + // => + // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1) + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) && + MaskIdx0 == MaskIdx1 && MaskLen0 == MaskLen1 && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskIdx0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 1\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0).getOperand(0), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 2nd pattern to match BSTRINS: + // R = or (and X, mask0), (shl (and Y, mask1), lsb) + // where mask1 = (2**size - 1), mask0 = ~(mask1 << lsb) + // => + // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1) + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskIdx0 && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) && + MaskLen0 == 
MaskLen1 && MaskIdx1 == 0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 2\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0).getOperand(0), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 3rd pattern to match BSTRINS: + // R = or (and X, mask0), (and Y, mask1) + // where ~mask0 = (2**size - 1) << lsb, mask0 & mask1 = 0 + // => + // R = BSTRINS X, (shr (and Y, mask1), lsb), msb, lsb + // where msb = lsb + size - 1 + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + (MaskIdx0 + MaskLen0 <= 64) && + (CN1 = dyn_cast<ConstantSDNode>(N1->getOperand(1))) && + (CN1->getSExtValue() & CN0->getSExtValue()) == 0) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 3\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + DAG.getNode(ISD::SRL, DL, N1->getValueType(0), N1, + DAG.getConstant(MaskIdx0, DL, GRLenVT)), + DAG.getConstant(ValBits == 32 + ? 
(MaskIdx0 + (MaskLen0 & 31) - 1) + : (MaskIdx0 + MaskLen0 - 1), + DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 4th pattern to match BSTRINS: + // R = or (and X, mask), (shl Y, shamt) + // where mask = (2**shamt - 1) + // => + // R = BSTRINS X, Y, ValBits - 1, shamt + // where ValBits = 32 or 64 + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::SHL && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(CN0->getZExtValue(), MaskIdx0, MaskLen0) && + MaskIdx0 == 0 && (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskLen0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 4\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0), + DAG.getConstant((ValBits - 1), DL, GRLenVT), + DAG.getConstant(Shamt, DL, GRLenVT)); + } + + // 5th pattern to match BSTRINS: + // R = or (and X, mask), const + // where ~mask = (2**size - 1) << lsb, mask & const = 0 + // => + // R = BSTRINS X, (const >> lsb), msb, lsb + // where msb = lsb + size - 1 + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + (CN1 = dyn_cast<ConstantSDNode>(N1)) && + (CN1->getSExtValue() & CN0->getSExtValue()) == 0) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 5\n"); + return DAG.getNode( + LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 6th pattern. + // a = b | ((c & mask) << shamt), where all positions in b to be overwritten + // by the incoming bits are known to be zero. + // => + // a = BSTRINS b, c, shamt + MaskLen - 1, shamt + // + // Note that the 1st pattern is a special situation of the 6th, i.e. 
the 6th + // pattern is more common than the 1st. So we put the 1st before the 6th in + // order to match as many nodes as possible. + ConstantSDNode *CNMask, *CNShamt; + unsigned MaskIdx, MaskLen; + if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) && + MaskIdx == 0 && (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + CNShamt->getZExtValue() + MaskLen <= ValBits) { + Shamt = CNShamt->getZExtValue(); + APInt ShMask(ValBits, CNMask->getZExtValue() << Shamt); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 6\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + N1.getOperand(0).getOperand(0), + DAG.getConstant(Shamt + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(Shamt, DL, GRLenVT)); + } + } + + // 7th pattern. + // a = b | ((c << shamt) & shifted_mask), where all positions in b to be + // overwritten by the incoming bits are known to be zero. + // => + // a = BSTRINS b, c, MaskIdx + MaskLen - 1, MaskIdx + // + // Similarly, the 7th pattern is more common than the 2nd. So we put the 2nd + // before the 7th in order to match as many nodes as possible. 
+ if (N1.getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) && + N1.getOperand(0).getOpcode() == ISD::SHL && + (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + CNShamt->getZExtValue() == MaskIdx) { + APInt ShMask(ValBits, CNMask->getZExtValue()); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 7\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + N1.getOperand(0).getOperand(0), + DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(MaskIdx, DL, GRLenVT)); + } + } + + // (or a, b) and (or b, a) are equivalent, so swap the operands and retry. + if (!SwapAndRetried) { + std::swap(N0, N1); + SwapAndRetried = true; + goto Retry; + } + + SwapAndRetried = false; +Retry2: + // 8th pattern. + // a = b | (c & shifted_mask), where all positions in b to be overwritten by + // the incoming bits are known to be zero. + // => + // a = BSTRINS b, c >> MaskIdx, MaskIdx + MaskLen - 1, MaskIdx + // + // Similarly, the 8th pattern is more common than the 4th and 5th patterns. So + // we put it here in order to match as many nodes as possible or generate less + // instructions. + if (N1.getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen)) { + APInt ShMask(ValBits, CNMask->getZExtValue()); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 8\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + DAG.getNode(ISD::SRL, DL, N1->getValueType(0), + N1->getOperand(0), + DAG.getConstant(MaskIdx, DL, GRLenVT)), + DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(MaskIdx, DL, GRLenVT)); + } + } + // Swap N0/N1 and retry. 
+ if (!SwapAndRetried) { + std::swap(N0, N1); + SwapAndRetried = true; + goto Retry2; + } + + return SDValue(); +} + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -353,12 +730,62 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::AND: return performANDCombine(N, DAG, DCI, Subtarget); + case ISD::OR: + return performORCombine(N, DAG, DCI, Subtarget); case ISD::SRL: return performSRLCombine(N, DAG, DCI, Subtarget); } return SDValue(); } +static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI, + MachineBasicBlock &MBB, + const TargetInstrInfo &TII) { + if (!ZeroDivCheck) + return &MBB; + + // Build instructions: + // div(or mod) $dst, $dividend, $divisor + // bnez $divisor, 8 + // break 7 + // fallthrough + MachineOperand &Divisor = MI.getOperand(2); + auto FallThrough = std::next(MI.getIterator()); + + BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BNEZ)) + .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill())) + .addImm(8); + + // See linux header file arch/loongarch/include/uapi/asm/break.h for the + // definition of BRK_DIVZERO. + BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BREAK)) + .addImm(7/*BRK_DIVZERO*/); + + // Clear Divisor's kill flag. 
+ Divisor.setIsKill(false); + + return &MBB; +} + +MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instr type to insert"); + case LoongArch::DIV_W: + case LoongArch::DIV_WU: + case LoongArch::MOD_W: + case LoongArch::MOD_WU: + case LoongArch::DIV_D: + case LoongArch::DIV_DU: + case LoongArch::MOD_D: + case LoongArch::MOD_DU: + return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo()); + break; + } +} + const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((LoongArchISD::NodeType)Opcode) { case LoongArchISD::FIRST_NUMBER: @@ -369,11 +796,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { return "LoongArchISD::" #node; // TODO: Add more target-dependent nodes later. + NODE_NAME_CASE(CALL) NODE_NAME_CASE(RET) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) + NODE_NAME_CASE(BSTRINS) NODE_NAME_CASE(BSTRPICK) + NODE_NAME_CASE(MOVGR2FR_W_LA64) + NODE_NAME_CASE(MOVFR2GR_S_LA64) + NODE_NAME_CASE(FTINT) } #undef NODE_NAME_CASE return nullptr; @@ -483,6 +915,132 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( return Chain; } +// Lower a call to a callseq_start + CALL + callseq_end chain, and add input +// and output parameter nodes. 
+SDValue +LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + CLI.IsTailCall = false; + + if (IsVarArg) + report_fatal_error("LowerCall with varargs not implemented"); + + MachineFunction &MF = DAG.getMachineFunction(); + + // Analyze the operands of the call, assigning locations to each operand. + SmallVector<CCValAssign> ArgLocs; + CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + analyzeOutputArgs(ArgCCInfo, Outs, CC_LoongArch); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = ArgCCInfo.getNextStackOffset(); + + for (auto &Arg : Outs) { + if (!Arg.Flags.isByVal()) + continue; + report_fatal_error("Passing arguments byval not implemented"); + } + + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); + + // Copy argument values to their designated locations. + SmallVector<std::pair<Register, SDValue>> RegsToPass; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue = OutVals[i]; + + // Promote the value if needed. + // For now, only handle fully promoted arguments. + if (VA.getLocInfo() != CCValAssign::Full) + report_fatal_error("Unknown loc info"); + + if (VA.isRegLoc()) { + // Queue up the argument copies and emit them at the end. + RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); + } else { + report_fatal_error("Passing arguments via the stack not implemented"); + } + } + + SDValue Glue; + + // Build a sequence of copy-to-reg nodes, chained and glued together. 
+ for (auto &Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue); + Glue = Chain.getValue(1); + } + + // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a + // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't + // split it and then direct call can be matched by PseudoCALL. + // FIXME: Add target flags for relocation. + if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT); + else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT); + + // The first call operand is the chain and the second is the target address. + SmallVector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (auto &Reg : RegsToPass) + Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + // Glue the call to the argument copies, if any. + if (Glue.getNode()) + Ops.push_back(Glue); + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + Glue = Chain.getValue(1); + + // Mark the end of the call, which is glued to the call itself. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true), + DAG.getConstant(0, DL, PtrVT, true), Glue, DL); + Glue = Chain.getValue(1); + + // Assign locations to each value returned by this call. 
+ SmallVector<CCValAssign> RVLocs; + CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); + analyzeInputArgs(RetCCInfo, Ins, CC_LoongArch); + + // Copy all of the result registers out of their specified physreg. + for (auto &VA : RVLocs) { + // Copy the value out. + SDValue RetValue = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); + Chain = RetValue.getValue(1); + Glue = RetValue.getValue(2); + + InVals.push_back(Chain.getValue(0)); + } + + return Chain; +} + bool LoongArchTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { @@ -529,3 +1087,14 @@ SDValue LoongArchTargetLowering::LowerReturn( return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); } + +bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + assert((VT == MVT::f32 || VT == MVT::f64) && "Unexpected VT"); + + if (VT == MVT::f32 && !Subtarget.hasBasicF()) + return false; + if (VT == MVT::f64 && !Subtarget.hasBasicD()) + return false; + return (Imm.isZero() || Imm.isExactlyValue(+1.0)); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index c852577a3744..279550482675 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -27,6 +27,7 @@ enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, // TODO: add more LoongArchISDs + CALL, RET, // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. 
@@ -34,6 +35,13 @@ enum NodeType : unsigned { SRA_W, SRL_W, + // FPR<->GPR transfer operations + MOVGR2FR_W_LA64, + MOVFR2GR_S_LA64, + + FTINT, + + BSTRINS, BSTRPICK, }; @@ -72,6 +80,8 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; private: /// Target-specific function used to lower LoongArch calling conventions. @@ -86,8 +96,24 @@ private: const SmallVectorImpl<ISD::OutputArg> &Outs, LoongArchCCAssignFn Fn) const; + SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; + SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; + + bool shouldInsertFencesForAtomic(const Instruction *I) const override { + return isa<LoadInst>(I) || isa<StoreInst>(I); + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 146ef53befd5..bcbd4b28f3c7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -12,6 +12,7 @@ #include "LoongArchInstrInfo.h" #include "LoongArch.h" +#include "LoongArchMachineFunctionInfo.h" using namespace llvm; @@ -19,8 +20,8 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) - // 
FIXME: add CFSetup and CFDestroy Inst when we implement function call. - : LoongArchGenInstrInfo() {} + : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN, + LoongArch::ADJCALLSTACKUP) {} void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -47,3 +48,68 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, get(Opc), DstReg) .addReg(SrcReg, getKillRegState(KillSrc)); } + +void LoongArchInstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, + bool IsKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + unsigned Opcode; + if (LoongArch::GPRRegClass.hasSubClassEq(RC)) + Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + ? LoongArch::ST_W + : LoongArch::ST_D; + else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FST_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FST_D; + else + llvm_unreachable("Can't store this register to stack slot"); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void LoongArchInstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + unsigned Opcode; + if (LoongArch::GPRRegClass.hasSubClassEq(RC)) + Opcode = 
TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + ? LoongArch::LD_W + : LoongArch::LD_D; + else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FLD_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FLD_D; + else + llvm_unreachable("Can't load this register from stack slot"); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode), DstReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f31943b85a51..0a8c86a5e0c2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -30,6 +30,16 @@ public: void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, Register SrcReg, + bool IsKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, Register DstReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 6b8ee9e43f94..d07d086bd7da 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -14,22 +14,45 @@ // LoongArch specific DAG Nodes. //===----------------------------------------------------------------------===// +// Target-independent type requirements, but with target-specific formats. 
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + // Target-dependent type requirements. +def SDT_LoongArchCall : SDTypeProfile<0, -1, [SDTCisVT<0, GRLenVT>]>; def SDT_LoongArchIntBinOpW : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64> ]>; +def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [ + SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, + SDTCisSameAs<3, 4> +]>; + def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3> ]>; // TODO: Add LoongArch specific DAG Nodes +// Target-independent nodes, but with target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + // Target-dependent nodes. +def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; +def loongarch_bstrins + : SDNode<"LoongArchISD::BSTRINS", SDT_LoongArchBStrIns>; def loongarch_bstrpick : SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>; @@ -106,7 +129,14 @@ def simm16 : Operand<GRLenVT> { let DecoderMethod = "decodeSImmOperand<16>"; } -def simm16_lsl2 : Operand<GRLenVT> { +def simm16_lsl2 : Operand<GRLenVT>, + ImmLeaf<GRLenVT, [{return isInt<16>(Imm>>2);}]> { + let ParserMatchClass = SImmAsmOperand<16, "lsl2">; + let EncoderMethod = "getImmOpValueAsr2"; + let DecoderMethod = "decodeSImmOperand<16, 2>"; +} + 
+def simm16_lsl2_br : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<16, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<16, 2>"; @@ -117,13 +147,13 @@ def simm20 : Operand<GRLenVT> { let DecoderMethod = "decodeSImmOperand<20>"; } -def simm21_lsl2 : Operand<GRLenVT> { +def simm21_lsl2 : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<21, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<21, 2>"; } -def simm26_lsl2 : Operand<GRLenVT> { +def simm26_lsl2 : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<26, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<26, 2>"; @@ -141,6 +171,24 @@ def NegImm : SDNodeXForm<imm, [{ N->getValueType(0)); }]>; +// FP immediate patterns. +def fpimm0 : PatLeaf<(fpimm), [{return N->isExactlyValue(+0.0);}]>; +def fpimm0neg : PatLeaf<(fpimm), [{return N->isExactlyValue(-0.0);}]>; +def fpimm1 : PatLeaf<(fpimm), [{return N->isExactlyValue(+1.0);}]>; + +def CallSymbol: AsmOperandClass { + let Name = "CallSymbol"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isImm"; +} + +// A bare symbol used in call only. 
+def call_symbol : Operand<iPTR> { + let ParserMatchClass = CallSymbol; +} + +def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -185,7 +233,7 @@ class RDTIME_2R<bits<22> op, string opstr> : Fmt2R<op, (outs GPR:$rd, GPR:$rj), (ins), opstr, "$rd, $rj">; class BrCC_2RI16<bits<6> op, string opstr> - : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2:$imm16), opstr, + : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2_br:$imm16), opstr, "$rj, $rd, $imm16"> { let isBranch = 1; let isTerminator = 1; @@ -274,10 +322,12 @@ def XORI : ALU_2RI12<0b0000001111, "xori", uimm12>; def MUL_W : ALU_3R<0b00000000000111000, "mul.w">; def MULH_W : ALU_3R<0b00000000000111001, "mulh.w">; def MULH_WU : ALU_3R<0b00000000000111010, "mulh.wu">; +let usesCustomInserter = true in { def DIV_W : ALU_3R<0b00000000001000000, "div.w">; def MOD_W : ALU_3R<0b00000000001000001, "mod.w">; def DIV_WU : ALU_3R<0b00000000001000010, "div.wu">; def MOD_WU : ALU_3R<0b00000000001000011, "mod.wu">; +} // usesCustomInserter = true // Bit-shift Instructions def SLL_W : ALU_3R<0b00000000000101110, "sll.w">; @@ -379,10 +429,12 @@ def MULH_D : ALU_3R<0b00000000000111100, "mulh.d">; def MULH_DU : ALU_3R<0b00000000000111101, "mulh.du">; def MULW_D_W : ALU_3R<0b00000000000111110, "mulw.d.w">; def MULW_D_WU : ALU_3R<0b00000000000111111, "mulw.d.wu">; +let usesCustomInserter = true in { def DIV_D : ALU_3R<0b00000000001000100, "div.d">; def MOD_D : ALU_3R<0b00000000001000101, "mod.d">; def DIV_DU : ALU_3R<0b00000000001000110, "div.du">; def MOD_DU : ALU_3R<0b00000000001000111, "mod.du">; +} // usesCustomInserter = true // Bit-shift Instructions for 64-bits def SLL_D : ALU_3R<0b00000000000110001, "sll.d">; @@ -545,6 +597,9 @@ def shiftMaskGRLen : ComplexPattern<GRLenVT, 1, "selectShiftMaskGRLen", [], [], 0>; def 
shiftMask32 : ComplexPattern<i64, 1, "selectShiftMask32", [], [], 0>; +def sexti32 : ComplexPattern<i64, 1, "selectSExti32">; +def zexti32 : ComplexPattern<i64, 1, "selectZExti32">; + class shiftop<SDPatternOperator operator> : PatFrag<(ops node:$val, node:$count), (operator node:$val, (GRLenVT (shiftMaskGRLen node:$count)))>; @@ -556,6 +611,13 @@ let Predicates = [IsLA32] in { def : PatGprGpr<add, ADD_W>; def : PatGprImm<add, ADDI_W, simm12>; def : PatGprGpr<sub, SUB_W>; +def : PatGprGpr<sdiv, DIV_W>; +def : PatGprGpr<udiv, DIV_WU>; +def : PatGprGpr<srem, MOD_W>; +def : PatGprGpr<urem, MOD_WU>; +def : PatGprGpr<mul, MUL_W>; +def : PatGprGpr<mulhs, MULH_W>; +def : PatGprGpr<mulhu, MULH_WU>; } // Predicates = [IsLA32] let Predicates = [IsLA64] in { @@ -565,6 +627,24 @@ def : PatGprImm<add, ADDI_D, simm12>; def : PatGprImm_32<add, ADDI_W, simm12>; def : PatGprGpr<sub, SUB_D>; def : PatGprGpr_32<sub, SUB_W>; +def : PatGprGpr<sdiv, DIV_D>; +def : PatGprGpr<udiv, DIV_DU>; +def : PatGprGpr<srem, MOD_D>; +def : PatGprGpr<urem, MOD_DU>; +// TODO: Select "_W[U]" instructions for i32xi32 if only lower 32 bits of the +// product are used. +def : PatGprGpr<mul, MUL_D>; +def : PatGprGpr<mulhs, MULH_D>; +def : PatGprGpr<mulhu, MULH_DU>; +// Select MULW_D_W for calculating the full 64 bits product of i32xi32 signed +// multiplication. +def : Pat<(i64 (mul (sext_inreg GPR:$rj, i32), (sext_inreg GPR:$rk, i32))), + (MULW_D_W GPR:$rj, GPR:$rk)>; +// Select MULW_D_WU for calculating the full 64 bits product of i32xi32 +// unsigned multiplication. 
+def : Pat<(i64 (mul (loongarch_bstrpick GPR:$rj, (i64 31), (i64 0)), + (loongarch_bstrpick GPR:$rk, (i64 31), (i64 0)))), + (MULW_D_WU GPR:$rj, GPR:$rk)>; } // Predicates = [IsLA64] def : PatGprGpr<and, AND>; @@ -649,19 +729,143 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f), /// Branches and jumps +class BccPat<PatFrag CondOp, LAInst Inst> + : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16), + (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; + +def : BccPat<seteq, BEQ>; +def : BccPat<setne, BNE>; +def : BccPat<setlt, BLT>; +def : BccPat<setge, BGE>; +def : BccPat<setult, BLTU>; +def : BccPat<setuge, BGEU>; + +class BccSwapPat<PatFrag CondOp, LAInst InstBcc> + : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16), + (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>; + +// Condition codes that don't have matching LoongArch branch instructions, but +// are trivially supported by swapping the two input operands. +def : BccSwapPat<setgt, BLT>; +def : BccSwapPat<setle, BGE>; +def : BccSwapPat<setugt, BLTU>; +def : BccSwapPat<setule, BGEU>; + +// An extra pattern is needed for a brcond without a setcc (i.e. where the +// condition was calculated elsewhere). 
+def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>; + +let isBarrier = 1, isBranch = 1, isTerminator = 1 in +def PseudoBR : Pseudo<(outs), (ins simm26_lsl2:$imm26), [(br bb:$imm26)]>, + PseudoInstExpansion<(B simm26_lsl2:$imm26)>; + +let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in +def PseudoBRIND : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16), []>, + PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>; + +def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; +def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; + +let isCall = 1, Defs = [R1] in +def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> { + let AsmString = "bl\t$func"; +} + +def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; +def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; + +let isCall = 1, Defs = [R1] in +def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj), + [(loongarch_call GPR:$rj)]>, + PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; + let isBarrier = 1, isReturn = 1, isTerminator = 1 in def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, PseudoInstExpansion<(JIRL R0, R1, 0)>; -/// BSTRPICK +/// BSTRINS and BSTRPICK -let Predicates = [IsLA32] in +let Predicates = [IsLA32] in { +def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd), + (BSTRINS_W GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>; def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd), (BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>; +} // Predicates = [IsLA32] -let Predicates = [IsLA64] in +let Predicates = [IsLA64] in { +def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + (BSTRINS_D GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>; def : Pat<(loongarch_bstrpick GPR:$rj, uimm6:$msbd, uimm6:$lsbd), (BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>; +} // Predicates = [IsLA64] + +/// Loads + +multiclass LdPat<PatFrag 
LoadOp, LAInst Inst, ValueType vt = GRLenVT> { + def : Pat<(vt (LoadOp BaseAddr:$rj)), (Inst BaseAddr:$rj, 0)>; + def : Pat<(vt (LoadOp (add BaseAddr:$rj, simm12:$imm12))), + (Inst BaseAddr:$rj, simm12:$imm12)>; +} + +defm : LdPat<sextloadi8, LD_B>; +defm : LdPat<extloadi8, LD_B>; +defm : LdPat<sextloadi16, LD_H>; +defm : LdPat<extloadi16, LD_H>; +defm : LdPat<load, LD_W>, Requires<[IsLA32]>; +defm : LdPat<zextloadi8, LD_BU>; +defm : LdPat<zextloadi16, LD_HU>; +let Predicates = [IsLA64] in { +defm : LdPat<sextloadi32, LD_W, i64>; +defm : LdPat<extloadi32, LD_W, i64>; +defm : LdPat<zextloadi32, LD_WU, i64>; +defm : LdPat<load, LD_D, i64>; +} // Predicates = [IsLA64] + +/// Stores + +multiclass StPat<PatFrag StoreOp, LAInst Inst, RegisterClass StTy, + ValueType vt> { + def : Pat<(StoreOp (vt StTy:$rd), BaseAddr:$rj), + (Inst StTy:$rd, BaseAddr:$rj, 0)>; + def : Pat<(StoreOp (vt StTy:$rd), (add BaseAddr:$rj, simm12:$imm12)), + (Inst StTy:$rd, BaseAddr:$rj, simm12:$imm12)>; +} + +defm : StPat<truncstorei8, ST_B, GPR, GRLenVT>; +defm : StPat<truncstorei16, ST_H, GPR, GRLenVT>; +defm : StPat<store, ST_W, GPR, i32>, Requires<[IsLA32]>; +let Predicates = [IsLA64] in { +defm : StPat<truncstorei32, ST_W, GPR, i64>; +defm : StPat<store, ST_D, GPR, i64>; +} // Predicates = [IsLA64] + +/// Atomic loads and stores + +def : Pat<(atomic_fence timm, timm), (DBAR 0)>; + +defm : LdPat<atomic_load_8, LD_B>; +defm : LdPat<atomic_load_16, LD_H>; +defm : LdPat<atomic_load_32, LD_W>; + +defm : StPat<atomic_store_8, ST_B, GPR, GRLenVT>; +defm : StPat<atomic_store_16, ST_H, GPR, GRLenVT>; +defm : StPat<atomic_store_32, ST_W, GPR, i32>, Requires<[IsLA32]>; +let Predicates = [IsLA64] in { +defm : LdPat<atomic_load_64, LD_D>; +defm : StPat<atomic_store_32, ST_W, GPR, i64>; +defm : StPat<atomic_store_64, ST_D, GPR, i64>; +} // Predicates = [IsLA64] + +/// Other pseudo-instructions + +// Pessimistically assume the stack pointer will be clobbered +let Defs = [R3], Uses = [R3] in { +def 
ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [R3], Uses = [R3] //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index 7416c93b4d05..488c66f47863 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -22,6 +22,22 @@ using namespace llvm; +static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, + const AsmPrinter &AP) { + MCContext &Ctx = AP.OutContext; + + // TODO: Processing target flags. + + const MCExpr *ME = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx); + + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) + ME = MCBinaryExpr::createAdd( + ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + + return MCOperand::createExpr(ME); +} + bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP) { @@ -41,12 +57,21 @@ bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, case MachineOperand::MO_Immediate: MCOp = MCOperand::createImm(MO.getImm()); break; - // TODO: lower special operands - case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_ConstantPoolIndex: + MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP); + break; case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_BlockAddress: + MCOp = lowerSymbolOperand(MO, AP.getSymbolPreferLocal(*MO.getGlobal()), AP); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP); + break; case MachineOperand::MO_ExternalSymbol: - case 
MachineOperand::MO_ConstantPoolIndex: + MCOp = lowerSymbolOperand( + MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP); + break; + // TODO: lower special operands + case MachineOperand::MO_BlockAddress: case MachineOperand::MO_JumpTableIndex: break; } diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index b9bae8e56304..05902ebb7ba6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -110,6 +110,28 @@ void LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { + // TODO: this implementation is a temporary placeholder which does just + // enough to allow other aspects of code generation to be tested. + assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); - // TODO: Implement this when we have function calls + + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + DebugLoc DL = MI.getDebugLoc(); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + Register FrameReg; + StackOffset Offset = + TFI->getFrameIndexReference(MF, FrameIndex, FrameReg) + + StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); + + // Offsets must be encodable with a 12-bit immediate field. 
+ if (!isInt<12>(Offset.getFixed())) { + report_fatal_error("Frame offsets outside of the signed 12-bit range is " + "not supported currently"); + } + + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 3a1a46a9e624..468c4f43cb90 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -102,6 +102,7 @@ public: return getTM<LoongArchTargetMachine>(); } + void addIRPasses() override; bool addInstSelector() override; }; } // namespace @@ -111,6 +112,12 @@ LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) { return new LoongArchPassConfig(*this, PM); } +void LoongArchPassConfig::addIRPasses() { + addPass(createAtomicExpandPass()); + + TargetPassConfig::addIRPasses(); +} + bool LoongArchPassConfig::addInstSelector() { addPass(createLoongArchISelDag(getLoongArchTargetMachine())); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index c733c194e6a2..e50761ab1e27 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Compiler.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "LoongArchGenInstrInfo.inc" #define GET_REGINFO_MC_DESC diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h index e576b9a49cd6..a606ccdbc47c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -46,6 +46,7 @@ createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); // 
Defines symbolic names for LoongArch instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "LoongArchGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp index 3bcce9e3ba3b..4933d40f3388 100644 --- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp +++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp @@ -77,6 +77,9 @@ bool M68kAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } void M68kAsmPrinter::emitInstruction(const MachineInstr *MI) { + M68k_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + switch (MI->getOpcode()) { default: { if (MI->isPseudo()) { diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp index 2606e22410fc..e6290d4cbec5 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp @@ -31,6 +31,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "M68kGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h index 0dc601ad876b..2a1cc678016a 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h @@ -52,6 +52,7 @@ std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the M68k instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "M68kGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index 3f006056955d..13a880de68b5 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -22,6 +22,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "MSP430GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index 24b0b3298592..e596c3f1ce46 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -53,6 +53,7 @@ createMSP430ELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the MSP430 instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "MSP430GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp index 85c59d5b14b5..9cd2cbe89e46 100644 --- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -149,6 +149,9 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, //===----------------------------------------------------------------------===// void MSP430AsmPrinter::emitInstruction(const MachineInstr *MI) { + MSP430_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MSP430MCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 6fc8fcb482cd..40c807082fdc 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -36,6 +36,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "MipsGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 8531177ee924..d51f3b9abcfd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -55,6 +55,7 @@ StringRef selectMipsCPU(const Triple &TT, StringRef CPU); // Defines symbolic names for the Mips instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "MipsGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 9330a791a7cc..fcaf450cc511 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -181,6 +181,10 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, } void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the test pass. + // Mips_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + MipsTargetStreamer &TS = getTargetStreamer(); unsigned Opc = MI->getOpcode(); TS.forbidModuleDirective(); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 856d03f0b210..0ba29fb48b05 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -23,6 +23,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "NVPTXGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h index b394566edd0d..78f4e6745502 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h @@ -21,6 +21,7 @@ // Defines symbolic names for the PTX instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "NVPTXGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 41e9f375e536..8c92766faecb 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -183,6 +183,7 @@ enum CmpMode { // Defines symbolic names for the NVPTX instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "NVPTXGenInstrInfo.inc" #endif diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index b1d842122060..9977d8ba0300 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -139,6 +139,9 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV, } void NVPTXAsmPrinter::emitInstruction(const MachineInstr *MI) { + NVPTX_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst Inst; lowerToMCInst(MI, Inst); EmitToStreamer(*OutStreamer, Inst); diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 2201eb19c80f..b4f7a64f144a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -270,10 +270,6 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, // ShuffleVector return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1], NewOperands[2]); - case Instruction::InsertValue: - // InsertValueConstantExpr - return Builder.CreateInsertValue(NewOperands[0], NewOperands[1], - C->getIndices()); case Instruction::GetElementPtr: // GetElementPtrConstantExpr return Builder.CreateGEP(cast<GEPOperator>(C)->getSourceElementType(), diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 746f652bfa36..6ad016dfa0a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ 
b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1861,7 +1861,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Ret.getValue(2); if (ProxyRegTruncates[i]) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); + Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].value(), Ret); } InVals.push_back(Ret); diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index eeedce2d99cb..202134ed7035 100644 --- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -35,6 +35,8 @@ public: bool runOnFunction(Function &F) override; + StringRef getPassName() const override { return "NVPTX Image Optimizer"; } + private: bool replaceIsTypePSampler(Instruction &I); bool replaceIsTypePSurface(Instruction &I); diff --git a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 16fbe1a65562..7929bd2e0df0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -36,6 +36,8 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "NVPTX Prolog Epilog Pass"; } + private: void calculateFrameObjectOffsets(MachineFunction &Fn); }; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 2d6d72777db2..4e41515b997d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -18,7 +18,6 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" #include <algorithm> #include <cstring> @@ -32,19 +31,27 @@ namespace llvm { namespace { typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t; typedef std::map<const GlobalValue *, key_val_pair_t> 
global_val_annot_t; -typedef std::map<const Module *, global_val_annot_t> per_module_annot_t; -} // anonymous namespace -static ManagedStatic<per_module_annot_t> annotationCache; -static sys::Mutex Lock; +struct AnnotationCache { + sys::Mutex Lock; + std::map<const Module *, global_val_annot_t> Cache; +}; + +AnnotationCache &getAnnotationCache() { + static AnnotationCache AC; + return AC; +} +} // anonymous namespace void clearAnnotationCache(const Module *Mod) { - std::lock_guard<sys::Mutex> Guard(Lock); - annotationCache->erase(Mod); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); + AC.Cache.erase(Mod); } static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); assert(md && "Invalid mdnode for annotation"); assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); // start index = 1, to skip the global variable key @@ -70,7 +77,8 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { } static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations"); if (!NMD) return; @@ -93,40 +101,42 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { if (tmp.empty()) // no annotations for this gv return; - if ((*annotationCache).find(m) != (*annotationCache).end()) - (*annotationCache)[m][gv] = std::move(tmp); + if (AC.Cache.find(m) != AC.Cache.end()) + AC.Cache[m][gv] = std::move(tmp); else { global_val_annot_t tmp1; tmp1[gv] = std::move(tmp); - (*annotationCache)[m] = std::move(tmp1); + AC.Cache[m] = std::move(tmp1); } } bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, unsigned &retval) { - std::lock_guard<sys::Mutex> 
Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); const Module *m = gv->getParent(); - if ((*annotationCache).find(m) == (*annotationCache).end()) + if (AC.Cache.find(m) == AC.Cache.end()) cacheAnnotationFromMD(m, gv); - else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end()) + else if (AC.Cache[m].find(gv) == AC.Cache[m].end()) cacheAnnotationFromMD(m, gv); - if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end()) + if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end()) return false; - retval = (*annotationCache)[m][gv][prop][0]; + retval = AC.Cache[m][gv][prop][0]; return true; } bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); const Module *m = gv->getParent(); - if ((*annotationCache).find(m) == (*annotationCache).end()) + if (AC.Cache.find(m) == AC.Cache.end()) cacheAnnotationFromMD(m, gv); - else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end()) + else if (AC.Cache[m].find(gv) == AC.Cache[m].end()) cacheAnnotationFromMD(m, gv); - if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end()) + if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end()) return false; - retval = (*annotationCache)[m][gv][prop]; + retval = AC.Cache[m][gv][prop]; return true; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 46bbc44e1681..fa9e69f2e607 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -449,12 +449,9 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, return MO.getImm(); } -void PPCMCCodeEmitter::encodeInstruction( - const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const 
MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - +void PPCMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); // Output the constant in big/little endian byte order. @@ -492,5 +489,4 @@ bool PPCMCCodeEmitter::isPrefixedInstruction(const MCInst &MI) const { return InstrInfo->isPrefixed(Opcode); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index 39b2f1211f29..c4d4d35a6665 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -121,12 +121,6 @@ public: // Is this instruction a prefixed instruction. bool isPrefixedInstruction(const MCInst &MI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // namespace llvm diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index a651362f703b..1008dc63d064 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -48,6 +48,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index acb860e16518..3ca6f394f60b 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -118,6 +118,7 @@ 
static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) { // #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 22f35c8fa8d3..58a75baf8081 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -230,6 +230,9 @@ private: void emitGlobalVariableHelper(const GlobalVariable *); + // Get the offset of an alias based on its AliaseeObject. + uint64_t getAliasOffset(const Constant *C); + public: PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : PPCAsmPrinter(TM, std::move(Streamer)) { @@ -656,6 +659,9 @@ static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, /// the current output stream. /// void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { + PPC_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst TmpInst; const bool IsPPC64 = Subtarget->isPPC64(); const bool IsAIX = Subtarget->isAIXABI(); @@ -2352,6 +2358,24 @@ static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { .Default(false); } +uint64_t PPCAIXAsmPrinter::getAliasOffset(const Constant *C) { + if (auto *GA = dyn_cast<GlobalAlias>(C)) + return getAliasOffset(GA->getAliasee()); + if (auto *CE = dyn_cast<ConstantExpr>(C)) { + const MCExpr *LowC = lowerConstant(CE); + const MCBinaryExpr *CBE = dyn_cast<MCBinaryExpr>(LowC); + if (!CBE) + return 0; + if (CBE->getOpcode() != MCBinaryExpr::Add) + report_fatal_error("Only adding an offset is supported now."); + auto *RHS = dyn_cast<MCConstantExpr>(CBE->getRHS()); + if (!RHS) + report_fatal_error("Unable to get the offset of alias."); + return RHS->getValue(); + } + return 0; +} + void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { // Special LLVM global 
arrays have been handled at the initialization. if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) @@ -2422,20 +2446,34 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { } MCSymbol *EmittedInitSym = GVSym; + + // Emit linkage for the global variable and its aliases. emitLinkage(GV, EmittedInitSym); + for (const GlobalAlias *GA : GOAliasMap[GV]) + emitLinkage(GA, getSymbol(GA)); + emitAlignment(getGVAlignment(GV, DL), GV); // When -fdata-sections is enabled, every GlobalVariable will // be put into its own csect; therefore, label is not necessary here. - if (!TM.getDataSections() || GV->hasSection()) { + if (!TM.getDataSections() || GV->hasSection()) OutStreamer->emitLabel(EmittedInitSym); + + // No alias to emit. + if (!GOAliasMap[GV].size()) { + emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); + return; } - // Emit aliasing label for global variable. - for (const GlobalAlias *Alias : GOAliasMap[GV]) - OutStreamer->emitLabel(getSymbol(Alias)); + // Aliases with the same offset should be aligned. Record the list of aliases + // associated with the offset. + AliasMapTy AliasList; + for (const GlobalAlias *GA : GOAliasMap[GV]) + AliasList[getAliasOffset(GA->getAliasee())].push_back(GA); - emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); + // Emit alias label and element value for global variable. 
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer(), + &AliasList); } void PPCAIXAsmPrinter::emitFunctionDescriptor() { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 5b9d1e66b04e..3c461a627d61 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -392,8 +392,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // MASS transformation for LLVM intrinsics with replicating fast-math flag // to be consistent to PPCGenScalarMASSEntries pass - if (TM.getOptLevel() == CodeGenOpt::Aggressive && - TM.Options.PPCGenScalarMASSEntries) { + if (TM.getOptLevel() == CodeGenOpt::Aggressive) { setOperationAction(ISD::FSIN , MVT::f64, Custom); setOperationAction(ISD::FCOS , MVT::f64, Custom); setOperationAction(ISD::FPOW , MVT::f64, Custom); @@ -17886,13 +17885,17 @@ bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const { return Op.getNode()->getFlags().hasApproximateFuncs(); } +bool PPCTargetLowering::isScalarMASSConversionEnabled() const { + return getTargetMachine().Options.PPCGenScalarMASSEntries; +} + SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName, const char *LibCallFloatName, const char *LibCallDoubleNameFinite, const char *LibCallFloatNameFinite, SDValue Op, SelectionDAG &DAG) const { - if (!isLowringToMASSSafe(Op)) + if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op)) return SDValue(); if (!isLowringToMASSFiniteSafe(Op)) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index f92a117fe27f..4a08cc42fa9d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1293,6 +1293,7 @@ namespace llvm { SelectionDAG &DAG) const; bool isLowringToMASSFiniteSafe(SDValue Op) const; bool isLowringToMASSSafe(SDValue Op) const; + bool isScalarMASSConversionEnabled() const; SDValue 
lowerLibCallBase(const char *LibCallDoubleName, const char *LibCallFloatName, const char *LibCallDoubleNameFinite, diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index fbd487fbcfd5..59e8f3ff84a4 100644 --- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -43,7 +43,6 @@ namespace { } const PPCInstrInfo *TII; - LiveIntervals *LIS; protected: bool processBlock(MachineBasicBlock &MBB) { @@ -83,11 +82,8 @@ protected: Register InReg = PPC::NoRegister; Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3; Register GPR4 = Is64Bit ? PPC::X4 : PPC::R4; - SmallVector<Register, 3> OrigRegs = {OutReg, GPR3}; - if (!IsPCREL) { + if (!IsPCREL) InReg = MI.getOperand(1).getReg(); - OrigRegs.push_back(InReg); - } DebugLoc DL = MI.getDebugLoc(); unsigned Opc1, Opc2; @@ -139,11 +135,6 @@ protected: BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) .addImm(0); - // The ADDItls* instruction is the first instruction in the - // repair range. - MachineBasicBlock::iterator First = I; - --First; - if (IsAIX) { // The variable offset and region handle are copied in r4 and r3. The // copies are followed by GETtlsADDR32AIX/GETtlsADDR64AIX. @@ -177,16 +168,10 @@ protected: BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); - // The COPY is the last instruction in the repair range. - MachineBasicBlock::iterator Last = I; - --Last; - // Move past the original instruction and remove it. ++I; MI.removeFromParent(); - // Repair the live intervals. 
- LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs); Changed = true; } @@ -204,7 +189,6 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo(); - LIS = &getAnalysis<LiveIntervals>(); bool Changed = false; @@ -217,9 +201,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); - AU.addPreserved<SlotIndexes>(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 7c062387fecd..a335b2d23394 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -84,12 +84,6 @@ public: unsigned getVMaskReg(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -188,9 +182,6 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS, void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); // Get byte count of instruction. 
unsigned Size = Desc.getSize(); @@ -403,5 +394,4 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo, } } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "RISCVGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index 917d93479f18..c63e0c8e737d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/ErrorHandling.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "RISCVGenInstrInfo.inc" #define GET_REGINFO_MC_DESC diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index 276fc9efb6c0..d157257d976c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -45,6 +45,7 @@ std::unique_ptr<MCObjectTargetWriter> createRISCVELFObjectWriter(uint8_t OSABI, // Defines symbolic names for RISC-V instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "RISCVGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 5b2a247ebda0..edd39f6547ed 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -91,6 +91,9 @@ void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { #include "RISCVGenMCPseudoLowering.inc" void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { + RISCV_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. 
if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 57d8ba6f0161..a7286b2963c2 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -899,7 +899,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, } std::pair<int64_t, Align> -RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const { +RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); // Create a buffer of RVV objects to allocate. SmallVector<int, 8> ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { @@ -912,10 +913,18 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const { ObjectsToAllocate.push_back(I); } - // Allocate all RVV locals and spills - int64_t Offset = 0; // The minimum alignment is 16 bytes. Align RVVStackAlign(16); + const auto &ST = MF.getSubtarget<RISCVSubtarget>(); + + if (!ST.hasVInstructions()) { + assert(ObjectsToAllocate.empty() && + "Can't allocate scalable-vector objects without V instructions"); + return std::make_pair(0, RVVStackAlign); + } + + // Allocate all RVV locals and spills + int64_t Offset = 0; for (int FI : ObjectsToAllocate) { // ObjectSize in bytes. 
int64_t ObjectSize = MFI.getObjectSize(FI); @@ -997,7 +1006,7 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( int64_t RVVStackSize; Align RVVStackAlign; - std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MFI); + std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MF); RVFI->setRVVStackSize(RVVStackSize); RVFI->setRVVStackAlign(RVVStackAlign); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 466cd059b749..a5cf68a6ea94 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -84,7 +84,7 @@ private: MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, MachineInstr::MIFlag Flag) const; std::pair<int64_t, Align> - assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const; + assignRVVStackObjectOffsets(MachineFunction &MF) const; }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cfaafc7b53d2..5b823af1e9b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -43,92 +43,95 @@ namespace RISCV { } // namespace llvm void RISCVDAGToDAGISel::PreprocessISelDAG() { - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); - I != E;) { - SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); - // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point - // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. 
- if (N->getOpcode() == ISD::SPLAT_VECTOR) { + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + if (N->use_empty()) + continue; + + SDValue Result; + switch (N->getOpcode()) { + case ISD::SPLAT_VECTOR: { + // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point + // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. MVT VT = N->getSimpleValueType(0); unsigned Opc = VT.isInteger() ? RISCVISD::VMV_V_X_VL : RISCVISD::VFMV_V_F_VL; SDLoc DL(N); SDValue VL = CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()); - SDValue Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), - N->getOperand(0), VL); - - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); - ++I; - CurDAG->DeleteNode(N); - continue; + Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), + N->getOperand(0), VL); + break; } + case RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL: { + // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector + // load. Done after lowering and combining so that we have a chance to + // optimize this to VMV_V_X_VL when the upper bits aren't needed. + assert(N->getNumOperands() == 4 && "Unexpected number of operands"); + MVT VT = N->getSimpleValueType(0); + SDValue Passthru = N->getOperand(0); + SDValue Lo = N->getOperand(1); + SDValue Hi = N->getOperand(2); + SDValue VL = N->getOperand(3); + assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && + Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && + "Unexpected VTs!"); + MachineFunction &MF = CurDAG->getMachineFunction(); + RISCVMachineFunctionInfo *FuncInfo = + MF.getInfo<RISCVMachineFunctionInfo>(); + SDLoc DL(N); - // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector - // load. Done after lowering and combining so that we have a chance to - // optimize this to VMV_V_X_VL when the upper bits aren't needed. 
- if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) - continue; - - assert(N->getNumOperands() == 4 && "Unexpected number of operands"); - MVT VT = N->getSimpleValueType(0); - SDValue Passthru = N->getOperand(0); - SDValue Lo = N->getOperand(1); - SDValue Hi = N->getOperand(2); - SDValue VL = N->getOperand(3); - assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && - Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && - "Unexpected VTs!"); - MachineFunction &MF = CurDAG->getMachineFunction(); - RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>(); - SDLoc DL(N); - - // We use the same frame index we use for moving two i32s into 64-bit FPR. - // This is an analogous operation. - int FI = FuncInfo->getMoveF64FrameIndex(MF); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); - const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); - SDValue StackSlot = - CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout())); + // We use the same frame index we use for moving two i32s into 64-bit FPR. + // This is an analogous operation. 
+ int FI = FuncInfo->getMoveF64FrameIndex(MF); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + SDValue StackSlot = + CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout())); - SDValue Chain = CurDAG->getEntryNode(); - Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8)); + SDValue Chain = CurDAG->getEntryNode(); + Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8)); - SDValue OffsetSlot = - CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL); - Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4), - Align(8)); + SDValue OffsetSlot = + CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL); + Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4), + Align(8)); - Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); - SDVTList VTs = CurDAG->getVTList({VT, MVT::Other}); - SDValue IntID = - CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); - SDValue Ops[] = {Chain, - IntID, - Passthru, - StackSlot, - CurDAG->getRegister(RISCV::X0, MVT::i64), - VL}; + SDVTList VTs = CurDAG->getVTList({VT, MVT::Other}); + SDValue IntID = + CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); + SDValue Ops[] = {Chain, + IntID, + Passthru, + StackSlot, + CurDAG->getRegister(RISCV::X0, MVT::i64), + VL}; - SDValue Result = CurDAG->getMemIntrinsicNode( - ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8), - MachineMemOperand::MOLoad); + Result = CurDAG->getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + MVT::i64, MPI, Align(8), + MachineMemOperand::MOLoad); + break; + } + } - // We're about to replace all uses of the SPLAT_VECTOR_SPLIT_I64 with the - // vlse we created. This will cause general havok on the dag because - // anything below the conversion could be folded into other existing nodes. 
- // To avoid invalidating 'I', back it up to the convert node. - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + if (Result) { + LLVM_DEBUG(dbgs() << "RISCV DAG preprocessing replacing:\nOld: "); + LLVM_DEBUG(N->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\nNew: "); + LLVM_DEBUG(Result->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\n"); - // Now that we did that, the node is dead. Increment the iterator to the - // next node to process, then delete N. - ++I; - CurDAG->DeleteNode(N); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + MadeChange = true; + } } + + if (MadeChange) + CurDAG->RemoveDeadNodes(); } void RISCVDAGToDAGISel::PostprocessISelDAG() { @@ -143,7 +146,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { continue; MadeChange |= doPeepholeSExtW(N); - MadeChange |= doPeepholeLoadStoreADDI(N); MadeChange |= doPeepholeMaskedRVV(N); } @@ -153,40 +155,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { CurDAG->RemoveDeadNodes(); } -// Returns true if N is a MachineSDNode that has a reg and simm12 memory -// operand. The indices of the base pointer and offset are returned in BaseOpIdx -// and OffsetOpIdx. 
-static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx, - unsigned &OffsetOpIdx) { - switch (N->getMachineOpcode()) { - case RISCV::LB: - case RISCV::LH: - case RISCV::LW: - case RISCV::LBU: - case RISCV::LHU: - case RISCV::LWU: - case RISCV::LD: - case RISCV::FLH: - case RISCV::FLW: - case RISCV::FLD: - BaseOpIdx = 0; - OffsetOpIdx = 1; - return true; - case RISCV::SB: - case RISCV::SH: - case RISCV::SW: - case RISCV::SD: - case RISCV::FSH: - case RISCV::FSW: - case RISCV::FSD: - BaseOpIdx = 1; - OffsetOpIdx = 2; - return true; - } - - return false; -} - static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, RISCVMatInt::InstSeq &Seq) { SDNode *Result = nullptr; @@ -285,9 +253,7 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( SDValue Chain = Node->getOperand(0); SDValue Glue; - SDValue Base; - SelectBaseAddr(Node->getOperand(CurOp++), Base); - Operands.push_back(Base); // Base pointer. + Operands.push_back(Node->getOperand(CurOp++)); // Base pointer. if (IsStridedOrIndexed) { Operands.push_back(Node->getOperand(CurOp++)); // Index. @@ -651,83 +617,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget)); return; } - case ISD::ADD: { - // Try to select ADD + immediate used as memory addresses to - // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by - // doPeepholeLoadStoreADDI. - - // LHS should be an immediate. - auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); - if (!N1C) - break; - - int64_t Offset = N1C->getSExtValue(); - int64_t Lo12 = SignExtend64<12>(Offset); - - // Don't do this if the lower 12 bits are 0 or we could use ADDI directly. - if (Lo12 == 0 || isInt<12>(Offset)) - break; - - // Don't do this if we can use a pair of ADDIs. 
- if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2)) - break; - - RISCVMatInt::InstSeq Seq = - RISCVMatInt::generateInstSeq(Offset, Subtarget->getFeatureBits()); - - Offset -= Lo12; - // Restore sign bits for RV32. - if (!Subtarget->is64Bit()) - Offset = SignExtend64<32>(Offset); - - // We can fold if the last operation is an ADDI or its an ADDIW that could - // be treated as an ADDI. - if (Seq.back().Opc != RISCV::ADDI && - !(Seq.back().Opc == RISCV::ADDIW && isInt<32>(Offset))) - break; - assert(Seq.back().Imm == Lo12 && "Expected immediate to match Lo12"); - // Drop the last operation. - Seq.pop_back(); - assert(!Seq.empty() && "Expected more instructions in sequence"); - - bool AllPointerUses = true; - for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - - // Is this user a memory instruction that uses a register and immediate - // that has this ADD as its pointer. - unsigned BaseOpIdx, OffsetOpIdx; - if (!User->isMachineOpcode() || - !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) || - UI.getOperandNo() != BaseOpIdx) { - AllPointerUses = false; - break; - } - - // If the memory instruction already has an offset, make sure the combined - // offset is foldable. 
- int64_t MemOffs = - cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue(); - MemOffs += Lo12; - if (!isInt<12>(MemOffs)) { - AllPointerUses = false; - break; - } - } - - if (!AllPointerUses) - break; - - // Emit (ADDI (ADD X, Hi), Lo) - SDNode *Imm = selectImmSeq(CurDAG, DL, VT, Seq); - SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT, - Node->getOperand(0), SDValue(Imm, 0)); - SDNode *ADDI = - CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0), - CurDAG->getTargetConstant(Lo12, DL, VT)); - ReplaceNode(Node, ADDI); - return; - } case ISD::SHL: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); if (!N1C) @@ -856,10 +745,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C) break; - uint64_t C2 = C->getZExtValue(); + unsigned C2 = C->getZExtValue(); unsigned XLen = Subtarget->getXLen(); - if (!C2 || C2 >= XLen) - break; + assert((C2 > 0 && C2 < XLen) && "Unexpected shift amount!"); uint64_t C1 = N1C->getZExtValue(); @@ -885,10 +773,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (srl x, c2) c1) -> (srli (slli x, c3-c2), c3) if c1 is a mask // with c3 leading zeros. if (!LeftShift && isMask_64(C1)) { - uint64_t C3 = XLen - (64 - countLeadingZeros(C1)); - if (C2 < C3) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + if (C2 < Leading) { // If the number of leading zeros is C2+32 this can be SRLIW. - if (C2 + 32 == C3) { + if (C2 + 32 == Leading) { SDNode *SRLIW = CurDAG->getMachineNode( RISCV::SRLIW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SRLIW); @@ -900,7 +788,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // // This pattern occurs when (i32 (srl (sra 31), c3 - 32)) is type // legalized and goes through DAG combine. 
- if (C2 >= 32 && (C3 - C2) == 1 && N0.hasOneUse() && + if (C2 >= 32 && (Leading - C2) == 1 && N0.hasOneUse() && X.getOpcode() == ISD::SIGN_EXTEND_INREG && cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32) { SDNode *SRAIW = @@ -908,25 +796,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->getTargetConstant(31, DL, VT)); SDNode *SRLIW = CurDAG->getMachineNode( RISCV::SRLIW, DL, VT, SDValue(SRAIW, 0), - CurDAG->getTargetConstant(C3 - 32, DL, VT)); + CurDAG->getTargetConstant(Leading - 32, DL, VT)); ReplaceNode(Node, SRLIW); return; } // (srli (slli x, c3-c2), c3). // Skip if we could use (zext.w (sraiw X, C2)). - bool Skip = Subtarget->hasStdExtZba() && C3 == 32 && + bool Skip = Subtarget->hasStdExtZba() && Leading == 32 && X.getOpcode() == ISD::SIGN_EXTEND_INREG && cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32; // Also Skip if we can use bexti. - Skip |= Subtarget->hasStdExtZbs() && C3 == XLen - 1; + Skip |= Subtarget->hasStdExtZbs() && Leading == XLen - 1; if (OneUseOrZExtW && !Skip) { SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, X, - CurDAG->getTargetConstant(C3 - C2, DL, VT)); - SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Leading - C2, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(Leading, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -936,12 +824,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shl x, c2), c1) -> (srli (slli c2+c3), c3) if c1 is a mask // shifted by c2 bits with c3 leading zeros. 
if (LeftShift && isShiftedMask_64(C1)) { - uint64_t C3 = XLen - (64 - countLeadingZeros(C1)); + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); - if (C2 + C3 < XLen && - C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) { + if (C2 + Leading < XLen && + C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + Leading)) << C2)) { // Use slli.uw when possible. - if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) { + if ((XLen - (C2 + Leading)) == 32 && Subtarget->hasStdExtZba()) { SDNode *SLLI_UW = CurDAG->getMachineNode( RISCV::SLLI_UW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SLLI_UW); @@ -952,10 +840,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (OneUseOrZExtW && !IsCANDI) { SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, X, - CurDAG->getTargetConstant(C2 + C3, DL, VT)); - SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(C2 + Leading, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(Leading, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -965,9 +853,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shr x, c2), c1) -> (slli (srli x, c2+c3), c3) if c1 is a // shifted mask with c2 leading zeros and c3 trailing zeros. if (!LeftShift && isShiftedMask_64(C1)) { - uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); - uint64_t C3 = countTrailingZeros(C1); - if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !IsCANDI) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + unsigned Trailing = countTrailingZeros(C1); + if (Leading == C2 && C2 + Trailing < XLen && OneUseOrZExtW && !IsCANDI) { unsigned SrliOpc = RISCV::SRLI; // If the input is zexti32 we should use SRLIW. 
if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -976,22 +864,23 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { X = X.getOperand(0); } SDNode *SRLI = CurDAG->getMachineNode( - SrliOpc, DL, VT, X, CurDAG->getTargetConstant(C2 + C3, DL, VT)); + SrliOpc, DL, VT, X, + CurDAG->getTargetConstant(C2 + Trailing, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } // If the leading zero count is C2+32, we can use SRLIW instead of SRLI. - if (Leading > 32 && (Leading - 32) == C2 && C2 + C3 < 32 && + if (Leading > 32 && (Leading - 32) == C2 && C2 + Trailing < 32 && OneUseOrZExtW && !IsCANDI) { - SDNode *SRLIW = - CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, - CurDAG->getTargetConstant(C2 + C3, DL, VT)); + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(C2 + Trailing, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -1000,25 +889,26 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a // shifted mask with no leading zeros and c3 trailing zeros. 
if (LeftShift && isShiftedMask_64(C1)) { - uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); - uint64_t C3 = countTrailingZeros(C1); - if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !IsCANDI) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + unsigned Trailing = countTrailingZeros(C1); + if (Leading == 0 && C2 < Trailing && OneUseOrZExtW && !IsCANDI) { SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, VT, X, CurDAG->getTargetConstant(C3 - C2, DL, VT)); + RISCV::SRLI, DL, VT, X, + CurDAG->getTargetConstant(Trailing - C2, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } // If we have (32-C2) leading zeros, we can use SRLIW instead of SRLI. - if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) { - SDNode *SRLIW = - CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, - CurDAG->getTargetConstant(C3 - C2, DL, VT)); + if (C2 < Trailing && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) { + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(Trailing - C2, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -1885,13 +1775,74 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, return false; } -bool RISCVDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) { - // If this is FrameIndex, select it directly. Otherwise just let it get - // selected to a register independently. - if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT()); - else - Base = Addr; +// Fold constant addresses. 
+static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, + const MVT VT, const RISCVSubtarget *Subtarget, + SDValue Addr, SDValue &Base, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) + return false; + + int64_t CVal = cast<ConstantSDNode>(Addr)->getSExtValue(); + + // If the constant is a simm12, we can fold the whole constant and use X0 as + // the base. If the constant can be materialized with LUI+simm12, use LUI as + // the base. We can't use generateInstSeq because it favors LUI+ADDIW. + int64_t Lo12 = SignExtend64<12>(CVal); + int64_t Hi = (uint64_t)CVal - (uint64_t)Lo12; + if (!Subtarget->is64Bit() || isInt<32>(Hi)) { + if (Hi) { + int64_t Hi20 = (Hi >> 12) & 0xfffff; + Base = SDValue( + CurDAG->getMachineNode(RISCV::LUI, DL, VT, + CurDAG->getTargetConstant(Hi20, DL, VT)), + 0); + } else { + Base = CurDAG->getRegister(RISCV::X0, VT); + } + Offset = CurDAG->getTargetConstant(Lo12, DL, VT); + return true; + } + + // Ask how constant materialization would handle this constant. + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(CVal, Subtarget->getFeatureBits()); + + // If the last instruction would be an ADDI, we can fold its immediate and + // emit the rest of the sequence as the base. + if (Seq.back().Opc != RISCV::ADDI) + return false; + Lo12 = Seq.back().Imm; + + // Drop the last instruction. + Seq.pop_back(); + assert(!Seq.empty() && "Expected more instructions in sequence"); + + Base = SDValue(selectImmSeq(CurDAG, DL, VT, Seq), 0); + Offset = CurDAG->getTargetConstant(Lo12, DL, VT); + return true; +} + +// Is this ADD instruction only used as the base pointer of scalar loads and +// stores? 
+static bool isWorthFoldingAdd(SDValue Add) { + for (auto Use : Add->uses()) { + if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && + Use->getOpcode() != ISD::ATOMIC_LOAD && + Use->getOpcode() != ISD::ATOMIC_STORE) + return false; + EVT VT = cast<MemSDNode>(Use)->getMemoryVT(); + if (!VT.isScalarInteger() && VT != MVT::f16 && VT != MVT::f32 && + VT != MVT::f64) + return false; + // Don't allow stores of the value. It must be used as the address. + if (Use->getOpcode() == ISD::STORE && + cast<StoreSDNode>(Use)->getValue() == Add) + return false; + if (Use->getOpcode() == ISD::ATOMIC_STORE && + cast<AtomicSDNode>(Use)->getVal() == Add) + return false; + } + return true; } @@ -1947,9 +1898,10 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); assert(!isInt<12>(CVal) && "simm12 not already handled?"); + // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use + // an ADDI for part of the offset and fold the rest into the load/store. + // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td. if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) { - // We can use an ADDI for part of the offset and fold the rest into the - // load/store. This mirrors the AddiPair PatFrag in RISCVInstrInfo.td. int64_t Adj = CVal < 0 ? -2048 : 2047; Base = SDValue( CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0), @@ -1958,8 +1910,27 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT); return true; } + + // For larger immediates, we might be able to save one instruction from + // constant materialization by folding the Lo12 bits of the immediate into + // the address. We should only do this if the ADD is only used by loads and + // stores that can fold the lo12 bits. 
Otherwise, the ADD will get iseled + // separately with the full materialized immediate creating extra + // instructions. + if (isWorthFoldingAdd(Addr) && + selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base, + Offset)) { + // Insert an ADD instruction with the materialized Hi52 bits. + Base = SDValue( + CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base), + 0); + return true; + } } + if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset)) + return true; + Base = Addr; Offset = CurDAG->getTargetConstant(0, DL, VT); return true; @@ -2044,6 +2015,101 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) { return false; } +/// Look for various patterns that can be done with a SHL that can be folded +/// into a SHXADD. \p ShAmt contains 1, 2, or 3 and is set based on which +/// SHXADD we are trying to match. +bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt, + SDValue &Val) { + if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { + SDValue N0 = N.getOperand(0); + + bool LeftShift = N0.getOpcode() == ISD::SHL; + if ((LeftShift || N0.getOpcode() == ISD::SRL) && + isa<ConstantSDNode>(N0.getOperand(1))) { + uint64_t Mask = N.getConstantOperandVal(1); + unsigned C2 = N0.getConstantOperandVal(1); + + unsigned XLen = Subtarget->getXLen(); + if (LeftShift) + Mask &= maskTrailingZeros<uint64_t>(C2); + else + Mask &= maskTrailingOnes<uint64_t>(XLen - C2); + + // Look for (and (shl y, c2), c1) where c1 is a shifted mask with no + // leading zeros and c3 trailing zeros. We can use an SRLI by c2+c3 + // followed by a SHXADD with c3 for the X amount. 
+ if (isShiftedMask_64(Mask)) { + unsigned Leading = XLen - (64 - countLeadingZeros(Mask)); + unsigned Trailing = countTrailingZeros(Mask); + if (LeftShift && Leading == 0 && C2 < Trailing && Trailing == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Trailing - C2, DL, VT)), + 0); + return true; + } + // Look for (and (shr y, c2), c1) where c1 is a shifted mask with c2 + // leading zeros and c3 trailing zeros. We can use an SRLI by C3 + // followed by a SHXADD using c3 for the X amount. + if (!LeftShift && Leading == C2 && Trailing == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue( + CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Leading + Trailing, DL, VT)), + 0); + return true; + } + } + } + } + + bool LeftShift = N.getOpcode() == ISD::SHL; + if ((LeftShift || N.getOpcode() == ISD::SRL) && + isa<ConstantSDNode>(N.getOperand(1))) { + SDValue N0 = N.getOperand(0); + if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(1))) { + uint64_t Mask = N0.getConstantOperandVal(1); + if (isShiftedMask_64(Mask)) { + unsigned C1 = N.getConstantOperandVal(1); + unsigned XLen = Subtarget->getXLen(); + unsigned Leading = XLen - (64 - countLeadingZeros(Mask)); + unsigned Trailing = countTrailingZeros(Mask); + // Look for (shl (and X, Mask), C1) where Mask has 32 leading zeros and + // C3 trailing zeros. If C1+C3==ShAmt we can use SRLIW+SHXADD. + if (LeftShift && Leading == 32 && Trailing > 0 && + (Trailing + C1) == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Trailing, DL, VT)), + 0); + return true; + } + // Look for (srl (and X, Mask), C1) where Mask has 32 leading zeros and + // C3 trailing zeros. If C3-C1==ShAmt we can use SRLIW+SHXADD. 
+ if (!LeftShift && Leading == 32 && Trailing > C1 && + (Trailing - C1) == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Trailing, DL, VT)), + 0); + return true; + } + } + } + } + + return false; +} + // Return true if all users of this SDNode* only consume the lower \p Bits. // This can be used to form W instructions for add/sub/mul/shl even when the // root isn't a sext_inreg. This can allow the ADDW/SUBW/MULW/SLLIW to CSE if @@ -2271,102 +2337,6 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width, return false; } -// Merge an ADDI into the offset of a load/store instruction where possible. -// (load (addi base, off1), off2) -> (load base, off1+off2) -// (store val, (addi base, off1), off2) -> (store val, base, off1+off2) -// (load (add base, (addi src, off1)), off2) -// -> (load (add base, src), off1+off2) -// (store val, (add base, (addi src, off1)), off2) -// -> (store val, (add base, src), off1+off2) -// This is possible when off1+off2 fits a 12-bit immediate. -bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) { - unsigned OffsetOpIdx, BaseOpIdx; - if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx)) - return false; - - if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx))) - return false; - - SDValue Base = N->getOperand(BaseOpIdx); - - if (!Base.isMachineOpcode()) - return false; - - if (Base.getMachineOpcode() == RISCV::ADDI) { - // If the base is an ADDI, we can merge it in to the load/store. - } else if (Base.getMachineOpcode() == RISCV::ADDIW && - isa<ConstantSDNode>(Base.getOperand(1)) && - Base.getOperand(0).isMachineOpcode() && - Base.getOperand(0).getMachineOpcode() == RISCV::LUI && - isa<ConstantSDNode>(Base.getOperand(0).getOperand(0))) { - // ADDIW can be merged if it's part of LUI+ADDIW constant materialization - // and LUI+ADDI would have produced the same result. 
This is true for all - // simm32 values except 0x7ffff800-0x7fffffff. - int64_t Offset = - SignExtend64<32>(Base.getOperand(0).getConstantOperandVal(0) << 12); - Offset += cast<ConstantSDNode>(Base.getOperand(1))->getSExtValue(); - if (!isInt<32>(Offset)) - return false; - } else - return false; - - SDValue ImmOperand = Base.getOperand(1); - uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx); - - if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) { - int64_t Offset1 = Const->getSExtValue(); - int64_t CombinedOffset = Offset1 + Offset2; - if (!isInt<12>(CombinedOffset)) - return false; - ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand), - ImmOperand.getValueType()); - } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) { - // If the off1 in (addi base, off1) is a global variable's address (its - // low part, really), then we can rely on the alignment of that variable - // to provide a margin of safety before off1 can overflow the 12 bits. - // Check if off2 falls within that margin; if so off1+off2 can't overflow. - const DataLayout &DL = CurDAG->getDataLayout(); - Align Alignment = commonAlignment(GA->getGlobal()->getPointerAlignment(DL), - GA->getOffset()); - if (Offset2 != 0 && Alignment <= Offset2) - return false; - int64_t Offset1 = GA->getOffset(); - int64_t CombinedOffset = Offset1 + Offset2; - ImmOperand = CurDAG->getTargetGlobalAddress( - GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(), - CombinedOffset, GA->getTargetFlags()); - } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) { - // Ditto. 
- Align Alignment = commonAlignment(CP->getAlign(), CP->getOffset()); - if (Offset2 != 0 && Alignment <= Offset2) - return false; - int64_t Offset1 = CP->getOffset(); - int64_t CombinedOffset = Offset1 + Offset2; - ImmOperand = CurDAG->getTargetConstantPool( - CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(), - CombinedOffset, CP->getTargetFlags()); - } else { - return false; - } - - LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); - LLVM_DEBUG(Base->dump(CurDAG)); - LLVM_DEBUG(dbgs() << "\nN: "); - LLVM_DEBUG(N->dump(CurDAG)); - LLVM_DEBUG(dbgs() << "\n"); - - // Modify the offset operand of the load/store. - if (BaseOpIdx == 0) { // Load - N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand, - N->getOperand(2)); - } else { // Store - N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0), - ImmOperand, N->getOperand(3)); - } - - return true; -} - // Try to remove sext.w if the input is a W instruction or can be made into // a W instruction cheaply. 
bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index b50927cfcca5..ef46204c00ac 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -47,7 +47,6 @@ public: bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectBaseAddr(SDValue Addr, SDValue &Base); bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); @@ -61,6 +60,17 @@ public: bool selectSExti32(SDValue N, SDValue &Val); bool selectZExti32(SDValue N, SDValue &Val); + bool selectSHXADDOp(SDValue N, unsigned ShAmt, SDValue &Val); + bool selectSH1ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 1, Val); + } + bool selectSH2ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 2, Val); + } + bool selectSH3ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 3, Val); + } + bool hasAllNBitUsers(SDNode *Node, unsigned Bits) const; bool hasAllHUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 16); } bool hasAllWUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 32); } @@ -118,7 +128,6 @@ public: #include "RISCVGenDAGISel.inc" private: - bool doPeepholeLoadStoreADDI(SDNode *Node); bool doPeepholeSExtW(SDNode *Node); bool doPeepholeMaskedRVV(SDNode *Node); }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ff645dea4e7a..658865703079 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -526,6 +526,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_FPTOSI, ISD::VP_FPTOUI, ISD::VP_TRUNCATE, ISD::VP_SETCC}, VT, Custom); setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); + + setOperationPromotedToType( 
+      ISD::VECTOR_SPLICE, VT, +      MVT::getVectorVT(MVT::i8, VT.getVectorElementCount())); } for (MVT VT : IntVecVTs) { @@ -1157,6 +1161,37 @@ bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const { return C && C->getAPIntValue().ule(10); } +bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getIntegerBitWidth(); + if (BitSize > Subtarget.getXLen()) + return false; + + // Fast path, assume 32-bit immediates are cheap. + int64_t Val = Imm.getSExtValue(); + if (isInt<32>(Val)) + return true; + + // A constant pool entry may be more aligned than the load we're trying to + // replace. If we don't support unaligned scalar mem, prefer the constant + // pool. + // TODO: Can the caller pass down the alignment? + if (!Subtarget.enableUnalignedScalarMem()) + return true; + + // Prefer to keep the load if it would require many instructions. + // This uses the same threshold we use for constant pools but doesn't + // check useConstantPoolForLargeInts. + // TODO: Should we keep the load only when we're definitely going to emit a + // constant pool? + + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(Val, Subtarget.getFeatureBits()); + return Seq.size() <= Subtarget.getMaxBuildIntsCost(); +} + bool RISCVTargetLowering:: shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, @@ -1659,7 +1694,7 @@ static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG, /// Return the type of the mask type suitable for masking the provided /// vector type. This is simply an i1 element type vector of the same /// (possibly scalable) length.
-static MVT getMaskTypeFor(EVT VecVT) { +static MVT getMaskTypeFor(MVT VecVT) { assert(VecVT.isVector()); ElementCount EC = VecVT.getVectorElementCount(); return MVT::getVectorVT(MVT::i1, EC); @@ -5748,8 +5783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_SPLICE(SDValue Op, DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VecVT, DAG.getUNDEF(VecVT), V1, DownOffset, TrueMask, UpOffset); return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VecVT, SlideDown, V2, UpOffset, - TrueMask, - DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT)); + TrueMask, DAG.getRegister(RISCV::X0, XLenVT)); } SDValue @@ -8530,12 +8564,6 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { return Opcode; } -// Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C) -// FIXME: Should this be a generic combine? There's a similar combine on X86. -// -// Also try these folds where an add or sub is in the middle. -// (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C) -// (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C) static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { assert(N->getOpcode() == ISD::SRA && "Unexpected opcode"); @@ -8543,12 +8571,40 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, if (N->getValueType(0) != MVT::i64 || !Subtarget.is64Bit()) return SDValue(); - auto *ShAmtC = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!ShAmtC || ShAmtC->getZExtValue() > 32) + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + uint64_t ShAmt = N->getConstantOperandVal(1); + if (ShAmt > 32) return SDValue(); SDValue N0 = N->getOperand(0); + // Combine (sra (sext_inreg (shl X, C1), i32), C2) -> + // (sra (shl X, C1+32), C2+32) so it gets selected as SLLI+SRAI instead of + // SLLIW+SRAIW. SLLI+SRAI have compressed forms. 
+ if (ShAmt < 32 && + N0.getOpcode() == ISD::SIGN_EXTEND_INREG && N0.hasOneUse() && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i32 && + N0.getOperand(0).getOpcode() == ISD::SHL && N0.getOperand(0).hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(0).getOperand(1))) { + uint64_t LShAmt = N0.getOperand(0).getConstantOperandVal(1); + if (LShAmt < 32) { + SDLoc ShlDL(N0.getOperand(0)); + SDValue Shl = DAG.getNode(ISD::SHL, ShlDL, MVT::i64, + N0.getOperand(0).getOperand(0), + DAG.getConstant(LShAmt + 32, ShlDL, MVT::i64)); + SDLoc DL(N); + return DAG.getNode(ISD::SRA, DL, MVT::i64, Shl, + DAG.getConstant(ShAmt + 32, DL, MVT::i64)); + } + } + + // Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C) + // FIXME: Should this be a generic combine? There's a similar combine on X86. + // + // Also try these folds where an add or sub is in the middle. + // (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C) + // (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C) SDValue Shl; ConstantSDNode *AddC = nullptr; @@ -8594,12 +8650,12 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, In, DAG.getValueType(MVT::i32)); - if (ShAmtC->getZExtValue() == 32) + if (ShAmt == 32) return SExt; return DAG.getNode( ISD::SHL, DL, MVT::i64, SExt, - DAG.getConstant(32 - ShAmtC->getZExtValue(), DL, MVT::i64)); + DAG.getConstant(32 - ShAmt, DL, MVT::i64)); } SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, @@ -9152,10 +9208,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // FIXME: Support FP. if (Val.getOpcode() == RISCVISD::VMV_X_S) { SDValue Src = Val.getOperand(0); - EVT VecVT = Src.getValueType(); + MVT VecVT = Src.getSimpleValueType(); EVT MemVT = Store->getMemoryVT(); // The memory VT and the element type must match. 
- if (VecVT.getVectorElementType() == MemVT) { + if (MemVT == VecVT.getVectorElementType()) { SDLoc DL(N); MVT MaskVT = getMaskTypeFor(VecVT); return DAG.getStoreVP( @@ -9864,7 +9920,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, Register FLHS = First.getOperand(1).getReg(); Register FRHS = First.getOperand(2).getReg(); // Insert appropriate branch. - BuildMI(ThisMBB, DL, TII.getBrCond(FirstCC)) + BuildMI(FirstMBB, DL, TII.getBrCond(FirstCC)) .addReg(FLHS) .addReg(FRHS) .addMBB(SinkMBB); @@ -9876,7 +9932,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, auto SecondCC = static_cast<RISCVCC::CondCode>(Second.getOperand(3).getImm()); // Insert appropriate branch. - BuildMI(FirstMBB, DL, TII.getBrCond(SecondCC)) + BuildMI(ThisMBB, DL, TII.getBrCond(SecondCC)) .addReg(SLHS) .addReg(SRHS) .addMBB(SinkMBB); @@ -9884,9 +9940,9 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, Register DestReg = Second.getOperand(0).getReg(); Register Op2Reg4 = Second.getOperand(4).getReg(); BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII.get(RISCV::PHI), DestReg) - .addReg(Op1Reg4) - .addMBB(ThisMBB) .addReg(Op2Reg4) + .addMBB(ThisMBB) + .addReg(Op1Reg4) .addMBB(FirstMBB) .addReg(Op1Reg5) .addMBB(SecondMBB); @@ -12096,6 +12152,17 @@ const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry( return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx); } +bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const { + // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power + // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be + // a power of two as well. + // FIXME: This doesn't work for zve32, but that's already broken + // elsewhere for the same reason. 
+ assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported"); + assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed"); + return true; +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index eb013d4b6682..5e15176de59c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -520,9 +520,7 @@ public: SmallVectorImpl<SDValue> &InVals) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override { - return true; - } + Type *Ty) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool shouldConsiderGEPOffsetSplit() const override { return true; } @@ -599,6 +597,8 @@ public: unsigned uid, MCContext &Ctx) const override; + bool isVScaleKnownToBeAPowerOfTwo() const override; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ee4c026af8f4..06a90438838e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -384,7 +384,6 @@ def uimm6gt32 : ImmLeaf<XLenVT, [{ // Necessary because a frameindex can't be matched directly in a pattern. def FrameAddrRegImm : ComplexPattern<iPTR, 2, "SelectFrameAddrRegImm", [frameindex, or, add]>; -def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm">; // Return the negation of an immediate value. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index f8bc241039f8..1ad634344c09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -115,6 +115,35 @@ class VSXSched<int n, string o> : class VLFSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDFF" # n), ReadVLDX, ReadVMask]>; +// Unit-Stride Segment Loads and Stores +class VLSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSEG" #nf #"e" #eew), ReadVLDX, ReadVMask]>; +class VSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTE" #eew #"V"), ReadVSTX, ReadVMask]>; +class VLSEGFFSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSEGFF" #nf #"e" #eew), ReadVLDX, ReadVMask]>; +// Strided Segment Loads and Stores +class VLSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSSEG" #nf #"e" #eew), ReadVLDX, ReadVLDSX, + ReadVMask]>; +class VSSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSSSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTS" #eew #"V"), ReadVSTX, ReadVSTSX, ReadVMask]>; +// Indexed Segment Loads and Stores +class VLUXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLUXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDUXV, + ReadVMask]>; +class VLOXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLOXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDOXV, + ReadVMask]>; +class VSUXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSUXSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTUX" #eew), ReadVSTX, ReadVSTUXV, ReadVMask]>; +class VSOXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSOXSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTOX" #eew), ReadVSTX, ReadVSTOXV, ReadVMask]>; + //===----------------------------------------------------------------------===// // Instruction class templates 
//===----------------------------------------------------------------------===// @@ -1476,14 +1505,9 @@ defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { -def VMV1R_V : RVInstV<0b100111, 0, OPIVI, (outs VR:$vd), (ins VR:$vs2), - "vmv1r.v", "$vd, $vs2">, VMVRSched<1> { - let Uses = []; - let vm = 1; -} // A future extension may relax the vector register alignment restrictions. -foreach n = [2, 4, 8] in { - defvar vrc = !cast<VReg>("VRM"#n); +foreach n = [1, 2, 4, 8] in { + defvar vrc = !cast<VReg>(!if(!eq(n, 1), "VR", "VRM"#n)); def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd), (ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, VMVRSched<n> { @@ -1500,31 +1524,35 @@ let Predicates = [HasVInstructions] in { defvar w = !cast<RISCVWidth>("LSWidth"#eew); def VLSEG#nf#E#eew#_V : - VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">; + VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">, + VLSEGSched<nf, eew>; def VLSEG#nf#E#eew#FF_V : - VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">; + VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">, + VLSEGFFSched<nf, eew>; def VSSEG#nf#E#eew#_V : - VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">; - + VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">, + VSSEGSched<nf, eew>; // Vector Strided Instructions def VLSSEG#nf#E#eew#_V : - VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">; + VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">, + VLSSEGSched<nf, eew>; def VSSSEG#nf#E#eew#_V : - VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">; + VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">, + VSSSEGSched<nf, eew>; // Vector Indexed Instructions def VLUXSEG#nf#EI#eew#_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, w, - "vluxseg"#nf#"ei"#eew#".v">; + 
"vluxseg"#nf#"ei"#eew#".v">, VLUXSEGSched<nf, eew>; def VLOXSEG#nf#EI#eew#_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, w, - "vloxseg"#nf#"ei"#eew#".v">; + "vloxseg"#nf#"ei"#eew#".v">, VLOXSEGSched<nf, eew>; def VSUXSEG#nf#EI#eew#_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, w, - "vsuxseg"#nf#"ei"#eew#".v">; + "vsuxseg"#nf#"ei"#eew#".v">, VSUXSEGSched<nf, eew>; def VSOXSEG#nf#EI#eew#_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, w, - "vsoxseg"#nf#"ei"#eew#".v">; + "vsoxseg"#nf#"ei"#eew#".v">, VSOXSEGSched<nf, eew>; } } } // Predicates = [HasVInstructions] @@ -1533,17 +1561,22 @@ let Predicates = [HasVInstructionsI64] in { foreach nf=2-8 in { // Vector Unit-strided Segment Instructions def VLSEG#nf#E64_V : - VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">; + VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">, + VLSEGSched<nf, 64>; def VLSEG#nf#E64FF_V : - VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">; + VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">, + VLSEGFFSched<nf, 64>; def VSSEG#nf#E64_V : - VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">; + VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">, + VSSEGSched<nf, 64>; // Vector Strided Segment Instructions def VLSSEG#nf#E64_V : - VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">; + VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">, + VLSSEGSched<nf, 64>; def VSSSEG#nf#E64_V : - VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">; + VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">, + VSSSEGSched<nf, 64>; } } // Predicates = [HasVInstructionsI64] let Predicates = [HasVInstructionsI64, IsRV64] in { @@ -1551,16 +1584,16 @@ let Predicates = [HasVInstructionsI64, IsRV64] in { // Vector Indexed Segment Instructions def VLUXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, 
LSWidth64, - "vluxseg"#nf#"ei64.v">; + "vluxseg"#nf#"ei64.v">, VLUXSEGSched<nf, 64>; def VLOXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, LSWidth64, - "vloxseg"#nf#"ei64.v">; + "vloxseg"#nf#"ei64.v">, VLOXSEGSched<nf, 64>; def VSUXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, LSWidth64, - "vsuxseg"#nf#"ei64.v">; + "vsuxseg"#nf#"ei64.v">, VSUXSEGSched<nf, 64>; def VSOXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, LSWidth64, - "vsoxseg"#nf#"ei64.v">; + "vsoxseg"#nf#"ei64.v">, VSOXSEGSched<nf, 64>; } } // Predicates = [HasVInstructionsI64, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 06d4c4d0a9e6..b7b25643e397 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -34,11 +34,11 @@ multiclass VPatUSLoadStoreSDNode<ValueType type, defvar load_instr = !cast<Instruction>("PseudoVLE"#sew#"_V_"#vlmul.MX); defvar store_instr = !cast<Instruction>("PseudoVSE"#sew#"_V_"#vlmul.MX); // Load - def : Pat<(type (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1, avl, log2sew)>; + def : Pat<(type (load GPR:$rs1)), + (load_instr GPR:$rs1, avl, log2sew)>; // Store - def : Pat<(store type:$rs2, BaseAddr:$rs1), - (store_instr reg_class:$rs2, BaseAddr:$rs1, avl, log2sew)>; + def : Pat<(store type:$rs2, GPR:$rs1), + (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>; } multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type, @@ -53,11 +53,11 @@ multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type, !cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V"); // Load - def : Pat<(type (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1)>; + def : Pat<(type (load GPR:$rs1)), + (load_instr GPR:$rs1)>; // Store - def : Pat<(store type:$rs2, BaseAddr:$rs1), - (store_instr reg_class:$rs2, BaseAddr:$rs1)>; + def : Pat<(store type:$rs2, GPR:$rs1), + (store_instr reg_class:$rs2, 
GPR:$rs1)>; } multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> @@ -65,11 +65,11 @@ multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX); defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX); // Load - def : Pat<(m.Mask (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1, m.AVL, m.Log2SEW)>; + def : Pat<(m.Mask (load GPR:$rs1)), + (load_instr GPR:$rs1, m.AVL, m.Log2SEW)>; // Store - def : Pat<(store m.Mask:$rs2, BaseAddr:$rs1), - (store_instr VR:$rs2, BaseAddr:$rs1, m.AVL, m.Log2SEW)>; + def : Pat<(store m.Mask:$rs2, GPR:$rs1), + (store_instr VR:$rs2, GPR:$rs1, m.AVL, m.Log2SEW)>; } class VPatBinarySDNode_VV<SDNode vop, @@ -1038,10 +1038,14 @@ let Predicates = [HasVInstructionsAnyF] in foreach vti = AllFloatVectors in { // Fold store of vmv.f.s to a vse with VL=1. defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX); - def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), BaseAddr:$rs1), - (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; - def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), BaseAddr:$rs1), - (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; + + let AddedComplexity = 2 in { + // Add complexity to increase the priority of this pattern being matched. 
+ def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), GPR:$rs1), + (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>; + def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), GPR:$rs1), + (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>; + } defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_", vti.ScalarSuffix, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 081f61617d59..49306bb0f4e2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -76,13 +76,13 @@ def riscv_urem_vl : SDNode<"RISCVISD::UREM_VL", SDT_RISCVIntBinOp_VL>; def riscv_shl_vl : SDNode<"RISCVISD::SHL_VL", SDT_RISCVIntBinOp_VL>; def riscv_sra_vl : SDNode<"RISCVISD::SRA_VL", SDT_RISCVIntBinOp_VL>; def riscv_srl_vl : SDNode<"RISCVISD::SRL_VL", SDT_RISCVIntBinOp_VL>; -def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL>; -def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL>; -def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL>; -def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL>; +def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; -def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL>; -def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL>; +def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_ssubsat_vl : 
SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>; def riscv_usubsat_vl : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>; @@ -94,8 +94,8 @@ def riscv_fneg_vl : SDNode<"RISCVISD::FNEG_VL", SDT_RISCVFPUnOp_VL>; def riscv_fabs_vl : SDNode<"RISCVISD::FABS_VL", SDT_RISCVFPUnOp_VL>; def riscv_fsqrt_vl : SDNode<"RISCVISD::FSQRT_VL", SDT_RISCVFPUnOp_VL>; def riscv_fcopysign_vl : SDNode<"RISCVISD::FCOPYSIGN_VL", SDT_RISCVFPBinOp_VL>; -def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL>; -def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL>; +def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>; +def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>; def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 9532d1dd3dd2..02ae4f88d56a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -83,13 +83,13 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{ def BCLRXForm : SDNodeXForm<imm, [{ // Find the lowest 0. - return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(), + return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), SDLoc(N), N->getValueType(0)); }]>; def BSETINVXForm : SDNodeXForm<imm, [{ // Find the lowest 1. 
- return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(), + return CurDAG->getTargetConstant(countTrailingZeros(N->getZExtValue()), SDLoc(N), N->getValueType(0)); }]>; @@ -239,6 +239,10 @@ def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{ return !C || !isInt<12>(C->getSExtValue()); }]>; +def sh1add_op : ComplexPattern<XLenVT, 1, "selectSH1ADDOp", [], [], 6>; +def sh2add_op : ComplexPattern<XLenVT, 1, "selectSH2ADDOp", [], [], 6>; +def sh3add_op : ComplexPattern<XLenVT, 1, "selectSH3ADDOp", [], [], 6>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -1095,6 +1099,14 @@ def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), non_imm12:$rs2), def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), non_imm12:$rs2), (SH3ADD GPR:$rs1, GPR:$rs2)>; +// More complex cases use a ComplexPattern. +def : Pat<(add sh1add_op:$rs1, non_imm12:$rs2), + (SH1ADD sh1add_op:$rs1, GPR:$rs2)>; +def : Pat<(add sh2add_op:$rs1, non_imm12:$rs2), + (SH2ADD sh2add_op:$rs1, GPR:$rs2)>; +def : Pat<(add sh3add_op:$rs1, non_imm12:$rs2), + (SH3ADD sh3add_op:$rs1, GPR:$rs2)>; + def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), @@ -1190,18 +1202,6 @@ def : Pat<(i64 (add (and GPR:$rs1, 0x3FFFFFFFC), non_imm12:$rs2)), (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>; def : Pat<(i64 (add (and GPR:$rs1, 0x7FFFFFFF8), non_imm12:$rs2)), (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>; - -// Use SRLIW to shift out the LSBs and zero the upper 32-bits. Use SHXADD to -// shift zeros into the LSBs the addition shl amount. 
-def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 1)), - non_imm12:$rs2)), - (SH2ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; -def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 2)), - non_imm12:$rs2)), - (SH3ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; -def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFC), (i64 1)), - non_imm12:$rs2)), - (SH3ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index 1fc424411c12..dad0aa476471 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -293,8 +293,16 @@ static void updateOperands(MachineInstr &MI, RegImmPair OldRegImm, assert((isCompressibleLoad(MI) || isCompressibleStore(MI)) && "Unsupported instruction for this optimization."); + int SkipN = 0; + + // Skip the first (value) operand to a store instruction (except if the store + // offset is zero) in order to avoid an incorrect transformation. + // e.g. sd a0, 808(a0) to addi a2, a0, 768; sd a2, 40(a2) + if (isCompressibleStore(MI) && OldRegImm.Imm != 0) + SkipN = 1; + // Update registers - for (MachineOperand &MO : MI.operands()) + for (MachineOperand &MO : drop_begin(MI.operands(), SkipN)) if (MO.isReg() && MO.getReg() == OldRegImm.Reg) { // Do not update operands that define the old register. // diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 43af1802d706..bafcf47b82e4 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -53,6 +53,20 @@ def WriteVLDFF8 : SchedWrite; def WriteVLDFF16 : SchedWrite; def WriteVLDFF32 : SchedWrite; def WriteVLDFF64 : SchedWrite; +// 7.8. 
Vector Segment Instructions +foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + def WriteVLSEG # nf # e # eew : SchedWrite; + def WriteVSSEG # nf # e # eew : SchedWrite; + def WriteVLSEGFF # nf # e # eew : SchedWrite; + def WriteVLSSEG # nf # e # eew : SchedWrite; + def WriteVSSSEG # nf # e # eew : SchedWrite; + def WriteVLUXSEG # nf # e # eew : SchedWrite; + def WriteVLOXSEG # nf # e # eew : SchedWrite; + def WriteVSUXSEG # nf # e # eew : SchedWrite; + def WriteVSOXSEG # nf # e # eew : SchedWrite; + } +} // 7.9. Vector Whole Register Instructions def WriteVLD1R8 : SchedWrite; def WriteVLD1R16 : SchedWrite; @@ -538,6 +552,20 @@ def : WriteRes<WriteVST1R, []>; def : WriteRes<WriteVST2R, []>; def : WriteRes<WriteVST4R, []>; def : WriteRes<WriteVST8R, []>; +// Vector Segment Loads and Stores +foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + def : WriteRes <!cast<SchedWrite>("WriteVLSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLSEGFF" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLUXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLOXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSUXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSOXSEG" # nf # "e" # eew), []>; + } +} // 12. 
Vector Integer Arithmetic Instructions def : WriteRes<WriteVIALUV, []>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 7caf0fedb2ca..96c46fb7554f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -57,6 +57,10 @@ public: bool shouldExpandReduction(const IntrinsicInst *II) const; bool supportsScalableVectors() const { return ST->hasVInstructions(); } + PredicationStyle emitGetActiveLaneMask() const { + return ST->hasVInstructions() ? PredicationStyle::Data + : PredicationStyle::None; + } Optional<unsigned> getMaxVScale() const; Optional<unsigned> getVScaleForTuning() const; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp index d953bc590473..f726f42c9bcb 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp @@ -46,12 +46,6 @@ public: void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -110,9 +104,6 @@ static void emitUntypedInstrOperands(const MCInst &MI, EndianWriter &OSE) { void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - auto Features = computeAvailableFeatures(STI.getFeatureBits()); - verifyInstructionPredicates(MI, Features); - EndianWriter OSE(OS, support::little); // Encode the first 32 SPIR-V bytes with the number of args and the opcode. 
@@ -128,5 +119,4 @@ void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, emitUntypedInstrOperands(MI, OSE); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SPIRVGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp index 6b8b4a73af92..62ce15550ae7 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SPIRVGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h index 4009fa96aa68..abc8df34be0a 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h @@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createSPIRVObjectTargetWriter(); // Defines symbolic names for the SPIR-V instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SPIRVGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 0de232651377..605bf949187f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -215,6 +215,9 @@ void SPIRVAsmPrinter::outputInstruction(const MachineInstr *MI) { } void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) { + SPIRV_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + if (!MAI->getSkipEmission(MI)) outputInstruction(MI); diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index df07a126eeea..5b6b82aebf30 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -68,6 +68,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const { assert(GR && "Must initialize the SPIRV type registry before lowering args."); + GR->setCurrentFunc(MIRBuilder.getMF()); // Assign types and names to all args, and store their types for later. 
SmallVector<Register, 4> ArgTypeVRegs; @@ -114,6 +115,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto MRI = MIRBuilder.getMRI(); Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass); + if (F.isDeclaration()) + GR->add(&F, &MIRBuilder.getMF(), FuncVReg); auto *FTy = F.getFunctionType(); auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder); @@ -136,6 +139,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, MIRBuilder.buildInstr(SPIRV::OpFunctionParameter) .addDef(VRegs[i][0]) .addUse(ArgTypeVRegs[i]); + if (F.isDeclaration()) + GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]); } // Name the function. if (F.hasName()) @@ -165,6 +170,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (Info.OrigRet.Regs.size() > 1) return false; + GR->setCurrentFunc(MIRBuilder.getMF()); Register ResVReg = Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0]; // Emit a regular OpFunctionCall. If it's an externally declared function, diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp new file mode 100644 index 000000000000..57cd4bafd351 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -0,0 +1,95 @@ +//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// General infrastructure for keeping track of the values that according to +// the SPIR-V binary layout should be global to the whole module. 
+// +//===----------------------------------------------------------------------===// + +#include "SPIRVDuplicatesTracker.h" + +using namespace llvm; + +template <typename T> +void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry( + SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry) { + for (auto &TPair : DT.getAllUses()) { + for (auto &RegPair : TPair.second) { + const MachineFunction *MF = RegPair.first; + Register R = RegPair.second; + MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R); + if (!MI) + continue; + Reg2Entry[&MI->getOperand(0)] = &TPair.second; + } + } +} + +void SPIRVGeneralDuplicatesTracker::buildDepsGraph( + std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI = nullptr) { + SPIRVReg2EntryTy Reg2Entry; + prebuildReg2Entry(TT, Reg2Entry); + prebuildReg2Entry(CT, Reg2Entry); + prebuildReg2Entry(GT, Reg2Entry); + prebuildReg2Entry(FT, Reg2Entry); + prebuildReg2Entry(AT, Reg2Entry); + + for (auto &Op2E : Reg2Entry) { + SPIRV::DTSortableEntry *E = Op2E.second; + Graph.push_back(E); + for (auto &U : *E) { + const MachineRegisterInfo &MRI = U.first->getRegInfo(); + MachineInstr *MI = MRI.getUniqueVRegDef(U.second); + if (!MI) + continue; + assert(MI && MI->getParent() && "No MachineInstr created yet"); + for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg()) + continue; + MachineOperand *RegOp = &MRI.getVRegDef(Op.getReg())->getOperand(0); + assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) || + Reg2Entry.count(RegOp)); + if (Reg2Entry.count(RegOp)) + E->addDep(Reg2Entry[RegOp]); + } + + if (E->getIsFunc()) { + MachineInstr *Next = MI->getNextNode(); + if (Next && (Next->getOpcode() == SPIRV::OpFunction || + Next->getOpcode() == SPIRV::OpFunctionParameter)) { + E->addDep(Reg2Entry[&Next->getOperand(0)]); + } + } + } + } + + if (MMI) { + const Module *M = MMI->getModule(); + for (auto F = M->begin(), E = M->end(); F != E; ++F) { + const 
MachineFunction *MF = MMI->getMachineFunction(*F); + if (!MF) + continue; + for (const MachineBasicBlock &MBB : *MF) { + for (const MachineInstr &CMI : MBB) { + MachineInstr &MI = const_cast<MachineInstr &>(CMI); + MI.dump(); + if (MI.getNumExplicitDefs() > 0 && + Reg2Entry.count(&MI.getOperand(0))) { + dbgs() << "\t["; + for (SPIRV::DTSortableEntry *D : + Reg2Entry.lookup(&MI.getOperand(0))->getDeps()) + dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", "; + dbgs() << "]\n"; + } + } + } + } + } +}
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h new file mode 100644 index 000000000000..58ae1f86ce42 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -0,0 +1,174 @@ +//===-- SPIRVDuplicatesTracker.h - SPIR-V Duplicates Tracker ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// General infrastructure for keeping track of the values that according to +// the SPIR-V binary layout should be global to the whole module. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H + +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "MCTargetDesc/SPIRVMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" + +#include <type_traits> + +namespace llvm { +namespace SPIRV { +// NOTE: using MapVector instead of DenseMap because it helps getting +// everything ordered in a stable manner for a price of extra (NumKeys)*PtrSize +// memory and expensive removals which do not happen anyway. +class DTSortableEntry : public MapVector<const MachineFunction *, Register> { + SmallVector<DTSortableEntry *, 2> Deps; + + struct FlagsTy { + unsigned IsFunc : 1; + unsigned IsGV : 1; + // NOTE: bit-field default init is a C++20 feature. + FlagsTy() : IsFunc(0), IsGV(0) {} + }; + FlagsTy Flags; + +public: + // Common hoisting utility doesn't support function, because their hoisting + // require hoisting of params as well. 
+ bool getIsFunc() const { return Flags.IsFunc; } + bool getIsGV() const { return Flags.IsGV; } + void setIsFunc(bool V) { Flags.IsFunc = V; } + void setIsGV(bool V) { Flags.IsGV = V; } + + const SmallVector<DTSortableEntry *, 2> &getDeps() const { return Deps; } + void addDep(DTSortableEntry *E) { Deps.push_back(E); } +}; +} // namespace SPIRV + +template <typename KeyTy> class SPIRVDuplicatesTrackerBase { +public: + // NOTE: using MapVector instead of DenseMap helps getting everything ordered + // in a stable manner for a price of extra (NumKeys)*PtrSize memory and + // expensive removals which don't happen anyway. + using StorageTy = MapVector<KeyTy, SPIRV::DTSortableEntry>; + +private: + StorageTy Storage; + +public: + void add(KeyTy V, const MachineFunction *MF, Register R) { + if (find(V, MF).isValid()) + return; + + Storage[V][MF] = R; + if (std::is_same<Function, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>() || + std::is_same<Argument, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>()) + Storage[V].setIsFunc(true); + if (std::is_same<GlobalVariable, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>()) + Storage[V].setIsGV(true); + } + + Register find(KeyTy V, const MachineFunction *MF) const { + auto iter = Storage.find(V); + if (iter != Storage.end()) { + auto Map = iter->second; + auto iter2 = Map.find(MF); + if (iter2 != Map.end()) + return iter2->second; + } + return Register(); + } + + const StorageTy &getAllUses() const { return Storage; } + +private: + StorageTy &getAllUses() { return Storage; } + + // The friend class needs to have access to the internal storage + // to be able to build dependency graph, can't declare only one + // function a 'friend' due to the incomplete declaration at this point + // and mutual dependency problems. 
+ friend class SPIRVGeneralDuplicatesTracker; +}; + +template <typename T> +class SPIRVDuplicatesTracker : public SPIRVDuplicatesTrackerBase<const T *> {}; + +class SPIRVGeneralDuplicatesTracker { + SPIRVDuplicatesTracker<Type> TT; + SPIRVDuplicatesTracker<Constant> CT; + SPIRVDuplicatesTracker<GlobalVariable> GT; + SPIRVDuplicatesTracker<Function> FT; + SPIRVDuplicatesTracker<Argument> AT; + + // NOTE: using MOs instead of regs to get rid of MF dependency to be able + // to use flat data structure. + // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness + // but makes LITs more stable, should prefer DenseMap still due to + // significant perf difference. + using SPIRVReg2EntryTy = + MapVector<MachineOperand *, SPIRV::DTSortableEntry *>; + + template <typename T> + void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT, + SPIRVReg2EntryTy &Reg2Entry); + +public: + void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI); + + void add(const Type *T, const MachineFunction *MF, Register R) { + TT.add(T, MF, R); + } + + void add(const Constant *C, const MachineFunction *MF, Register R) { + CT.add(C, MF, R); + } + + void add(const GlobalVariable *GV, const MachineFunction *MF, Register R) { + GT.add(GV, MF, R); + } + + void add(const Function *F, const MachineFunction *MF, Register R) { + FT.add(F, MF, R); + } + + void add(const Argument *Arg, const MachineFunction *MF, Register R) { + AT.add(Arg, MF, R); + } + + Register find(const Type *T, const MachineFunction *MF) { + return TT.find(const_cast<Type *>(T), MF); + } + + Register find(const Constant *C, const MachineFunction *MF) { + return CT.find(const_cast<Constant *>(C), MF); + } + + Register find(const GlobalVariable *GV, const MachineFunction *MF) { + return GT.find(const_cast<GlobalVariable *>(GV), MF); + } + + Register find(const Function *F, const MachineFunction *MF) { + return FT.find(const_cast<Function *>(F), MF); + } + + Register find(const Argument 
*Arg, const MachineFunction *MF) { + return AT.find(const_cast<Argument *>(Arg), MF); + } +}; +} // namespace llvm +#endif
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 02a6905a1abc..5f890c003cbc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -101,7 +101,6 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, SPIRVType *SpvType, bool EmitIR) { auto &MF = MIRBuilder.getMF(); - Register Res; const IntegerType *LLVMIntTy; if (SpvType) LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType)); @@ -110,15 +109,18 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, // Find a constant in DT or build a new one. const auto ConstInt = ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val); - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); - assignTypeToVReg(LLVMIntTy, Res, MIRBuilder); - if (EmitIR) - MIRBuilder.buildConstant(Res, *ConstInt); - else - MIRBuilder.buildInstr(SPIRV::OpConstantI) - .addDef(Res) - .addImm(ConstInt->getSExtValue()); + Register Res = DT.find(ConstInt, &MF); + if (!Res.isValid()) { + unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; + Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); + assignTypeToVReg(LLVMIntTy, Res, MIRBuilder); + if (EmitIR) + MIRBuilder.buildConstant(Res, *ConstInt); + else + MIRBuilder.buildInstr(SPIRV::OpConstantI) + .addDef(Res) + .addImm(ConstInt->getSExtValue()); + } return Res; } @@ -126,7 +128,6 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType) { auto &MF = MIRBuilder.getMF(); - Register Res; const Type *LLVMFPTy; if (SpvType) { LLVMFPTy = getTypeForSPIRVType(SpvType); @@ -136,10 +137,13 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, } // Find a constant in DT or build a new one. 
const auto ConstFP = ConstantFP::get(LLVMFPTy->getContext(), Val); - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); - assignTypeToVReg(LLVMFPTy, Res, MIRBuilder); - MIRBuilder.buildFConstant(Res, *ConstFP); + Register Res = DT.find(ConstFP, &MF); + if (!Res.isValid()) { + unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; + Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); + assignTypeToVReg(LLVMFPTy, Res, MIRBuilder); + MIRBuilder.buildFConstant(Res, *ConstFP); + } return Res; } @@ -184,6 +188,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( *Subtarget.getRegBankInfo()); } Reg = MIB->getOperand(0).getReg(); + DT.add(GVar, &MIRBuilder.getMF(), Reg); // Set to Reg the same type as ResVReg has. auto MRI = MIRBuilder.getMRI(); @@ -318,10 +323,11 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const { SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType( const Type *Type, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AccessQual, bool EmitIR) { + Register Reg = DT.find(Type, &MIRBuilder.getMF()); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR); - VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType; - SPIRVToLLVMType[SpirvType] = Type; - return SpirvType; + return restOfCreateSPIRVType(Type, SpirvType); } bool SPIRVGlobalRegistry::isScalarOfType(Register VReg, @@ -387,17 +393,21 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth, MIRBuilder); } -SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(Type *LLVMTy, - MachineInstrBuilder MIB) { - SPIRVType *SpirvType = MIB; +SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy, + SPIRVType *SpirvType) { + assert(CurMF == SpirvType->getMF()); VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType; 
SPIRVToLLVMType[SpirvType] = LLVMTy; + DT.add(LLVMTy, CurMF, getSPIRVTypeID(SpirvType)); return SpirvType; } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType( unsigned BitWidth, MachineInstr &I, const SPIRVInstrInfo &TII) { Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), BitWidth); + Register Reg = DT.find(LLVMTy, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeInt)) .addDef(createTypeVReg(CurMF->getRegInfo())) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 952ab4c13e29..13dcc20a3e0a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "SPIRVDuplicatesTracker.h" #include "SPIRVInstrInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -30,7 +31,10 @@ class SPIRVGlobalRegistry { // where Reg = OpType... // while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. 
not // type-declaring ones) - DenseMap<MachineFunction *, DenseMap<Register, SPIRVType *>> VRegToTypeMap; + DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>> + VRegToTypeMap; + + SPIRVGeneralDuplicatesTracker DT; DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType; @@ -48,6 +52,39 @@ public: MachineFunction *CurMF; + void add(const Constant *C, MachineFunction *MF, Register R) { + DT.add(C, MF, R); + } + + void add(const GlobalVariable *GV, MachineFunction *MF, Register R) { + DT.add(GV, MF, R); + } + + void add(const Function *F, MachineFunction *MF, Register R) { + DT.add(F, MF, R); + } + + void add(const Argument *Arg, MachineFunction *MF, Register R) { + DT.add(Arg, MF, R); + } + + Register find(const Constant *C, MachineFunction *MF) { + return DT.find(C, MF); + } + + Register find(const GlobalVariable *GV, MachineFunction *MF) { + return DT.find(GV, MF); + } + + Register find(const Function *F, MachineFunction *MF) { + return DT.find(F, MF); + } + + void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI = nullptr) { + DT.buildDepsGraph(Graph, MMI); + } + // Get or create a SPIR-V type corresponding the given LLVM IR type, // and map it to the given VReg by creating an ASSIGN_TYPE instruction. 
SPIRVType *assignTypeToVReg( @@ -136,7 +173,7 @@ private: SPIRVType *getOpTypeFunction(SPIRVType *RetType, const SmallVectorImpl<SPIRVType *> &ArgTypes, MachineIRBuilder &MIRBuilder); - SPIRVType *restOfCreateSPIRVType(Type *LLVMTy, MachineInstrBuilder MIB); + SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType); public: Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 9294a60506a8..90b921a06f21 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -807,23 +807,29 @@ void SPIRVInstructionSelector::renderImm32(MachineInstrBuilder &MIB, Register SPIRVInstructionSelector::buildI32Constant(uint32_t Val, MachineInstr &I, const SPIRVType *ResType) const { + Type *LLVMTy = IntegerType::get(GR.CurMF->getFunction().getContext(), 32); const SPIRVType *SpvI32Ty = ResType ? ResType : GR.getOrCreateSPIRVIntegerType(32, I, TII); - Register NewReg; - NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); - MachineInstr *MI; - MachineBasicBlock &BB = *I.getParent(); - if (Val == 0) { - MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) - .addDef(NewReg) - .addUse(GR.getSPIRVTypeID(SpvI32Ty)); - } else { - MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) - .addDef(NewReg) - .addUse(GR.getSPIRVTypeID(SpvI32Ty)) - .addImm(APInt(32, Val).getZExtValue()); + // Find a constant in DT or build a new one. 
+ auto ConstInt = ConstantInt::get(LLVMTy, Val); + Register NewReg = GR.find(ConstInt, GR.CurMF); + if (!NewReg.isValid()) { + NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + GR.add(ConstInt, GR.CurMF, NewReg); + MachineInstr *MI; + MachineBasicBlock &BB = *I.getParent(); + if (Val == 0) { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)); + } else { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)) + .addImm(APInt(32, Val).getZExtValue()); + } + constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); } - constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); return NewReg; } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index fa78dd7942c6..a39df5234935 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -28,6 +28,11 @@ using namespace llvm; #define DEBUG_TYPE "spirv-module-analysis" +static cl::opt<bool> + SPVDumpDeps("spv-dump-deps", + cl::desc("Dump MIR with SPIR-V dependencies info"), + cl::Optional, cl::init(false)); + char llvm::SPIRVModuleAnalysis::ID = 0; namespace llvm { @@ -113,6 +118,83 @@ static bool findSameInstrInMS(const MachineInstr &A, return false; } +// Collect MI which defines the register in the given machine function. 
+static void collectDefInstr(Register Reg, const MachineFunction *MF, + SPIRV::ModuleAnalysisInfo *MAI, + SPIRV::ModuleSectionType MSType, + bool DoInsert = true) { + assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias"); + MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg); + assert(MI && "There should be an instruction that defines the register"); + MAI->setSkipEmission(MI); + if (DoInsert) + MAI->MS[MSType].push_back(MI); +} + +void SPIRVModuleAnalysis::collectGlobalEntities( + const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, + SPIRV::ModuleSectionType MSType, + std::function<bool(const SPIRV::DTSortableEntry *)> Pred, + bool UsePreOrder) { + DenseSet<const SPIRV::DTSortableEntry *> Visited; + for (const auto *E : DepsGraph) { + std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil; + // NOTE: here we prefer recursive approach over iterative because + // we don't expect depchains long enough to cause SO. + RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred, + &RecHoistUtil](const SPIRV::DTSortableEntry *E) { + if (Visited.count(E) || !Pred(E)) + return; + Visited.insert(E); + + // Traversing deps graph in post-order allows us to get rid of + // register aliases preprocessing. + // But pre-order is required for correct processing of function + // declaration and arguments processing. 
+ if (!UsePreOrder) + for (auto *S : E->getDeps()) + RecHoistUtil(S); + + Register GlobalReg = Register::index2VirtReg(MAI.getNextID()); + bool IsFirst = true; + for (auto &U : *E) { + const MachineFunction *MF = U.first; + Register Reg = U.second; + MAI.setRegisterAlias(MF, Reg, GlobalReg); + if (!MF->getRegInfo().getUniqueVRegDef(Reg)) + continue; + collectDefInstr(Reg, MF, &MAI, MSType, IsFirst); + IsFirst = false; + if (E->getIsGV()) + MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg)); + } + + if (UsePreOrder) + for (auto *S : E->getDeps()) + RecHoistUtil(S); + }; + RecHoistUtil(E); + } +} + +// The function initializes global register alias table for types, consts, +// global vars and func decls and collects these instruction for output +// at module level. Also it collects explicit OpExtension/OpCapability +// instructions. +void SPIRVModuleAnalysis::processDefInstrs(const Module &M) { + std::vector<SPIRV::DTSortableEntry *> DepsGraph; + + GR->buildDepsGraph(DepsGraph, SPVDumpDeps ? MMI : nullptr); + + collectGlobalEntities( + DepsGraph, SPIRV::MB_TypeConstVars, + [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false); + + collectGlobalEntities( + DepsGraph, SPIRV::MB_ExtFuncDecls, + [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true); +} + // Look for IDs declared with Import linkage, and map the imported name string // to the register defining that variable (which will usually be the result of // an OpFunction). This lets us call externally imported functions using @@ -146,10 +228,9 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI, // numbering has already occurred by this point. We can directly compare reg // arguments when detecting duplicates. 
static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, - SPIRV::ModuleSectionType MSType, - bool IsConstOrType = false) { + SPIRV::ModuleSectionType MSType) { MAI.setSkipEmission(&MI); - if (findSameInstrInMS(MI, MSType, MAI, IsConstOrType, IsConstOrType ? 1 : 0)) + if (findSameInstrInMS(MI, MSType, MAI, false)) return; // Found a duplicate, so don't add it. // No duplicates, so add it. MAI.MS[MSType].push_back(&MI); @@ -163,18 +244,11 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { continue; MachineFunction *MF = MMI->getMachineFunction(*F); assert(MF); - unsigned FCounter = 0; for (MachineBasicBlock &MBB : *MF) for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == SPIRV::OpFunction) - FCounter++; if (MAI.getSkipEmission(&MI)) continue; const unsigned OpCode = MI.getOpcode(); - const bool IsFuncOrParm = - OpCode == SPIRV::OpFunction || OpCode == SPIRV::OpFunctionParameter; - const bool IsConstOrType = - TII->isConstantInstr(MI) || TII->isTypeDeclInstr(MI); if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) { collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames); } else if (OpCode == SPIRV::OpEntryPoint) { @@ -182,12 +256,6 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { } else if (TII->isDecorationInstr(MI)) { collectOtherInstr(MI, MAI, SPIRV::MB_Annotations); collectFuncNames(MI, *F); - } else if (IsConstOrType || (FCounter > 1 && IsFuncOrParm)) { - // Now OpSpecConstant*s are not in DT, - // but they need to be collected anyway. - enum SPIRV::ModuleSectionType Type = - IsFuncOrParm ? SPIRV::MB_ExtFuncDecls : SPIRV::MB_TypeConstVars; - collectOtherInstr(MI, MAI, Type, IsConstOrType); } else if (OpCode == SPIRV::OpFunction) { collectFuncNames(MI, *F); } @@ -239,6 +307,7 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { // TODO: Process type/const/global var/func decl instructions, number their // destination registers from 0 to N, collect Extensions and Capabilities. 
+ processDefInstrs(M); // Number rest of registers from N+1 onwards. numberRegistersGlobally(M); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 1bef13d458c1..585868909d28 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "SPIRVDuplicatesTracker.h" #include "SPIRVSubtarget.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" @@ -123,6 +124,11 @@ public: private: void setBaseInfo(const Module &M); template <typename T> void collectTypesConstsVars(); + void collectGlobalEntities( + const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, + SPIRV::ModuleSectionType MSType, + std::function<bool(const SPIRV::DTSortableEntry *)> Pred, + bool UsePreOrder); void processDefInstrs(const Module &M); void collectFuncNames(MachineInstr &MI, const Function &F); void processOtherInstrs(const Module &M); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index d75d41b35838..ee460002fc58 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -44,12 +44,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { class SparcMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; MCContext &Ctx; public: - SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + SparcMCCodeEmitter(const MCInstrInfo &, MCContext &ctx) + : Ctx(ctx) {} SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete; SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete; ~SparcMCCodeEmitter() override = default; @@ -84,12 +83,6 @@ public: unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> 
&Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -97,9 +90,6 @@ private: void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI); support::endian::write(OS, Bits, Ctx.getAsmInfo()->isLittleEndian() ? support::little @@ -253,7 +243,6 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SparcGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 49b75b7e0bd1..b11c786e7856 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -24,6 +24,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SparcGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 7ef043d9df40..8e6a9ebdb2dd 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -46,6 +46,7 @@ std::unique_ptr<MCObjectTargetWriter> createSparcELFObjectWriter(bool Is64Bit, // Defines symbolic names for the Sparc instructions. 
// #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SparcGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index f6f9c0a1de81..c8961d507c72 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -250,6 +250,8 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, } void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) { + Sparc_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 242f566da2c9..1a71ff28424f 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -150,23 +150,13 @@ private: return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC24DBL, 3, false); } - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace -void SystemZMCCodeEmitter:: -encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - +void SystemZMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { MemOpsEmitted = 0; uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); unsigned Size = MCII.get(MI.getOpcode()).getSize(); @@ -329,7 +319,6 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include 
"SystemZGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 03141ecf551d..08886507fdb7 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -23,6 +23,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SystemZGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index db4485423416..f2bfc9ac48e5 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -95,6 +95,7 @@ std::unique_ptr<MCObjectTargetWriter> createSystemZObjectWriter(uint8_t OSABI); // Defines symbolic names for the SystemZ instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SystemZGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 6fb080607f51..1d55bf9a5804 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -143,6 +143,9 @@ void SystemZAsmPrinter::emitCallInformation(CallType CT) { } void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { + SystemZ_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + SystemZMCInstLower Lower(MF->getContext(), *this); MCInst LoweredMI; switch (MI->getOpcode()) { diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index a7ea5e1e4bf8..fdd82a01f211 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -162,11 +162,7 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>; //===----------------------------------------------------------------------===// // z/OS XPLINK64 callee-saved registers //===----------------------------------------------------------------------===// -// %R7D is volatile by the spec, but it must be saved in the prologue by -// any non-leaf function and restored in the epilogue for use by the -// return instruction so it functions exactly like a callee-saved register. 
-def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15), - (sequence "R%dD", 4, 4), +def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15), (sequence "F%dD", 15, 8))>; def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 43bc7426cfa8..975eb8862e82 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -918,72 +918,74 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>(); const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + auto &GRRegClass = SystemZ::GR64BitRegClass; + + // For non-leaf functions: + // - the address of callee (entry point) register R6 must be saved + CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister())); + CSI.back().setRestored(false); + + // The return address register R7 must be saved and restored. + CSI.push_back(CalleeSavedInfo(Regs.getReturnFunctionAddressRegister())); + + // If the function needs a frame pointer, or if the backchain pointer should + // be stored, then save the stack pointer register R4. + if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain")) + CSI.push_back(CalleeSavedInfo(Regs.getStackPointerRegister())); // Scan the call-saved GPRs and find the bounds of the register spill area. 
- unsigned LowGPR = 0; - int LowOffset = INT32_MAX; - unsigned HighGPR = LowGPR; + Register LowRestoreGPR = 0; + int LowRestoreOffset = INT32_MAX; + Register LowSpillGPR = 0; + int LowSpillOffset = INT32_MAX; + Register HighGPR = 0; int HighOffset = -1; - unsigned RegSP = Regs.getStackPointerRegister(); - auto &GRRegClass = SystemZ::GR64BitRegClass; - const unsigned RegSize = 8; - - auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) { - for (auto &CS : CSIList) { - Register Reg = CS.getReg(); - int Offset = RegSpillOffsets[Reg]; - if (Offset >= 0) { - if (GRRegClass.contains(Reg)) { - if (LowOffset > Offset) { - LowOffset = Offset; - LowGPR = Reg; - } + for (auto &CS : CSI) { + Register Reg = CS.getReg(); + int Offset = RegSpillOffsets[Reg]; + if (Offset >= 0) { + if (GRRegClass.contains(Reg)) { + if (LowSpillOffset > Offset) { + LowSpillOffset = Offset; + LowSpillGPR = Reg; + } + if (CS.isRestored() && LowRestoreOffset > Offset) { + LowRestoreOffset = Offset; + LowRestoreGPR = Reg; + } - if (Offset > HighOffset) { - HighOffset = Offset; - HighGPR = Reg; - } + if (Offset > HighOffset) { + HighOffset = Offset; + HighGPR = Reg; } + // Non-volatile GPRs are saved in the dedicated register save area at + // the bottom of the stack and are not truly part of the "normal" stack + // frame. Mark the frame index as NoAlloc to indicate it as such. 
+ unsigned RegSize = 8; int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset); CS.setFrameIdx(FrameIdx); - } else - CS.setFrameIdx(INT32_MAX); + MFFrame.setStackID(FrameIdx, TargetStackID::NoAlloc); + } + } else { + Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + Align Alignment = TRI->getSpillAlign(*RC); + unsigned Size = TRI->getSpillSize(*RC); + Alignment = std::min(Alignment, getStackAlign()); + int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true); + CS.setFrameIdx(FrameIdx); } - }; - - std::vector<CalleeSavedInfo> Spills; - - // For non-leaf functions: - // - the address of callee (entry point) register R6 must be saved - Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister())); - - // If the function needs a frame pointer, or if the backchain pointer should - // be stored, then save the stack pointer register R4. - if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain")) - Spills.push_back(CalleeSavedInfo(RegSP)); + } // Save the range of call-saved registers, for use by the // prologue/epilogue inserters. - ProcessCSI(CSI); - MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset); + if (LowRestoreGPR) + MFI->setRestoreGPRRegs(LowRestoreGPR, HighGPR, LowRestoreOffset); // Save the range of call-saved registers, for use by the epilogue inserter. - ProcessCSI(Spills); - MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset); - - // Create spill slots for the remaining registers. 
- for (auto &CS : CSI) { - if (CS.getFrameIdx() != INT32_MAX) - continue; - Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - Align Alignment = TRI->getSpillAlign(*RC); - unsigned Size = TRI->getSpillSize(*RC); - Alignment = std::min(Alignment, getStackAlign()); - int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true); - CS.setFrameIdx(FrameIdx); - } + assert(LowSpillGPR && "Expected registers to spill"); + MFI->setSpillGPRRegs(LowSpillGPR, HighGPR, LowSpillOffset); return true; } @@ -1001,13 +1003,6 @@ void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF, // frame pointer will be clobbered. if (HasFP) SavedRegs.set(Regs.getFramePointerRegister()); - - // If the function is not an XPLeaf function, we need to save the - // return address register. We also always use that register for - // the return instruction, so it needs to be restored in the - // epilogue even though that register is considered to be volatile. - // #TODO: Implement leaf detection. - SavedRegs.set(Regs.getReturnFunctionAddressRegister()); } bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 8f633adbb9ef..29cc2840310d 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -240,6 +240,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO, return SectionKind::getBSS(); } + // Global variables with '!exclude' should get the exclude section kind if + // they have an explicit section and no other metadata. + if (GVar->hasSection()) + if (MDNode *MD = GVar->getMetadata(LLVMContext::MD_exclude)) + if (!MD->getNumOperands()) + return SectionKind::getExclude(); + // If the global is marked constant, we can put it into a mergable section, // a mergable string section, or general .data if it contains relocations. 
if (GVar->isConstant()) { diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index 3eb246f73679..45facd34f84e 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -39,12 +39,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { class VEMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; MCContext &Ctx; public: - VEMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + VEMCCodeEmitter(const MCInstrInfo &, MCContext &ctx) + : Ctx(ctx) {} VEMCCodeEmitter(const VEMCCodeEmitter &) = delete; VEMCCodeEmitter &operator=(const VEMCCodeEmitter &) = delete; ~VEMCCodeEmitter() override = default; @@ -74,12 +73,6 @@ public: uint64_t getRDOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -87,9 +80,6 @@ private: void VEMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); support::endian::write<uint64_t>(OS, Bits, support::little); @@ -155,7 +145,6 @@ uint64_t VEMCCodeEmitter::getRDOpValue(const MCInst &MI, unsigned OpNo, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "VEGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp index f4fbf763e59c..5a562d77f941 100644 --- 
a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp @@ -24,6 +24,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "VEGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h index d8f9d0634c24..935a0bfc0c4c 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createVEELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the VE instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "VEGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp index af69d04a17ca..5553087d6f47 100644 --- a/llvm/lib/Target/VE/VEAsmPrinter.cpp +++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp @@ -325,6 +325,8 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI, } void VEAsmPrinter::emitInstruction(const MachineInstr *MI) { + VE_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); switch (MI->getOpcode()) { default: diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 85285749b4fa..e54453b31354 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -325,22 +325,22 @@ def VEMEMziiAsmOperand : AsmOperandClass { // ASX format uses single assembly instruction format. 
def MEMrri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, ptr_rc, i32imm); + let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm); let ParserMatchClass = VEMEMrriAsmOperand; } def MEMrii : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, i32imm, i32imm); + let MIOperandInfo = (ops ptr_rc, i32imm, i64imm); let ParserMatchClass = VEMEMriiAsmOperand; } def MEMzri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i32imm); + let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm); let ParserMatchClass = VEMEMzriAsmOperand; } def MEMzii : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i32imm); + let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i64imm); let ParserMatchClass = VEMEMziiAsmOperand; } diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp index d175ad26c742..f334af128162 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.cpp +++ b/llvm/lib/Target/VE/VERegisterInfo.cpp @@ -27,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "ve-register-info" + #define GET_REGINFO_TARGET_DESC #include "VEGenRegisterInfo.inc" @@ -133,66 +135,179 @@ static unsigned offsetToDisp(MachineInstr &MI) { return OffDisp; } -static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II, - MachineInstr &MI, const DebugLoc &dl, - unsigned FIOperandNum, int Offset, Register FrameReg) { - // Replace frame index with a frame pointer reference directly. - // VE has 32 bit offset field, so no need to expand a target instruction. - // Directly encode it. +class EliminateFrameIndex { + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const DebugLoc &DL; + MachineBasicBlock &MBB; + MachineBasicBlock::iterator II; + Register clobber; + + // Some helper functions for the ease of instruction building. 
+ MachineFunction &getFunc() const { return *MBB.getParent(); } + inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const { + return TRI.getSubReg(Reg, Idx); + } + inline const MCInstrDesc &get(unsigned Opcode) const { + return TII.get(Opcode); + } + inline MachineInstrBuilder build(const MCInstrDesc &MCID, Register DestReg) { + return BuildMI(MBB, II, DL, MCID, DestReg); + } + inline MachineInstrBuilder build(unsigned InstOpc, Register DestReg) { + return build(get(InstOpc), DestReg); + } + inline MachineInstrBuilder build(const MCInstrDesc &MCID) { + return BuildMI(MBB, II, DL, MCID); + } + inline MachineInstrBuilder build(unsigned InstOpc) { + return build(get(InstOpc)); + } + + // Calculate an address of frame index from a frame register and a given + // offset if the offset doesn't fit in the immediate field. Use a clobber + // register to hold calculated address. + void prepareReplaceFI(MachineInstr &MI, Register &FrameReg, int64_t &Offset, + int64_t Bytes = 0); + // Replace the frame index in \p MI with a frame register and a given offset + // if it fits in the immediate field. Otherwise, use pre-calculated address + // in a clobber regsiter. + void replaceFI(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + + // Expand and eliminate Frame Index of pseudo STQrii and LDQrii. + void processSTQ(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + +public: + EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const DebugLoc &DL, MachineBasicBlock &MBB, + MachineBasicBlock::iterator II) + : TII(TII), TRI(TRI), DL(DL), MBB(MBB), II(II), clobber(VE::SX13) {} + + // Expand and eliminate Frame Index from MI + void processMI(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); +}; + +// Prepare the frame index if it doesn't fit in the immediate field. 
Use +// clobber register to hold calculated address. +void EliminateFrameIndex::prepareReplaceFI(MachineInstr &MI, Register &FrameReg, + int64_t &Offset, int64_t Bytes) { + if (isInt<32>(Offset) && isInt<32>(Offset + Bytes)) { + // If the offset is small enough to fit in the immediate field, directly + // encode it. So, nothing to prepare here. + return; + } + + // If the offset doesn't fit, emit following codes. This clobbers SX13 + // which we always know is available here. + // lea %clobber, Offset@lo + // and %clobber, %clobber, (32)0 + // lea.sl %clobber, Offset@hi(FrameReg, %clobber) + build(VE::LEAzii, clobber).addImm(0).addImm(0).addImm(Lo_32(Offset)); + build(VE::ANDrm, clobber).addReg(clobber).addImm(M0(32)); + build(VE::LEASLrri, clobber) + .addReg(clobber) + .addReg(FrameReg) + .addImm(Hi_32(Offset)); + + // Use clobber register as a frame register and 0 offset + FrameReg = clobber; + Offset = 0; +} + +// Replace the frame index in \p MI with a proper byte and framereg offset. +void EliminateFrameIndex::replaceFI(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(isInt<32>(Offset)); + + // The offset must be small enough to fit in the immediate field after + // call of prepareReplaceFI. Therefore, we directly encode it. 
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false); MI.getOperand(FIOperandNum + offsetToDisp(MI)).ChangeToImmediate(Offset); } +void EliminateFrameIndex::processSTQ(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::STQrii); + LLVM_DEBUG(dbgs() << "processSTQ: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 8); + + Register SrcReg = MI.getOperand(3).getReg(); + Register SrcHiReg = getSubReg(SrcReg, VE::sub_even); + Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd); + // VE stores HiReg to 8(addr) and LoReg to 0(addr) + MachineInstr *StMI = + build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(SrcLoReg); + replaceFI(*StMI, FrameReg, Offset, 0); + // Mutate to 'hi' store. + MI.setDesc(get(VE::STrii)); + MI.getOperand(3).setReg(SrcHiReg); + Offset += 8; + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::LDQrii); + LLVM_DEBUG(dbgs() << "processLDQ: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 8); + + Register DestReg = MI.getOperand(0).getReg(); + Register DestHiReg = getSubReg(DestReg, VE::sub_even); + Register DestLoReg = getSubReg(DestReg, VE::sub_odd); + // VE loads HiReg from 8(addr) and LoReg from 0(addr) + MachineInstr *StMI = + build(VE::LDrii, DestLoReg).addReg(FrameReg).addImm(0).addImm(0); + replaceFI(*StMI, FrameReg, Offset, 1); + MI.setDesc(get(VE::LDrii)); + MI.getOperand(0).setReg(DestHiReg); + Offset += 8; + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + switch (MI.getOpcode()) { + case VE::STQrii: + processSTQ(MI, FrameReg, Offset, FIOperandNum); + return; + case VE::LDQrii: + processLDQ(MI, FrameReg, Offset, FIOperandNum); + return; + } + prepareReplaceFI(MI, FrameReg, Offset); + 
replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; - DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + MachineFunction &MF = *MI.getParent()->getParent(); - const VEFrameLowering *TFI = getFrameLowering(MF); + const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>(); + const VEFrameLowering &TFI = *getFrameLowering(MF); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const VERegisterInfo &TRI = *Subtarget.getRegisterInfo(); + DebugLoc DL = MI.getDebugLoc(); + EliminateFrameIndex EFI(TII, TRI, DL, *MI.getParent(), II); + // Retrieve FrameReg and byte offset for stack slot. Register FrameReg; - int Offset; - Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed(); - + int64_t Offset = + TFI.getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed(); Offset += MI.getOperand(FIOperandNum + offsetToDisp(MI)).getImm(); - if (MI.getOpcode() == VE::STQrii) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - Register SrcReg = MI.getOperand(3).getReg(); - Register SrcHiReg = getSubReg(SrcReg, VE::sub_even); - Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd); - // VE stores HiReg to 8(addr) and LoReg to 0(addr) - MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(VE::STrii)) - .addReg(FrameReg) - .addImm(0) - .addImm(0) - .addReg(SrcLoReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); - MI.setDesc(TII.get(VE::STrii)); - MI.getOperand(3).setReg(SrcHiReg); - Offset += 8; - } else if (MI.getOpcode() == VE::LDQrii) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - Register DestReg = MI.getOperand(0).getReg(); - Register DestHiReg = getSubReg(DestReg, VE::sub_even); - Register DestLoReg = getSubReg(DestReg, VE::sub_odd); - // VE loads HiReg from 8(addr) and 
LoReg from 0(addr) - MachineInstr *StMI = - BuildMI(*MI.getParent(), II, dl, TII.get(VE::LDrii), DestLoReg) - .addReg(FrameReg) - .addImm(0) - .addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); - MI.setDesc(TII.get(VE::LDrii)); - MI.getOperand(0).setReg(DestHiReg); - Offset += 8; - } - - replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg); + EFI.processMI(MI, FrameReg, Offset, FIOperandNum); } Register VERegisterInfo::getFrameRegister(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp index 330eef4c7c2b..f88f298bc603 100644 --- a/llvm/lib/Target/VE/VVPISelLowering.cpp +++ b/llvm/lib/Target/VE/VVPISelLowering.cpp @@ -41,7 +41,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { auto VVPOpcodeOpt = getVVPOpcode(Opcode); if (!VVPOpcodeOpt) return SDValue(); - unsigned VVPOpcode = VVPOpcodeOpt.getValue(); + unsigned VVPOpcode = VVPOpcodeOpt.value(); const bool FromVP = ISD::isVPOpcode(Opcode); // The representative and legalized vector type of this operation. diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index ec72c1de0503..d31715e367ec 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -87,15 +87,14 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc, if (Stack.empty()) { return typeError(ErrorLoc, EVT ? 
StringRef("empty stack while popping ") + - WebAssembly::typeToString(EVT.getValue()) + WebAssembly::typeToString(EVT.value()) : StringRef("empty stack while popping value")); } auto PVT = Stack.pop_back_val(); - if (EVT && EVT.getValue() != PVT) { + if (EVT && EVT.value() != PVT) { return typeError( ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) + - ", expected " + - WebAssembly::typeToString(EVT.getValue())); + ", expected " + WebAssembly::typeToString(EVT.value())); } return false; } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index f52545a65dbb..97dbc35c991b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -26,6 +26,7 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-target-desc" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "WebAssemblyGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 75d5d0675990..b5b12200505b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -124,6 +124,7 @@ enum TOF { // Defines symbolic names for the WebAssembly instructions. 
// #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "WebAssemblyGenInstrInfo.inc" namespace llvm { diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp index e3daf6bfa72e..ef2c77ade8cc 100644 --- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp +++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp @@ -37,4 +37,5 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() { // which have to be in a shared location between CodeGen and MC. #define GET_INSTRMAP_INFO 1 #define GET_INSTRINFO_ENUM 1 +#define GET_INSTRINFO_MC_HELPER_DECLS #include "WebAssemblyGenInstrInfo.inc" diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 0f1655718481..f380b2582c65 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -13,6 +13,7 @@ #include "WebAssemblyTypeUtilities.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" // Get register classes enum. #define GET_REGINFO_ENUM @@ -168,6 +169,11 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { } } +wasm::ValType WebAssembly::regClassToValType(const TargetRegisterClass *RC) { + assert(RC != nullptr); + return regClassToValType(RC->getID()); +} + void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, const SmallVector<MVT, 1> &VTs) { assert(!Sym->getType()); @@ -175,33 +181,28 @@ void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, // Tables are represented as Arrays in LLVM IR therefore // they reach this point as aggregate Array types with an element type // that is a reference type. 
- wasm::ValType Type; + wasm::ValType ValTy; bool IsTable = false; if (GlobalVT->isArrayTy() && WebAssembly::isRefType(GlobalVT->getArrayElementType())) { - MVT VT; IsTable = true; - switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: - VT = MVT::funcref; - break; - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF: - VT = MVT::externref; - break; - default: - report_fatal_error("unhandled address space type"); - } - Type = WebAssembly::toValType(VT); + const Type *ElTy = GlobalVT->getArrayElementType(); + if (WebAssembly::isExternrefType(ElTy)) + ValTy = wasm::ValType::EXTERNREF; + else if (WebAssembly::isFuncrefType(ElTy)) + ValTy = wasm::ValType::FUNCREF; + else + report_fatal_error("unhandled reference type"); } else if (VTs.size() == 1) { - Type = WebAssembly::toValType(VTs[0]); + ValTy = WebAssembly::toValType(VTs[0]); } else report_fatal_error("Aggregate globals not yet implemented"); if (IsTable) { Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); - Sym->setTableType(Type); + Sym->setTableType(ValTy); } else { Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(ValTy), /*Mutable=*/true}); } } diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index 8fc67d37925c..86211700c70a 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ -22,6 +22,9 @@ #include "llvm/Support/MachineValueType.h" namespace llvm { + +class TargetRegisterClass; + namespace WebAssembly { /// Used as immediate MachineOperands for block signatures @@ -108,9 +111,12 @@ std::string signatureToString(const wasm::WasmSignature *Sig); // Convert a MVT into its corresponding wasm ValType. 
wasm::ValType toValType(MVT Type); -// Convert a register class to a wasm ValType. +// Convert a register class ID to a wasm ValType. wasm::ValType regClassToValType(unsigned RC); +// Convert a register class to a wasm ValType. +wasm::ValType regClassToValType(const TargetRegisterClass *RC); + /// Sets a Wasm Symbol Type. void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, const SmallVector<MVT, 1> &VTs); diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp index b87c884c9e4a..277bbee83a6f 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp @@ -179,3 +179,25 @@ MachineInstr *WebAssembly::findCatch(MachineBasicBlock *EHPad) { return &*Pos; return nullptr; } + +unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) { + assert(RC != nullptr); + switch (RC->getID()) { + case WebAssembly::I32RegClassID: + return WebAssembly::COPY_I32; + case WebAssembly::I64RegClassID: + return WebAssembly::COPY_I64; + case WebAssembly::F32RegClassID: + return WebAssembly::COPY_F32; + case WebAssembly::F64RegClassID: + return WebAssembly::COPY_F64; + case WebAssembly::V128RegClassID: + return WebAssembly::COPY_V128; + case WebAssembly::FUNCREFRegClassID: + return WebAssembly::COPY_FUNCREF; + case WebAssembly::EXTERNREFRegClassID: + return WebAssembly::COPY_EXTERNREF; + default: + llvm_unreachable("Unexpected register class"); + } +} diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index cdfc758db7ac..d0639208fda9 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -24,6 +24,7 @@ class MachineInstr; class MachineOperand; class MCContext; class MCSymbolWasm; +class TargetRegisterClass; class WebAssemblyFunctionInfo; class 
WebAssemblySubtarget; @@ -65,6 +66,9 @@ getOrCreateFuncrefCallTableSymbol(MCContext &Ctx, /// instruction found or the catch is in an invalid location. MachineInstr *findCatch(MachineBasicBlock *EHPad); +/// Returns the appropriate copy opcode for the given register class. +unsigned getCopyOpcodeForRegClass(const TargetRegisterClass *RC); + } // end namespace WebAssembly } // end namespace llvm diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 57d51634e849..bcb6cf1b4e1d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -597,6 +597,8 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() { void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) { LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); + WebAssembly_MC::verifyInstructionPredicates(MI->getOpcode(), + Subtarget->getFeatureBits()); switch (MI->getOpcode()) { case WebAssembly::ARGUMENT_i32: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 02e873a0f9a6..d2eb4b29e9fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -781,25 +781,6 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { } } -// Get the appropriate copy opcode for the given register class. 
-static unsigned getCopyOpcode(const TargetRegisterClass *RC) { - if (RC == &WebAssembly::I32RegClass) - return WebAssembly::COPY_I32; - if (RC == &WebAssembly::I64RegClass) - return WebAssembly::COPY_I64; - if (RC == &WebAssembly::F32RegClass) - return WebAssembly::COPY_F32; - if (RC == &WebAssembly::F64RegClass) - return WebAssembly::COPY_F64; - if (RC == &WebAssembly::V128RegClass) - return WebAssembly::COPY_V128; - if (RC == &WebAssembly::FUNCREFRegClass) - return WebAssembly::COPY_FUNCREF; - if (RC == &WebAssembly::EXTERNREFRegClass) - return WebAssembly::COPY_EXTERNREF; - llvm_unreachable("Unexpected register class"); -} - // When MBB is split into MBB and Split, we should unstackify defs in MBB that // have their uses in Split. static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, @@ -851,7 +832,8 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, if (!MFI.isVRegStackified(TeeReg)) { // Now we are not using TEE anymore, so unstackify DefReg too MFI.unstackifyVReg(DefReg); - unsigned CopyOpc = getCopyOpcode(MRI.getRegClass(DefReg)); + unsigned CopyOpc = + WebAssembly::getCopyOpcodeForRegClass(MRI.getRegClass(DefReg)); BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), TeeReg) .addReg(DefReg); BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), Reg).addReg(DefReg); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 5484c0db7775..9316826e3d92 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -66,23 +66,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, ? 
MRI.getRegClass(DestReg) : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg); - unsigned CopyOpcode; - if (RC == &WebAssembly::I32RegClass) - CopyOpcode = WebAssembly::COPY_I32; - else if (RC == &WebAssembly::I64RegClass) - CopyOpcode = WebAssembly::COPY_I64; - else if (RC == &WebAssembly::F32RegClass) - CopyOpcode = WebAssembly::COPY_F32; - else if (RC == &WebAssembly::F64RegClass) - CopyOpcode = WebAssembly::COPY_F64; - else if (RC == &WebAssembly::V128RegClass) - CopyOpcode = WebAssembly::COPY_V128; - else if (RC == &WebAssembly::FUNCREFRegClass) - CopyOpcode = WebAssembly::COPY_FUNCREF; - else if (RC == &WebAssembly::EXTERNREFRegClass) - CopyOpcode = WebAssembly::COPY_EXTERNREF; - else - llvm_unreachable("Unexpected register class"); + unsigned CopyOpcode = WebAssembly::getCopyOpcodeForRegClass(RC); BuildMI(MBB, I, DL, get(CopyOpcode), DestReg) .addReg(SrcReg, KillSrc ? RegState::Kill : 0); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 2db4bd822349..7a1a769c6b16 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -553,7 +553,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs(); SizeArg += 1; if (NEltArg) - NEltArg = NEltArg.getValue() + 1; + NEltArg = NEltArg.value() + 1; FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); } // In case the callee has 'noreturn' attribute, We need to remove it, because diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 2e6027a5605c..e8b3542df12f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -154,25 +154,6 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( return 
MCOperand::createExpr(Expr); } -// Return the WebAssembly type associated with the given register class. -static wasm::ValType getType(const TargetRegisterClass *RC) { - if (RC == &WebAssembly::I32RegClass) - return wasm::ValType::I32; - if (RC == &WebAssembly::I64RegClass) - return wasm::ValType::I64; - if (RC == &WebAssembly::F32RegClass) - return wasm::ValType::F32; - if (RC == &WebAssembly::F64RegClass) - return wasm::ValType::F64; - if (RC == &WebAssembly::V128RegClass) - return wasm::ValType::V128; - if (RC == &WebAssembly::EXTERNREFRegClass) - return wasm::ValType::EXTERNREF; - if (RC == &WebAssembly::FUNCREFRegClass) - return wasm::ValType::FUNCREF; - llvm_unreachable("Unexpected register class"); -} - static void getFunctionReturns(const MachineInstr *MI, SmallVectorImpl<wasm::ValType> &Returns) { const Function &F = MI->getMF()->getFunction(); @@ -221,10 +202,12 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); for (const MachineOperand &MO : MI->defs()) - Returns.push_back(getType(MRI.getRegClass(MO.getReg()))); + Returns.push_back( + WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg()))); for (const MachineOperand &MO : MI->explicit_uses()) if (MO.isReg()) - Params.push_back(getType(MRI.getRegClass(MO.getReg()))); + Params.push_back( + WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg()))); // call_indirect instructions have a callee operand at the end which // doesn't count as a param. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index ba1c4b7233f2..5fcee7af9bde 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" @@ -95,31 +96,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, if (!MFI.isVRegStackified(Reg)) { unsigned CopyLocalOpc; const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); - switch (RegClass->getID()) { - case WebAssembly::I32RegClassID: - CopyLocalOpc = WebAssembly::COPY_I32; - break; - case WebAssembly::I64RegClassID: - CopyLocalOpc = WebAssembly::COPY_I64; - break; - case WebAssembly::F32RegClassID: - CopyLocalOpc = WebAssembly::COPY_F32; - break; - case WebAssembly::F64RegClassID: - CopyLocalOpc = WebAssembly::COPY_F64; - break; - case WebAssembly::V128RegClassID: - CopyLocalOpc = WebAssembly::COPY_V128; - break; - case WebAssembly::FUNCREFRegClassID: - CopyLocalOpc = WebAssembly::COPY_FUNCREF; - break; - case WebAssembly::EXTERNREFRegClassID: - CopyLocalOpc = WebAssembly::COPY_EXTERNREF; - break; - default: - llvm_unreachable("Unexpected register class for return operand"); - } + CopyLocalOpc = WebAssembly::getCopyOpcodeForRegClass(RegClass); Register NewReg = MRI.createVirtualRegister(RegClass); BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg) .addReg(Reg); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 388c0f9110b7..0b3e534315d5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ 
b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -21,7 +21,6 @@ #include "WebAssemblyRuntimeLibcallSignatures.h" #include "WebAssemblySubtarget.h" #include "llvm/CodeGen/RuntimeLibcalls.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -482,10 +481,13 @@ struct RuntimeLibcallSignatureTable { } }; -ManagedStatic<RuntimeLibcallSignatureTable> RuntimeLibcallSignatures; +RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() { + static RuntimeLibcallSignatureTable RuntimeLibcallSignatures; + return RuntimeLibcallSignatures; +} // Maps libcall names to their RTLIB::Libcall number. Builds the map in a -// constructor for use with ManagedStatic +// constructor for use with a static variable struct StaticLibcallNameMap { StringMap<RTLIB::Libcall> Map; StaticLibcallNameMap() { @@ -496,7 +498,8 @@ struct StaticLibcallNameMap { }; for (const auto &NameLibcall : NameLibcalls) { if (NameLibcall.first != nullptr && - RuntimeLibcallSignatures->Table[NameLibcall.second] != unsupported) { + getRuntimeLibcallSignatures().Table[NameLibcall.second] != + unsupported) { assert(Map.find(NameLibcall.first) == Map.end() && "duplicate libcall names in name map"); Map[NameLibcall.first] = NameLibcall.second; @@ -523,7 +526,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, wasm::ValType PtrTy = Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32; - auto &Table = RuntimeLibcallSignatures->Table; + auto &Table = getRuntimeLibcallSignatures().Table; switch (Table[LC]) { case func: break; @@ -885,14 +888,14 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, } } -static ManagedStatic<StaticLibcallNameMap> LibcallNameMap; // TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unsed // other than here, just roll its logic into this version. 
void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, StringRef Name, SmallVectorImpl<wasm::ValType> &Rets, SmallVectorImpl<wasm::ValType> &Params) { - auto &Map = LibcallNameMap->Map; + static StaticLibcallNameMap LibcallNameMap; + auto &Map = LibcallNameMap.Map; auto Val = Map.find(Name); #ifndef NDEBUG if (Val == Map.end()) { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index a903c5f455a2..da90befb2320 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -622,7 +622,7 @@ static bool printFMAComments(const MCInst *MI, raw_ostream &OS, OS << '-'; OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' ' - << AccName; + << AccName << '\n'; return true; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp index 901082ce6cf3..640efd468135 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp @@ -13,6 +13,7 @@ #include "X86InstrRelaxTables.h" #include "X86InstrInfo.h" #include "llvm/ADT/STLExtras.h" +#include <atomic> using namespace llvm; @@ -119,7 +120,7 @@ const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) { namespace { // This class stores the short form tables. It is instantiated as a -// ManagedStatic to lazily init the short form table. +// function scope static variable to lazily init the short form table. struct X86ShortFormTable { // Stores relaxation table entries sorted by relaxed form opcode. 
SmallVector<X86InstrRelaxTableEntry, 0> Table; @@ -137,10 +138,9 @@ struct X86ShortFormTable { }; } // namespace -static ManagedStatic<X86ShortFormTable> ShortTable; - const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) { - auto &Table = ShortTable->Table; + static X86ShortFormTable ShortTable; + auto &Table = ShortTable.Table; auto I = llvm::lower_bound(Table, RelaxOp); if (I != Table.end() && I->KeyOp == RelaxOp) return &*I; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 49660883ad83..4c962de16530 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -37,6 +37,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC #define GET_INSTRINFO_MC_HELPERS +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "X86GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 7344900f2e31..0ac916527495 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -132,6 +132,9 @@ FunctionPass *createX86EvexToVexInsts(); /// This pass creates the thunks for the retpoline feature. FunctionPass *createX86IndirectThunksPass(); +/// This pass replaces ret instructions with jmp's to __x86_return thunk. 
+FunctionPass *createX86ReturnThunksPass(); + /// This pass ensures instructions featuring a memory operand /// have distinctive <LineNumber, Discriminator> (with respect to eachother) FunctionPass *createX86DiscriminateMemOpsPass(); @@ -185,6 +188,7 @@ void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86PreAMXConfigPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); +void initializeX86ReturnThunksPass(PassRegistry &); namespace X86AS { enum : unsigned { diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a5c6b40c493c..a859176220c7 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -266,6 +266,8 @@ def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true", "Write Back No Invalidate">; def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", "Support RDPID instructions">; +def FeatureRDPRU : SubtargetFeature<"rdpru", "HasRDPRU", "true", + "Support RDPRU instructions">; def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", @@ -1238,6 +1240,7 @@ def ProcessorFeatures { TuningInsertVZEROUPPER]; list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, + FeatureRDPRU, FeatureWBNOINVD]; list<SubtargetFeature> ZN2Tuning = ZNTuning; list<SubtargetFeature> ZN2Features = diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp index c7a013a0b17a..cff95d17c14c 100644 --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" +#include <atomic> #include <cassert> #include <cstdint> diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 
61c1fd25031d..12af6087cb47 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -594,7 +594,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Half type will be promoted by default. setOperationAction(ISD::FABS, MVT::f16, Promote); setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); @@ -629,6 +629,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + + setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FNEARBYINT, 
MVT::f16, Promote); + setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); @@ -2817,6 +2845,21 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { AddressSpace = X86AS::FS; else if (GuardReg == "gs") AddressSpace = X86AS::GS; + + // Use symbol guard if user specify. + StringRef GuardSymb = M->getStackProtectorGuardSymbol(); + if (!GuardSymb.empty()) { + GlobalVariable *GV = M->getGlobalVariable(GuardSymb); + if (!GV) { + Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext()) + : Type::getInt32Ty(M->getContext()); + GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, + nullptr, GuardSymb, nullptr, + GlobalValue::NotThreadLocal, AddressSpace); + } + return GV; + } + return SegmentOffset(IRB, Offset, AddressSpace); } } @@ -11757,15 +11800,17 @@ static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in -/// both. +/// both, or via a known bits test. 
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, + const SelectionDAG &DAG, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; - assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && + assert(llvm::all_of(ExpectedMask, + [Size](int M) { return isInRange(M, 0, 2 * Size); }) && "Illegal target shuffle mask"); // Check for out-of-range target shuffle mask indices. @@ -11778,12 +11823,28 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits()) V2 = SDValue(); + APInt ZeroV1 = APInt::getNullValue(Size); + APInt ZeroV2 = APInt::getNullValue(Size); + for (int i = 0; i < Size; ++i) { int MaskIdx = Mask[i]; int ExpectedIdx = ExpectedMask[i]; if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx) continue; - if (0 <= MaskIdx && 0 <= ExpectedIdx) { + if (MaskIdx == SM_SentinelZero) { + // If we need this expected index to be a zero element, then update the + // relevant zero mask and perform the known bits at the end to minimize + // repeated computes. + SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; + if (ExpectedV && + Size == (int)ExpectedV.getValueType().getVectorNumElements()) { + int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); + APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2; + ZeroMask.setBit(BitIdx); + continue; + } + } + if (MaskIdx >= 0) { SDValue MaskV = MaskIdx < Size ? V1 : V2; SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); @@ -11791,15 +11852,16 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) continue; } - // TODO - handle SM_Sentinel equivalences. 
return false; } - return true; + return (ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1)) && + (ZeroV2.isNullValue() || DAG.MaskedVectorIsZero(V2, ZeroV2)); } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. -static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { +static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT, + const SelectionDAG &DAG) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; @@ -11809,12 +11871,13 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { SmallVector<int, 8> Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); - bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) || - isTargetShuffleEquivalent(VT, Mask, Unpckhwd)); + bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) || + isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG)); return IsUnpackwdMask; } -static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) { +static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask, + const SelectionDAG &DAG) { // Create 128-bit vector type based on mask size. MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); MVT VT = MVT::getVectorVT(EltVT, Mask.size()); @@ -11827,8 +11890,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) { for (unsigned i = 0; i != 4; ++i) { SmallVector<int, 16> UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); - if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) || - isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask)) + if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) || + isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG)) return true; } return false; @@ -12021,7 +12084,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // Attempt to match the target mask against the unpack lo/hi mask patterns. 
SmallVector<int, 64> Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1, + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); @@ -12030,7 +12093,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1, + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); @@ -12069,14 +12132,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; @@ -12464,14 +12527,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, // Try binary shuffle. SmallVector<int, 32> BinaryMask; createPackShuffleMask(VT, BinaryMask, false, NumStages); - if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2)) + if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2)) if (MatchPACK(V1, V2, PackVT)) return true; // Try unary shuffle. 
SmallVector<int, 32> UnaryMask; createPackShuffleMask(VT, UnaryMask, true, NumStages); - if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1)) + if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1)) if (MatchPACK(V1, V1, PackVT)) return true; } @@ -14283,7 +14346,7 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps // because that avoids a constant load from memory. if (NumElts == 4 && - (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) + (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG))) return SDValue(); // Extend the shuffle mask with undef elements. @@ -17230,7 +17293,7 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, if (Subtarget.hasAVX2()) { // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && - !is128BitUnpackShuffleMask(HalfMask) && + !is128BitUnpackShuffleMask(HalfMask, DAG) && (!isSingleSHUFPSMask(HalfMask) || Subtarget.hasFastVariableCrossLaneShuffle())) return SDValue(); @@ -17892,7 +17955,7 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. - if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) + if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG)) return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); @@ -17930,7 +17993,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. 
- if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && + if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() && !Subtarget.hasAVX512()) return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); @@ -27887,11 +27950,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, } // Read Performance Monitoring Counters. case RDPMC: + // Read Processor Register. + case RDPRU: // GetExtended Control Register. case XGETBV: { SmallVector<SDValue, 2> Results; // RDPMC uses ECX to select the index of the performance counter to read. + // RDPRU uses ECX to select the processor register to read. // XGETBV uses ECX to select the index of the XCR register to return. // The result is stored into registers EDX:EAX. expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, @@ -29902,14 +29968,12 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt); SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {4, 5, 6, 7, -1, -1, -1, -1}); - Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, - {0, 1, 1, 1, -1, -1, -1, -1}); - Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, - {2, 3, 3, 3, -1, -1, -1, -1}); - Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, - {0, 1, 1, 1, -1, -1, -1, -1}); - Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, - {2, 3, 3, 3, -1, -1, -1, -1}); + SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG); + SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG); + Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02); + Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13); + Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02); + Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13); } } @@ -30797,6 +30861,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case 
AtomicRMWInst::UMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; @@ -32894,6 +32960,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, Results); return; + case Intrinsic::x86_rdpru: + expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget, + Results); + return; case Intrinsic::x86_xgetbv: expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, Results); @@ -36985,8 +37055,9 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, bool AllowFloatDomain, bool AllowIntDomain, - SDValue V1, const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { + SDValue V1, const SelectionDAG &DAG, + const X86Subtarget &Subtarget, unsigned &Shuffle, + MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); @@ -37057,17 +37128,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. 
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; @@ -37076,17 +37147,19 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, + V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG, + V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; @@ -37096,21 +37169,22 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, + V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = 
DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( MaskVT, Mask, - {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) { + {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( MaskVT, Mask, - {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) { + {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; @@ -37126,6 +37200,7 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, + const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { @@ -37269,33 +37344,36 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) && + AllowFloatDomain) { V2 = V1; V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1); Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) && + AllowFloatDomain) { V2 = V1; Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS; SrcVT = DstVT = Subtarget.hasSSE2() ? 
MVT::v2f64 : MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) && + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MVT::v2f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) && + if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) && + if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}, + DAG) && Subtarget.hasFP16()) { Shuffle = X86ISD::MOVSH; SrcVT = DstVT = MVT::v8f16; @@ -37678,7 +37756,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, scaleShuffleElements(Mask, NumElts, ScaledMask)) { for (unsigned i = 0; i != NumElts; ++i) IdentityMask.push_back(i); - if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2)) + if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1, + V2)) return CanonicalizeShuffleInput(RootVT, V1); } } @@ -37902,7 +37981,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, - Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && + DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) @@ -37913,7 +37992,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, - AllowIntDomain, Subtarget, Shuffle, ShuffleVT, + AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { @@ 
-37931,7 +38010,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // TODO: Handle other insertions here as well? if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && Subtarget.hasSSE41() && - !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) { + !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) { if (MaskEltSizeInBits == 32) { SDValue SrcV1 = V1, SrcV2 = V2; if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, @@ -37947,12 +38026,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } if (MaskEltSizeInBits == 64 && - isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) && + isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) && V2.getOpcode() == ISD::SCALAR_TO_VECTOR && V2.getScalarValueSizeInBits() <= 32) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) return SDValue(); // Nothing to do! - PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0); + PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0); Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, CanonicalizeShuffleInput(MVT::v4f32, V1), CanonicalizeShuffleInput(MVT::v4f32, V2), @@ -51654,9 +51733,13 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. 
- if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX()) || - (OpSize == 512 && Subtarget.useAVX512Regs())) { + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX()) || + (OpSize == 512 && Subtarget.useAVX512Regs()))) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index a55b95960aa6..6124755ca539 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1532,44 +1532,6 @@ def : Pat<(xor GR32:$src1, -2147483648), } //===----------------------------------------------------------------------===// -// Pattern match SUB as XOR -//===----------------------------------------------------------------------===// - -// An immediate in the LHS of a subtract can't be encoded in the instruction. -// If there is no possibility of a borrow we can use an XOR instead of a SUB -// to enable the immediate to be folded. -// TODO: Move this to a DAG combine? - -def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{ - if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - KnownBits Known = CurDAG->computeKnownBits(N->getOperand(1)); - - // If all possible ones in the RHS are set in the LHS then there can't be - // a borrow and we can use xor. 
- return (~Known.Zero).isSubsetOf(CN->getAPIntValue()); - } - - return false; -}]>; - -let AddedComplexity = 5 in { -def : Pat<(sub_is_xor imm:$src2, GR8:$src1), - (XOR8ri GR8:$src1, imm:$src2)>; -def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1), - (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(sub_is_xor imm:$src2, GR16:$src1), - (XOR16ri GR16:$src1, imm:$src2)>; -def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1), - (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; -def : Pat<(sub_is_xor imm:$src2, GR32:$src1), - (XOR32ri GR32:$src1, imm:$src2)>; -def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1), - (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1), - (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; -} - -//===----------------------------------------------------------------------===// // Some peepholes //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index 52b2a62316cd..c4317be664fd 100644 --- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -13,8 +13,8 @@ #include "X86InstrFMA3Info.h" #include "X86InstrInfo.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Threading.h" +#include <atomic> #include <cassert> #include <cstdint> diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 27220a8d4d99..8aeb169929f2 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -13,6 +13,7 @@ #include "X86InstrFoldTables.h" #include "X86InstrInfo.h" #include "llvm/ADT/STLExtras.h" +#include <atomic> #include <vector> using namespace llvm; @@ -6102,7 +6103,7 @@ llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) { namespace { // This class stores the memory unfolding tables. 
It is instantiated as a -// ManagedStatic to lazily init the unfolding table. +// function scope static variable to lazily init the unfolding table. struct X86MemUnfoldTable { // Stores memory unfolding tables entries sorted by opcode. std::vector<X86MemoryFoldTableEntry> Table; @@ -6159,11 +6160,10 @@ struct X86MemUnfoldTable { }; } -static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable; - const X86MemoryFoldTableEntry * llvm::lookupUnfoldTable(unsigned MemOp) { - auto &Table = MemUnfoldTable->Table; + static X86MemUnfoldTable MemUnfoldTable; + auto &Table = MemUnfoldTable.Table; auto I = llvm::lower_bound(Table, MemOp); if (I != Table.end() && I->KeyOp == MemOp) return &*I; diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 7f6ef3479d40..4a9a281d5b99 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -978,6 +978,7 @@ def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCLWB : Predicate<"Subtarget->hasCLWB()">; def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; def HasRDPID : Predicate<"Subtarget->hasRDPID()">; +def HasRDPRU : Predicate<"Subtarget->hasRDPRU()">; def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; def HasCX8 : Predicate<"Subtarget->hasCX8()">; diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index 3a653a56e534..b1ca87279007 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -735,6 +735,15 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), } // SchedRW //===----------------------------------------------------------------------===// +// RDPRU - Read Processor Register instruction. 
+ +let SchedRW = [WriteSystem] in { +let Uses = [ECX], Defs = [EAX, EDX] in + def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, PS, + Requires<[HasRDPRU]>; +} + +//===----------------------------------------------------------------------===// // Platform Configuration instruction // From ISA docs: diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 3c8be95b43e3..6112c0b7d6c3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -37,7 +37,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2, - ROUNDP, ROUNDS + ROUNDP, ROUNDS, RDPRU }; struct IntrinsicData { @@ -309,6 +309,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), + X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index b107de692365..3fbdb18a0793 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2413,6 +2413,10 @@ static void addConstantComments(const MachineInstr *MI, } void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the test pass. 
+ // X86_MC::verifyInstructionPredicates(MI->getOpcode(), + // Subtarget->getFeatureBits()); + X86MCInstLower MCInstLowering(*MF, *this); const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 7761f7323358..c760a32e2579 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -439,8 +439,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { while (!Worklist.empty()) { Value *V = Worklist.pop_back_val(); - if (!Visited.insert(V).second) - continue; + if (!Visited.insert(V).second) + continue; if (auto *PN = dyn_cast<PHINode>(V)) { // PHI node should have single use unless it is the root node, then it @@ -466,7 +466,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { // gets us back to this node. if (BO->hasNUses(BO == Root ? 3 : 2)) { PHINode *PN = nullptr; - for (auto *U : Root->users()) + for (auto *U : BO->users()) if (auto *P = dyn_cast<PHINode>(U)) if (!Visited.count(P)) PN = P; diff --git a/llvm/lib/Target/X86/X86ReturnThunks.cpp b/llvm/lib/Target/X86/X86ReturnThunks.cpp new file mode 100644 index 000000000000..4b203229ba83 --- /dev/null +++ b/llvm/lib/Target/X86/X86ReturnThunks.cpp @@ -0,0 +1,92 @@ +//==- X86ReturnThunks.cpp - Replace rets with thunks or inline thunks --=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that replaces ret instructions with a jmp to __x86_return_thunk. +/// +/// This corresponds to -mfunction-return=thunk-extern or +/// __attribute__((function_return("thunk-extern"). 
+/// +/// This pass is a minimal implementation necessary to help mitigate +/// RetBleed for the Linux kernel. +/// +/// Should support for thunk or thunk-inline be necessary in the future, then +/// this pass should be combined with x86-retpoline-thunks which already has +/// machinery to emit thunks. Until then, YAGNI. +/// +/// This pass is very similar to x86-lvi-ret. +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define PASS_KEY "x86-return-thunks" +#define DEBUG_TYPE PASS_KEY + +struct X86ReturnThunks final : public MachineFunctionPass { + static char ID; + X86ReturnThunks() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return "X86 Return Thunks"; } + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +char X86ReturnThunks::ID = 0; + +bool X86ReturnThunks::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << getPassName() << "\n"); + + bool Modified = false; + + if (!MF.getFunction().hasFnAttribute(llvm::Attribute::FnRetThunkExtern)) + return Modified; + + StringRef ThunkName = "__x86_return_thunk"; + if (MF.getFunction().getName() == ThunkName) + return Modified; + + const auto &ST = MF.getSubtarget<X86Subtarget>(); + const bool Is64Bit = ST.getTargetTriple().getArch() == Triple::x86_64; + const unsigned RetOpc = Is64Bit ? 
X86::RET64 : X86::RET32; + SmallVector<MachineInstr *, 16> Rets; + + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &Term : MBB.terminators()) + if (Term.getOpcode() == RetOpc) + Rets.push_back(&Term); + + const MCInstrDesc &JMP = ST.getInstrInfo()->get(X86::TAILJMPd); + + for (MachineInstr *Ret : Rets) { + BuildMI(Ret->getParent(), Ret->getDebugLoc(), JMP) + .addExternalSymbol(ThunkName.data()); + Ret->eraseFromParent(); + Modified = true; + } + + return Modified; +} + +INITIALIZE_PASS(X86ReturnThunks, PASS_KEY, "X86 Return Thunks", false, false) + +FunctionPass *llvm::createX86ReturnThunksPass() { + return new X86ReturnThunks(); +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 4249788e3540..f4e25e4194db 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -100,6 +100,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86OptimizeLEAPassPass(PR); initializeX86PartialReductionPass(PR); initializePseudoProbeInserterPass(PR); + initializeX86ReturnThunksPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -575,6 +576,7 @@ void X86PassConfig::addPreEmitPass2() { // hand inspection of the codegen output. addPass(createX86SpeculativeExecutionSideEffectSuppression()); addPass(createX86IndirectThunksPass()); + addPass(createX86ReturnThunksPass()); // Insert extra int3 instructions after trailing call instructions to avoid // issues in the unwinder. 
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index c286b747a271..a782ff436dc0 100644 --- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -29,6 +29,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "XCoreGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h index 096b22415a22..ec4418333859 100644 --- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h +++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h @@ -22,6 +22,7 @@ // Defines symbolic names for the XCore instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "XCoreGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 8fea61d125d2..691fdf16bc0f 100644 --- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -256,6 +256,9 @@ bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void XCoreAsmPrinter::emitInstruction(const MachineInstr *MI) { + XCore_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + SmallString<128> Str; raw_svector_ostream O(Str); diff --git a/llvm/lib/ToolDrivers/llvm-lib/Options.td b/llvm/lib/ToolDrivers/llvm-lib/Options.td index 0d97f77e525f..9d969b040ef2 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/Options.td +++ b/llvm/lib/ToolDrivers/llvm-lib/Options.td @@ -44,5 +44,7 @@ def help_q : Flag<["/??", "-??", "/?", "-?"], "">, Alias<help>; //============================================================================== def ltcg : F<"ltcg">; +def nodefaultlib: P<"nodefaultlib", "">; +def nodefaultlib_all: 
F<"nodefaultlib">; def nologo : F<"nologo">; def subsystem : P<"subsystem", "">; diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index d09607bb1c4c..51eb8ebf0369 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -881,16 +881,16 @@ static DIType *solveDIType(DIBuilder &Builder, Type *Ty, dwarf::DW_ATE_float, llvm::DINode::FlagArtificial); } else if (Ty->isPointerTy()) { - // Construct BasicType instead of PointerType to avoid infinite - // search problem. - // For example, we would be in trouble if we traverse recursively: + // Construct PointerType points to null (aka void *) instead of exploring + // pointee type to avoid infinite search problem. For example, we would be + // in trouble if we traverse recursively: // // struct Node { // Node* ptr; // }; - RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty), - dwarf::DW_ATE_address, - llvm::DINode::FlagArtificial); + RetType = Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty), + Layout.getABITypeAlignment(Ty), + /*DWARFAddressSpace=*/None, Name); } else if (Ty->isStructTy()) { auto *DIStruct = Builder.createStructType( Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty), @@ -914,13 +914,21 @@ static DIType *solveDIType(DIBuilder &Builder, Type *Ty, RetType = DIStruct; } else { - LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n";); - SmallString<32> Buffer; - raw_svector_ostream OS(Buffer); - OS << Name.str() << "_" << Layout.getTypeSizeInBits(Ty); - RetType = Builder.createBasicType(OS.str(), Layout.getTypeSizeInBits(Ty), - dwarf::DW_ATE_address, - llvm::DINode::FlagArtificial); + LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n"); + TypeSize Size = Layout.getTypeSizeInBits(Ty); + auto *CharSizeType = Builder.createBasicType( + Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial); + + if (Size <= 8) + RetType = CharSizeType; 
+ else { + if (Size % 8 != 0) + Size = TypeSize::Fixed(Size + 8 - (Size % 8)); + + RetType = Builder.createArrayType( + Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType, + Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8))); + } } DITypeCache.insert({Ty, RetType}); @@ -971,7 +979,8 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, unsigned LineNum = PromiseDIVariable->getLine(); DICompositeType *FrameDITy = DBuilder.createStructType( - DIS, "__coro_frame_ty", DFile, LineNum, Shape.FrameSize * 8, + DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(), + DFile, LineNum, Shape.FrameSize * 8, Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray()); StructType *FrameTy = Shape.FrameTy; @@ -995,14 +1004,12 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, *IndexTy = FrameTy->getElementType(IndexIndex); DenseMap<unsigned, DIType *> TyCache; - TyCache.insert({ResumeIndex, - DBuilder.createBasicType("__resume_fn", - Layout.getTypeSizeInBits(ResumeFnTy), - dwarf::DW_ATE_address)}); TyCache.insert( - {DestroyIndex, DBuilder.createBasicType( - "__destroy_fn", Layout.getTypeSizeInBits(DestroyFnTy), - dwarf::DW_ATE_address)}); + {ResumeIndex, DBuilder.createPointerType( + nullptr, Layout.getTypeSizeInBits(ResumeFnTy))}); + TyCache.insert( + {DestroyIndex, DBuilder.createPointerType( + nullptr, Layout.getTypeSizeInBits(DestroyFnTy))}); /// FIXME: If we fill the field `SizeInBits` with the actual size of /// __coro_index in bits, then __coro_index wouldn't show in the debugger. 
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ead552d9be4e..9c1b247cdb39 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -389,7 +389,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { // Replace CoroSave with a store to Index: // %index.addr = getelementptr %f.frame... (index field number) - // store i32 0, i32* %index.addr1 + // store i32 %IndexVal, i32* %index.addr1 auto *Save = S->getCoroSave(); Builder.SetInsertPoint(Save); if (S->isFinal()) { diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index b05b7990e3f0..e5ff98e4f73f 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -718,8 +718,8 @@ Argument *IRPosition::getAssociatedArgument() const { } // If we found a unique callback candidate argument, return it. - if (CBCandidateArg && CBCandidateArg.getValue()) - return CBCandidateArg.getValue(); + if (CBCandidateArg && CBCandidateArg.value()) + return CBCandidateArg.value(); // If no callbacks were found, or none used the underlying call site operand // exclusively, use the direct callee argument if available. 
@@ -1048,11 +1048,11 @@ Attributor::getAssumedConstant(const IRPosition &IRP, recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL); return llvm::None; } - if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) { + if (isa_and_nonnull<UndefValue>(SimplifiedV.value())) { recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL); return UndefValue::get(IRP.getAssociatedType()); } - Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue()); + Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.value()); if (CI) CI = dyn_cast_or_null<Constant>( AA::getWithType(*CI, *IRP.getAssociatedType())); @@ -2697,8 +2697,8 @@ void InformationCache::initializeInformationCache(const Function &CF, Optional<short> &NumUses = AssumeUsesMap[I]; if (!NumUses) NumUses = I->getNumUses(); - NumUses = NumUses.getValue() - /* this assume */ 1; - if (NumUses.getValue() != 0) + NumUses = NumUses.value() - /* this assume */ 1; + if (NumUses.value() != 0) continue; AssumeOnlyValues.insert(I); for (const Value *Op : I->operands()) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 4d99ce7e3175..1ff54b78e27e 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -437,7 +437,7 @@ static bool genericValueTraversal( A.getAssumedSimplified(*V, QueryingAA, UsedAssumedInformation); if (!SimpleV) continue; - Value *NewV = SimpleV.getValue(); + Value *NewV = SimpleV.value(); if (NewV && NewV != V) { if ((VS & AA::Interprocedural) || !CtxI || AA::isValidInScope(*NewV, CtxI->getFunction())) { @@ -1891,14 +1891,14 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Check if we have an assumed unique return value that we could manifest. Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A); - if (!UniqueRV || !UniqueRV.getValue()) + if (!UniqueRV || !UniqueRV.value()) return Changed; // Bookkeeping. 
STATS_DECLTRACK(UniqueReturnValue, FunctionReturn, "Number of function with unique return"); // If the assumed unique return value is an argument, annotate it. - if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) { + if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.value())) { if (UniqueRVArg->getType()->canLosslesslyBitCastTo( getAssociatedFunction()->getReturnType())) { getIRPosition() = IRPosition::argument(*UniqueRVArg); @@ -2666,9 +2666,9 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { // Either we stopped and the appropriate action was taken, // or we got back a simplified value to continue. Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I); - if (!SimplifiedPtrOp || !SimplifiedPtrOp.getValue()) + if (!SimplifiedPtrOp || !SimplifiedPtrOp.value()) return true; - const Value *PtrOpVal = SimplifiedPtrOp.getValue(); + const Value *PtrOpVal = SimplifiedPtrOp.value(); // A memory access through a pointer is considered UB // only if the pointer has constant null value. 
@@ -2757,14 +2757,14 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { IRPosition::value(*ArgVal), *this, UsedAssumedInformation); if (UsedAssumedInformation) continue; - if (SimplifiedVal && !SimplifiedVal.getValue()) + if (SimplifiedVal && !SimplifiedVal.value()) return true; - if (!SimplifiedVal || isa<UndefValue>(*SimplifiedVal.getValue())) { + if (!SimplifiedVal || isa<UndefValue>(*SimplifiedVal.value())) { KnownUBInsts.insert(&I); continue; } if (!ArgVal->getType()->isPointerTy() || - !isa<ConstantPointerNull>(*SimplifiedVal.getValue())) + !isa<ConstantPointerNull>(*SimplifiedVal.value())) continue; auto &NonNullAA = A.getAAFor<AANonNull>(*this, CalleeArgumentIRP, DepClassTy::NONE); @@ -4101,11 +4101,11 @@ identifyAliveSuccessors(Attributor &A, const SwitchInst &SI, bool UsedAssumedInformation = false; Optional<Constant *> C = A.getAssumedConstant(*SI.getCondition(), AA, UsedAssumedInformation); - if (!C || isa_and_nonnull<UndefValue>(C.getValue())) { + if (!C || isa_and_nonnull<UndefValue>(C.value())) { // No value yet, assume all edges are dead. 
- } else if (isa_and_nonnull<ConstantInt>(C.getValue())) { + } else if (isa_and_nonnull<ConstantInt>(C.value())) { for (auto &CaseIt : SI.cases()) { - if (CaseIt.getCaseValue() == C.getValue()) { + if (CaseIt.getCaseValue() == C.value()) { AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front()); return UsedAssumedInformation; } @@ -5523,11 +5523,10 @@ struct AAValueSimplifyImpl : AAValueSimplify { if (!SimpleV) return PoisonValue::get(&Ty); Value *EffectiveV = &V; - if (SimpleV.getValue()) - EffectiveV = SimpleV.getValue(); + if (SimpleV.value()) + EffectiveV = SimpleV.value(); if (auto *C = dyn_cast<Constant>(EffectiveV)) - if (!C->canTrap()) - return C; + return C; if (CtxI && AA::isValidAtPosition(AA::ValueAndContext(*EffectiveV, *CtxI), A.getInfoCache())) return ensureType(A, *EffectiveV, Ty, CtxI, Check); @@ -5541,7 +5540,7 @@ struct AAValueSimplifyImpl : AAValueSimplify { /// nullptr if we don't have one that makes sense. Value *manifestReplacementValue(Attributor &A, Instruction *CtxI) const { Value *NewV = SimplifiedAssociatedValue - ? SimplifiedAssociatedValue.getValue() + ? 
SimplifiedAssociatedValue.value() : UndefValue::get(getAssociatedType()); if (NewV && NewV != &getAssociatedValue()) { ValueToValueMapTy VMap; @@ -5672,7 +5671,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { A.getAssumedConstant(ACSArgPos, *this, UsedAssumedInformation); if (!SimpleArgOp) return true; - if (!SimpleArgOp.getValue()) + if (!SimpleArgOp.value()) return false; if (!AA::isDynamicallyUnique(A, *this, **SimpleArgOp)) return false; @@ -5787,7 +5786,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -5796,7 +5795,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -5868,8 +5867,8 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl { if (!SimplifiedOp) return true; - if (SimplifiedOp.getValue()) - NewOps[Idx] = SimplifiedOp.getValue(); + if (SimplifiedOp.value()) + NewOps[Idx] = SimplifiedOp.value(); else NewOps[Idx] = Op; @@ -6112,6 +6111,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack { /// but which is not in the deallocation infos. bool HasPotentiallyFreeingUnknownUses = false; + /// Flag to indicate that we should place the new alloca in the function + /// entry block rather than where the call site (CB) is. + bool MoveAllocaIntoEntry = true; + /// The set of free calls that use this allocation. 
SmallSetVector<CallBase *, 1> PotentialFreeCalls{}; }; @@ -6242,17 +6245,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Function *F = getAnchorScope(); const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); - LoopInfo *LI = - A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F); - Optional<bool> MayContainIrreducibleControl; - auto IsInLoop = [&](BasicBlock &BB) { - if (!MayContainIrreducibleControl.has_value()) - MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI); - if (MayContainIrreducibleControl.value()) - return true; - return LI->getLoopFor(&BB) != nullptr; - }; - for (auto &It : AllocationInfos) { AllocationInfo &AI = *It.second; if (AI.Status == AllocationInfo::INVALID) @@ -6294,25 +6286,25 @@ struct AAHeapToStackFunction final : public AAHeapToStack { Size = SizeOffsetPair.first; } - Instruction *IP = (!SizeAPI.has_value() || IsInLoop(*AI.CB->getParent())) - ? AI.CB - : &F->getEntryBlock().front(); + Instruction *IP = + AI.MoveAllocaIntoEntry ? &F->getEntryBlock().front() : AI.CB; Align Alignment(1); if (MaybeAlign RetAlign = AI.CB->getRetAlign()) Alignment = std::max(Alignment, *RetAlign); if (Value *Align = getAllocAlignment(AI.CB, TLI)) { Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align); - assert(AlignmentAPI && AlignmentAPI.getValue().getZExtValue() > 0 && + assert(AlignmentAPI && AlignmentAPI.value().getZExtValue() > 0 && "Expected an alignment during manifest!"); Alignment = std::max( - Alignment, assumeAligned(AlignmentAPI.getValue().getZExtValue())); + Alignment, assumeAligned(AlignmentAPI.value().getZExtValue())); } // TODO: Hoist the alloca towards the function entry. 
unsigned AS = DL.getAllocaAddrSpace(); - Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, - Size, Alignment, "", IP); + Instruction *Alloca = + new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment, + AI.CB->getName() + ".h2s", IP); if (Alloca->getType() != AI.CB->getType()) Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( @@ -6354,7 +6346,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack { A.getAssumedConstant(V, AA, UsedAssumedInformation); if (!SimpleV) return APInt(64, 0); - if (auto *CI = dyn_cast_or_null<ConstantInt>(SimpleV.getValue())) + if (auto *CI = dyn_cast_or_null<ConstantInt>(SimpleV.value())) return CI->getValue(); return llvm::None; } @@ -6400,6 +6392,21 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { bool StackIsAccessibleByOtherThreads = A.getInfoCache().stackIsAccessibleByOtherThreads(); + LoopInfo *LI = + A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(*F); + Optional<bool> MayContainIrreducibleControl; + auto IsInLoop = [&](BasicBlock &BB) { + if (&F->getEntryBlock() == &BB) + return false; + if (!MayContainIrreducibleControl.has_value()) + MayContainIrreducibleControl = mayContainIrreducibleControl(*F, LI); + if (MayContainIrreducibleControl.value()) + return true; + if (!LI) + return true; + return LI->getLoopFor(&BB) != nullptr; + }; + // Flag to ensure we update our deallocation information at most once per // updateImpl call and only if we use the free check reasoning. 
bool HasUpdatedFrees = false; @@ -6617,21 +6624,20 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { AI.Status = AllocationInfo::INVALID; Changed = ChangeStatus::CHANGED; continue; - } else { - if (APAlign->ugt(llvm::Value::MaximumAlignment) || - !APAlign->isPowerOf2()) { - LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign - << "\n"); - AI.Status = AllocationInfo::INVALID; - Changed = ChangeStatus::CHANGED; - continue; - } + } + if (APAlign->ugt(llvm::Value::MaximumAlignment) || + !APAlign->isPowerOf2()) { + LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign + << "\n"); + AI.Status = AllocationInfo::INVALID; + Changed = ChangeStatus::CHANGED; + continue; } } + Optional<APInt> Size = getSize(A, *this, AI); if (MaxHeapToStackSize != -1) { - Optional<APInt> Size = getSize(A, *this, AI); - if (!Size || Size.getValue().ugt(MaxHeapToStackSize)) { + if (!Size || Size.value().ugt(MaxHeapToStackSize)) { LLVM_DEBUG({ if (!Size) dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n"; @@ -6649,18 +6655,23 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { switch (AI.Status) { case AllocationInfo::STACK_DUE_TO_USE: if (UsesCheck(AI)) - continue; + break; AI.Status = AllocationInfo::STACK_DUE_TO_FREE; LLVM_FALLTHROUGH; case AllocationInfo::STACK_DUE_TO_FREE: if (FreeCheck(AI)) - continue; + break; AI.Status = AllocationInfo::INVALID; Changed = ChangeStatus::CHANGED; - continue; + break; case AllocationInfo::INVALID: llvm_unreachable("Invalid allocations should never reach this point!"); }; + + // Check if we still think we can move it into the entry block. 
+ if (AI.MoveAllocaIntoEntry && + (!Size.has_value() || IsInLoop(*AI.CB->getParent()))) + AI.MoveAllocaIntoEntry = false; } return Changed; @@ -6748,8 +6759,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: "; - if (CSTy && CSTy.getValue()) - CSTy.getValue()->print(dbgs()); + if (CSTy && CSTy.value()) + CSTy.value()->print(dbgs()); else if (CSTy) dbgs() << "<nullptr>"; else @@ -6760,8 +6771,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { LLVM_DEBUG({ dbgs() << " : New Type: "; - if (Ty && Ty.getValue()) - Ty.getValue()->print(dbgs()); + if (Ty && Ty.value()) + Ty.value()->print(dbgs()); else if (Ty) dbgs() << "<nullptr>"; else @@ -6769,7 +6780,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { dbgs() << "\n"; }); - return !Ty || Ty.getValue(); + return !Ty || Ty.value(); }; if (!A.checkForAllCallSites(CallSiteCheck, *this, true, @@ -6783,7 +6794,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { PrivatizableType = identifyPrivatizableType(A); if (!PrivatizableType) return ChangeStatus::UNCHANGED; - if (!PrivatizableType.getValue()) + if (!PrivatizableType.value()) return indicatePessimisticFixpoint(); // The dependence is optional so we don't give up once we give up on the @@ -6871,7 +6882,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType(); if (!CBArgPrivTy) continue; - if (CBArgPrivTy.getValue() == PrivatizableType) + if (CBArgPrivTy.value() == PrivatizableType) continue; } @@ -6918,7 +6929,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType(); if (!DCArgPrivTy) return true; - if (DCArgPrivTy.getValue() == PrivatizableType) + if (DCArgPrivTy.value() == PrivatizableType) return true; } } @@ -7060,7 +7071,7 @@ struct 
AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { ChangeStatus manifest(Attributor &A) override { if (!PrivatizableType) return ChangeStatus::UNCHANGED; - assert(PrivatizableType.getValue() && "Expected privatizable type!"); + assert(PrivatizableType.value() && "Expected privatizable type!"); // Collect all tail calls in the function as we cannot allow new allocas to // escape into tail recursion. @@ -7093,9 +7104,9 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Instruction *IP = &*EntryBB.getFirstInsertionPt(); const DataLayout &DL = IP->getModule()->getDataLayout(); unsigned AS = DL.getAllocaAddrSpace(); - Instruction *AI = new AllocaInst(PrivatizableType.getValue(), AS, + Instruction *AI = new AllocaInst(PrivatizableType.value(), AS, Arg->getName() + ".priv", IP); - createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn, + createInitialization(PrivatizableType.value(), *AI, ReplacementFn, ArgIt->getArgNo(), *IP); if (AI->getType() != Arg->getType()) @@ -7203,7 +7214,7 @@ struct AAPrivatizablePtrCallSiteArgument final PrivatizableType = identifyPrivatizableType(A); if (!PrivatizableType) return ChangeStatus::UNCHANGED; - if (!PrivatizableType.getValue()) + if (!PrivatizableType.value()) return indicatePessimisticFixpoint(); const IRPosition &IRP = getIRPosition(); @@ -8664,7 +8675,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -8673,7 +8684,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -8717,7 +8728,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedOpV) return true; - 
if (!SimplifiedOpV.getValue()) + if (!SimplifiedOpV.value()) return false; OpV = *SimplifiedOpV; @@ -8747,7 +8758,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return true; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return false; LHS = *SimplifiedLHS; @@ -8756,7 +8767,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return true; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return false; RHS = *SimplifiedRHS; @@ -8821,7 +8832,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { *this, UsedAssumedInformation); if (!SimplifiedOpV) return true; - if (!SimplifiedOpV.getValue()) + if (!SimplifiedOpV.value()) return false; Value *VPtr = *SimplifiedOpV; @@ -9182,7 +9193,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9191,7 +9202,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9265,7 +9276,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9274,7 +9285,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedRHS.getValue()) + if 
(!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9340,7 +9351,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedSrc) return ChangeStatus::UNCHANGED; - if (!SimplifiedSrc.getValue()) + if (!SimplifiedSrc.value()) return indicatePessimisticFixpoint(); Src = *SimplifiedSrc; @@ -9373,7 +9384,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedLHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedLHS.getValue()) + if (!SimplifiedLHS.value()) return indicatePessimisticFixpoint(); LHS = *SimplifiedLHS; @@ -9382,7 +9393,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { *this, UsedAssumedInformation); if (!SimplifiedRHS) return ChangeStatus::UNCHANGED; - if (!SimplifiedRHS.getValue()) + if (!SimplifiedRHS.value()) return indicatePessimisticFixpoint(); RHS = *SimplifiedRHS; @@ -9441,7 +9452,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl { UsedAssumedInformation); if (!SimplifiedIncomingValue) continue; - if (!SimplifiedIncomingValue.getValue()) + if (!SimplifiedIncomingValue.value()) return indicatePessimisticFixpoint(); IncomingValue = *SimplifiedIncomingValue; @@ -9930,7 +9941,7 @@ private: const Function &Fn) { Optional<bool> Cached = isCachedReachable(Fn); if (Cached) - return Cached.getValue(); + return Cached.value(); // The query was not cached, thus it is new. 
We need to request an update // explicitly to make sure this the information is properly run to a diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 1a1bde4f0668..1ad6e2b2a1d2 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1584,11 +1584,6 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } Value *StoredOnceValue = GS.getStoredOnceValue(); if (GS.StoredType == GlobalStatus::StoredOnce && StoredOnceValue) { - // Avoid speculating constant expressions that might trap (div/rem). - auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); - if (SOVConstant && SOVConstant->canTrap()) - return Changed; - Function &StoreFn = const_cast<Function &>(*GS.StoredOnceStore->getFunction()); bool CanHaveNonUndefGlobalInitializer = @@ -1601,6 +1596,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // This is restricted to address spaces that allow globals to have // initializers. NVPTX, for example, does not support initializers for // shared memory (AS 3). + auto *SOVConstant = dyn_cast<Constant>(StoredOnceValue); if (SOVConstant && isa<UndefValue>(GV->getInitializer()) && DL.getTypeAllocSize(SOVConstant->getType()) == DL.getTypeAllocSize(GV->getValueType()) && diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index d75d99e307fd..28bc43aa1633 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -555,7 +555,7 @@ collectRegionsConstants(OutlinableRegion &Region, for (Value *V : ID.OperVals) { Optional<unsigned> GVNOpt = C.getGVN(V); assert(GVNOpt && "Expected a GVN for operand?"); - unsigned GVN = GVNOpt.getValue(); + unsigned GVN = GVNOpt.value(); // Check if this global value has been found to not be the same already. 
if (NotSame.contains(GVN)) { @@ -570,7 +570,7 @@ collectRegionsConstants(OutlinableRegion &Region, // it is considered to not be the same value. Optional<bool> ConstantMatches = constantMatches(V, GVN, GVNToConstant); if (ConstantMatches) { - if (ConstantMatches.getValue()) + if (ConstantMatches.value()) continue; else ConstantsTheSame = false; @@ -651,7 +651,7 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group, // Transfer the swifterr attribute to the correct function parameter. if (Group.SwiftErrorArgument) - Group.OutlinedFunction->addParamAttr(Group.SwiftErrorArgument.getValue(), + Group.OutlinedFunction->addParamAttr(Group.SwiftErrorArgument.value(), Attribute::SwiftError); Group.OutlinedFunction->addFnAttr(Attribute::OptimizeForSize); @@ -809,7 +809,7 @@ static void mapInputsToGVNs(IRSimilarityCandidate &C, if (OutputMappings.find(Input) != OutputMappings.end()) Input = OutputMappings.find(Input)->second; assert(C.getGVN(Input) && "Could not find a numbering for the given input"); - EndInputNumbers.push_back(C.getGVN(Input).getValue()); + EndInputNumbers.push_back(C.getGVN(Input).value()); } } @@ -948,11 +948,11 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region, for (unsigned InputVal : InputGVNs) { Optional<unsigned> CanonicalNumberOpt = C.getCanonicalNum(InputVal); assert(CanonicalNumberOpt && "Canonical number not found?"); - unsigned CanonicalNumber = CanonicalNumberOpt.getValue(); + unsigned CanonicalNumber = CanonicalNumberOpt.value(); Optional<Value *> InputOpt = C.fromGVN(InputVal); assert(InputOpt && "Global value number not found?"); - Value *Input = InputOpt.getValue(); + Value *Input = InputOpt.value(); DenseMap<unsigned, unsigned>::iterator AggArgIt = Group.CanonicalNumberToAggArg.find(CanonicalNumber); @@ -1236,13 +1236,13 @@ static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region, Optional<unsigned> BBGVN = Cand.getGVN(PHIBB); assert(BBGVN && "Could not find GVN for the incoming block!"); - 
BBGVN = Cand.getCanonicalNum(BBGVN.getValue()); + BBGVN = Cand.getCanonicalNum(BBGVN.value()); assert(BBGVN && "Could not find canonical number for the incoming block!"); // Create a pair of the exit block canonical value, and the aggregate // argument location, connected to the canonical numbers stored in the // PHINode. PHINodeData TemporaryPair = - std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs); + std::make_pair(std::make_pair(BBGVN.value(), AggArgIdx), PHIGVNs); hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair); // Look for and create a new entry in our connection between canonical @@ -1516,8 +1516,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) { // Make sure that the argument in the new function has the SwiftError // argument. if (Group.SwiftErrorArgument) - Call->addParamAttr(Group.SwiftErrorArgument.getValue(), - Attribute::SwiftError); + Call->addParamAttr(Group.SwiftErrorArgument.value(), Attribute::SwiftError); return Call; } @@ -2082,9 +2081,9 @@ static void alignOutputBlockWithAggFunc( if (MatchingBB) { LLVM_DEBUG(dbgs() << "Set output block for region in function" << Region.ExtractedFunction << " to " - << MatchingBB.getValue()); + << MatchingBB.value()); - Region.OutputBlockNum = MatchingBB.getValue(); + Region.OutputBlockNum = MatchingBB.value(); for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) VtoBB.second->eraseFromParent(); return; @@ -2679,15 +2678,14 @@ void IROutliner::updateOutputMapping(OutlinableRegion &Region, if (!OutputIdx) return; - if (OutputMappings.find(Outputs[OutputIdx.getValue()]) == - OutputMappings.end()) { + if (OutputMappings.find(Outputs[OutputIdx.value()]) == OutputMappings.end()) { LLVM_DEBUG(dbgs() << "Mapping extracted output " << *LI << " to " - << *Outputs[OutputIdx.getValue()] << "\n"); - OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.getValue()])); + << *Outputs[OutputIdx.value()] << "\n"); + OutputMappings.insert(std::make_pair(LI, 
Outputs[OutputIdx.value()])); } else { - Value *Orig = OutputMappings.find(Outputs[OutputIdx.getValue()])->second; + Value *Orig = OutputMappings.find(Outputs[OutputIdx.value()])->second; LLVM_DEBUG(dbgs() << "Mapping extracted output " << *Orig << " to " - << *Outputs[OutputIdx.getValue()] << "\n"); + << *Outputs[OutputIdx.value()] << "\n"); OutputMappings.insert(std::make_pair(LI, Orig)); } } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 227ad8501f25..8e0ca8c6c997 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3340,6 +3340,9 @@ struct AAKernelInfoFunction : AAKernelInfo { } bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { + if (!mayContainParallelRegion()) + return false; + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); if (!SPMDCompatibilityTracker.isAssumed()) { @@ -4428,10 +4431,10 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { if (!SimplifiedValue) return Str + std::string("none"); - if (!SimplifiedValue.getValue()) + if (!SimplifiedValue.value()) return Str + std::string("nullptr"); - if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue())) + if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.value())) return Str + std::to_string(CI->getSExtValue()); return Str + std::string("unknown"); @@ -4456,7 +4459,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional<Value *> { assert((isValidState() || - (SimplifiedValue && SimplifiedValue.getValue() == nullptr)) && + (SimplifiedValue && SimplifiedValue.value() == nullptr)) && "Unexpected invalid state!"); if (!isAtFixpoint()) { diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index ae787be40c55..8eef82675e86 100644 --- 
a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -898,183 +898,6 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createAnnotationRemarksLegacyPass()); } -void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { - // Load sample profile before running the LTO optimization pipeline. - if (!PGOSampleUse.empty()) { - PM.add(createPruneEHPass()); - PM.add(createSampleProfileLoaderPass(PGOSampleUse)); - } - - // Remove unused virtual tables to improve the quality of code generated by - // whole-program devirtualization and bitset lowering. - PM.add(createGlobalDCEPass()); - - // Provide AliasAnalysis services for optimizations. - addInitialAliasAnalysisPasses(PM); - - // Allow forcing function attributes as a debugging and tuning aid. - PM.add(createForceFunctionAttrsLegacyPass()); - - // Infer attributes about declarations if possible. - PM.add(createInferFunctionAttrsLegacyPass()); - - if (OptLevel > 1) { - // Split call-site with more constrained arguments. - PM.add(createCallSiteSplittingPass()); - - // Propage constant function arguments by specializing the functions. - if (EnableFunctionSpecialization && OptLevel > 2) - PM.add(createFunctionSpecializationPass()); - - // Propagate constants at call sites into the functions they call. This - // opens opportunities for globalopt (and inlining) by substituting function - // pointers passed as arguments to direct uses of functions. - PM.add(createIPSCCPPass()); - - // Attach metadata to indirect call sites indicating the set of functions - // they may target at run-time. This should follow IPSCCP. - PM.add(createCalledValuePropagationPass()); - - // Infer attributes on declarations, call sites, arguments, etc. - if (AttributorRun & AttributorRunOption::MODULE) - PM.add(createAttributorLegacyPass()); - } - - // Infer attributes about definitions. 
The readnone attribute in particular is - // required for virtual constant propagation. - PM.add(createPostOrderFunctionAttrsLegacyPass()); - PM.add(createReversePostOrderFunctionAttrsPass()); - - // Split globals using inrange annotations on GEP indices. This can help - // improve the quality of generated code when virtual constant propagation or - // control flow integrity are enabled. - PM.add(createGlobalSplitPass()); - - // Apply whole-program devirtualization and virtual constant propagation. - PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); - - // That's all we need at opt level 1. - if (OptLevel == 1) - return; - - // Now that we internalized some globals, see if we can hack on them! - PM.add(createGlobalOptimizerPass()); - // Promote any localized global vars. - PM.add(createPromoteMemoryToRegisterPass()); - - // Linking modules together can lead to duplicated global constants, only - // keep one copy of each constant. - PM.add(createConstantMergePass()); - - // Remove unused arguments from functions. - PM.add(createDeadArgEliminationPass()); - - // Reduce the code after globalopt and ipsccp. Both can open up significant - // simplification opportunities, and both can propagate functions through - // function pointers. When this happens, we often have to resolve varargs - // calls, etc, so let instcombine do this. - if (OptLevel > 2) - PM.add(createAggressiveInstCombinerPass()); - PM.add(createInstructionCombiningPass()); - addExtensionsToPM(EP_Peephole, PM); - - // Inline small functions - bool RunInliner = Inliner; - if (RunInliner) { - PM.add(Inliner); - Inliner = nullptr; - } - - PM.add(createPruneEHPass()); // Remove dead EH info. - - // Infer attributes on declarations, call sites, arguments, etc. for an SCC. - if (AttributorRun & AttributorRunOption::CGSCC) - PM.add(createAttributorCGSCCLegacyPass()); - - // Try to perform OpenMP specific optimizations. This is a (quick!) 
no-op if - // there are no OpenMP runtime calls present in the module. - if (OptLevel > 1) - PM.add(createOpenMPOptCGSCCLegacyPass()); - - // Optimize globals again if we ran the inliner. - if (RunInliner) - PM.add(createGlobalOptimizerPass()); - PM.add(createGlobalDCEPass()); // Remove dead functions. - - // The IPO passes may leave cruft around. Clean up after them. - PM.add(createInstructionCombiningPass()); - addExtensionsToPM(EP_Peephole, PM); - PM.add(createJumpThreadingPass()); - - // Break up allocas - PM.add(createSROAPass()); - - // LTO provides additional opportunities for tailcall elimination due to - // link-time inlining, and visibility of nocapture attribute. - if (OptLevel > 1) - PM.add(createTailCallEliminationPass()); - - // Infer attributes on declarations, call sites, arguments, etc. - PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture. - // Run a few AA driven optimizations here and now, to cleanup the code. - PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, - /*AllowSpeculation=*/true)); - PM.add(NewGVN ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. - PM.add(createMemCpyOptPass()); // Remove dead memcpys. - - // Nuke dead stores. - PM.add(createDeadStoreEliminationPass()); - PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. - - // More loops are countable; try to optimize them. - if (EnableLoopFlatten) - PM.add(createLoopFlattenPass()); - PM.add(createIndVarSimplifyPass()); - PM.add(createLoopDeletionPass()); - if (EnableLoopInterchange) - PM.add(createLoopInterchangePass()); - - if (EnableConstraintElimination) - PM.add(createConstraintEliminationPass()); - - // Unroll small loops and perform peeling. 
- PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createLoopDistributePass()); - - addVectorPasses(PM, /* IsFullLTO */ true); - - addExtensionsToPM(EP_Peephole, PM); - - PM.add(createJumpThreadingPass()); -} - -void PassManagerBuilder::addLateLTOOptimizationPasses( - legacy::PassManagerBase &PM) { - // See comment in the new PM for justification of scheduling splitting at - // this stage (\ref buildLTODefaultPipeline). - if (EnableHotColdSplit) - PM.add(createHotColdSplittingPass()); - - // Delete basic blocks, which optimization passes may have killed. - PM.add( - createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true))); - - // Drop bodies of available externally objects to improve GlobalDCE. - PM.add(createEliminateAvailableExternallyPass()); - - // Now that we have optimized the program, discard unreachable functions. - PM.add(createGlobalDCEPass()); - - // FIXME: this is profitable (for compiler time) to do at -O0 too, but - // currently it damages debug info. 
- if (MergeFunctions) - PM.add(createMergeFunctionsPass()); -} - LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() { PassManagerBuilder *PMB = new PassManagerBuilder(); return wrap(PMB); diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index 6859953de962..764fd57d245f 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -130,7 +130,7 @@ void ContextTrieNode::addFunctionSize(uint32_t FSize) { if (!FuncSize) FuncSize = 0; - FuncSize = FuncSize.getValue() + FSize; + FuncSize = FuncSize.value() + FSize; } LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; } diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 40de69bbf2cf..55fee213cd5f 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1350,14 +1350,14 @@ SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { Optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); - return Cost ? !!Cost.getValue() : false; + return Cost ? !!Cost.value() : false; } InlineCost SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { if (Optional<InlineCost> ReplayCost = getExternalInlineAdvisorCost(*Candidate.CallInstr)) - return ReplayCost.getValue(); + return ReplayCost.value(); // Adjust threshold based on call site hotness, only do this for callsite // prioritized inliner because otherwise cost-benefit check is done earlier. 
int SampleThreshold = SampleColdCallSiteThreshold; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index f4d8b79a5311..535a7736454c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1660,8 +1660,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { Constant *MulC; if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)), m_Deferred(X)))) { - MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0)); - return BinaryOperator::CreateFMulFMF(X, MulC, &I); + if (Constant *NewMulC = ConstantFoldBinaryOpOperands( + Instruction::FAdd, MulC, ConstantFP::get(I.getType(), 1.0), DL)) + return BinaryOperator::CreateFMulFMF(X, NewMulC, &I); } if (Value *V = FAddCombine(Builder).simplify(&I)) @@ -1750,6 +1751,52 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS, return Builder.CreateIntCast(Result, Ty, true); } +static Instruction *foldSubOfMinMax(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Type *Ty = I.getType(); + auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op1); + if (!MinMax) + return nullptr; + + // sub(add(X,Y), s/umin(X,Y)) --> s/umax(X,Y) + // sub(add(X,Y), s/umax(X,Y)) --> s/umin(X,Y) + Value *X = MinMax->getLHS(); + Value *Y = MinMax->getRHS(); + if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && + (Op0->hasOneUse() || Op1->hasOneUse())) { + Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); + Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + return CallInst::Create(F, {X, Y}); + } + + // sub(add(X,Y),umin(Y,Z)) --> add(X,usub.sat(Y,Z)) + // sub(add(X,Z),umin(Y,Z)) --> add(X,usub.sat(Z,Y)) + Value *Z; + if (match(Op1, m_OneUse(m_UMin(m_Value(Y), m_Value(Z))))) { + if (match(Op0, m_OneUse(m_c_Add(m_Specific(Y), m_Value(X))))) { + Value 
*USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, Ty, {Y, Z}); + return BinaryOperator::CreateAdd(X, USub); + } + if (match(Op0, m_OneUse(m_c_Add(m_Specific(Z), m_Value(X))))) { + Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, Ty, {Z, Y}); + return BinaryOperator::CreateAdd(X, USub); + } + } + + // sub Op0, smin((sub nsw Op0, Z), 0) --> smax Op0, Z + // sub Op0, smax((sub nsw Op0, Z), 0) --> smin Op0, Z + if (MinMax->isSigned() && match(Y, m_ZeroInt()) && + match(X, m_NSWSub(m_Specific(Op0), m_Value(Z)))) { + Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); + Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + return CallInst::Create(F, {Op0, Z}); + } + + return nullptr; +} + Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Value *V = simplifySubInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), @@ -1919,14 +1966,12 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2)); } + // If there's no chance any bit will need to borrow from an adjacent bit: + // sub C, X --> xor X, C const APInt *Op0C; - if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) { - // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known - // zero. 
- KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); - if ((*Op0C | RHSKnown.Zero).isAllOnes()) - return BinaryOperator::CreateXor(Op1, Op0); - } + if (match(Op0, m_APInt(Op0C)) && + (~computeKnownBits(Op1, 0, &I).Zero).isSubsetOf(*Op0C)) + return BinaryOperator::CreateXor(Op1, Op0); { Value *Y; @@ -2016,36 +2061,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { } } - if (auto *II = dyn_cast<MinMaxIntrinsic>(Op1)) { - { - // sub(add(X,Y), s/umin(X,Y)) --> s/umax(X,Y) - // sub(add(X,Y), s/umax(X,Y)) --> s/umin(X,Y) - Value *X = II->getLHS(); - Value *Y = II->getRHS(); - if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && - (Op0->hasOneUse() || Op1->hasOneUse())) { - Intrinsic::ID InvID = getInverseMinMaxIntrinsic(II->getIntrinsicID()); - Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y); - return replaceInstUsesWith(I, InvMaxMin); - } - } - - { - // sub(add(X,Y),umin(Y,Z)) --> add(X,usub.sat(Y,Z)) - // sub(add(X,Z),umin(Y,Z)) --> add(X,usub.sat(Z,Y)) - Value *X, *Y, *Z; - if (match(Op1, m_OneUse(m_UMin(m_Value(Y), m_Value(Z))))) { - if (match(Op0, m_OneUse(m_c_Add(m_Specific(Y), m_Value(X))))) - return BinaryOperator::CreateAdd( - X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(), - {Y, Z})); - if (match(Op0, m_OneUse(m_c_Add(m_Specific(Z), m_Value(X))))) - return BinaryOperator::CreateAdd( - X, Builder.CreateIntrinsic(Intrinsic::usub_sat, I.getType(), - {Z, Y})); - } - } - } + if (Instruction *R = foldSubOfMinMax(I, Builder)) + return R; { // If we have a subtraction between some value and a select between @@ -2437,13 +2454,15 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { // (X * C) - X --> X * (C - 1.0) if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) { - Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0)); - return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I); + if (Constant *CSubOne = ConstantFoldBinaryOpOperands( + Instruction::FSub, C, ConstantFP::get(Ty, 1.0), DL)) + 
return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I); } // X - (X * C) --> X * (1.0 - C) if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) { - Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C); - return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I); + if (Constant *OneSubC = ConstantFoldBinaryOpOperands( + Instruction::FSub, ConstantFP::get(Ty, 1.0), C, DL)) + return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I); } // Reassociate fsub/fadd sequences to create more fadd instructions and diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ae8865651ece..a8f2cd79830a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1771,6 +1771,16 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { return new ZExtInst(IsZero, Ty); } + // (-(X & 1)) & Y --> (X & 1) == 0 ? 0 : Y + Value *Neg; + if (match(&I, + m_c_And(m_CombineAnd(m_Value(Neg), + m_OneUse(m_Neg(m_And(m_Value(), m_One())))), + m_Value(Y)))) { + Value *Cmp = Builder.CreateIsNull(Neg); + return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Y); + } + const APInt *C; if (match(Op1, m_APInt(C))) { const APInt *XorC; @@ -1798,7 +1808,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { unsigned Width = Ty->getScalarSizeInBits(); const APInt *ShiftC; - if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) { + if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC))))) && + ShiftC->ult(Width)) { if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) { // We are clearing high bits that were potentially set by sext+ashr: // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 2540e545ae4d..0327efbf9614 100644 --- 
a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -61,7 +61,13 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) { /// equivalent to its value operand. bool isSaturating(AtomicRMWInst& RMWI) { if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand())) - switch(RMWI.getOperation()) { + switch (RMWI.getOperation()) { + case AtomicRMWInst::FMax: + // maxnum(x, +inf) -> +inf + return !CF->isNegative() && CF->isInfinity(); + case AtomicRMWInst::FMin: + // minnum(x, -inf) -> +inf + return CF->isNegative() && CF->isInfinity(); case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: return CF->isNaN(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 67ef2e895b6c..edfdf70c2b97 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1543,7 +1543,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { !ShAmtC->containsConstantExpression()) { // Canonicalize a shift amount constant operand to modulo the bit-width. 
Constant *WidthC = ConstantInt::get(Ty, BitWidth); - Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); + Constant *ModuloC = + ConstantFoldBinaryOpOperands(Instruction::URem, ShAmtC, WidthC, DL); + if (!ModuloC) + return nullptr; if (ModuloC != ShAmtC) return replaceOperand(*II, 2, ModuloC); @@ -2679,7 +2682,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Handle target specific intrinsics Optional<Instruction *> V = targetInstCombineIntrinsic(*II); if (V) - return V.getValue(); + return V.value(); break; } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index e9e779b8619b..a9a930555b3c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1756,11 +1756,12 @@ static bool isKnownExactCastIntToFP(CastInst &I, InstCombinerImpl &IC) { // TODO: // Try harder to find if the source integer type has less significant bits. - // For example, compute number of sign bits or compute low bit mask. + // For example, compute number of sign bits. KnownBits SrcKnown = IC.computeKnownBits(Src, 0, &I); - int LowBits = - (int)SrcTy->getScalarSizeInBits() - SrcKnown.countMinLeadingZeros(); - if (LowBits <= DestNumSigBits) + int SigBits = (int)SrcTy->getScalarSizeInBits() - + SrcKnown.countMinLeadingZeros() - + SrcKnown.countMinTrailingZeros(); + if (SigBits <= DestNumSigBits) return true; return false; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d1f89973caa1..9f6d36b85522 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1436,7 +1436,7 @@ Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) { // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...). 
Constant *C = dyn_cast<Constant>(Op1); - if (!C || C->canTrap()) + if (!C) return nullptr; if (auto *Phi = dyn_cast<PHINode>(Op0)) @@ -1777,11 +1777,16 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, return new ICmpInst(NewPred, X, Zero); } + APInt NewC2 = *C2; + KnownBits Know = computeKnownBits(And->getOperand(0), 0, And); + // Set high zeros of C2 to allow matching negated power-of-2. + NewC2 = *C2 + APInt::getHighBitsSet(C2->getBitWidth(), + Know.countMinLeadingZeros()); + // Restrict this fold only for single-use 'and' (PR10267). // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two. - if ((~(*C2) + 1).isPowerOf2()) { - Constant *NegBOC = - ConstantExpr::getNeg(cast<Constant>(And->getOperand(1))); + if (NewC2.isNegatedPowerOf2()) { + Constant *NegBOC = ConstantInt::get(And->getType(), -NewC2); auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; return new ICmpInst(NewPred, X, NegBOC); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 2a34edbf6cb8..8cb09cbac86f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -505,20 +505,23 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { Constant *C1; if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { // (C1 / X) * C --> (C * C1) / X - Constant *CC1 = ConstantExpr::getFMul(C, C1); - if (CC1->isNormalFP()) + Constant *CC1 = + ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL); + if (CC1 && CC1->isNormalFP()) return BinaryOperator::CreateFDivFMF(CC1, X, &I); } if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { // (X / C1) * C --> X * (C / C1) - Constant *CDivC1 = ConstantExpr::getFDiv(C, C1); - if (CDivC1->isNormalFP()) + Constant *CDivC1 = + ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C1, DL); + if (CDivC1 && CDivC1->isNormalFP()) return 
BinaryOperator::CreateFMulFMF(X, CDivC1, &I); // If the constant was a denormal, try reassociating differently. // (X / C1) * C --> X / (C1 / C) - Constant *C1DivC = ConstantExpr::getFDiv(C1, C); - if (Op0->hasOneUse() && C1DivC->isNormalFP()) + Constant *C1DivC = + ConstantFoldBinaryOpOperands(Instruction::FDiv, C1, C, DL); + if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP()) return BinaryOperator::CreateFDivFMF(X, C1DivC, &I); } @@ -527,15 +530,19 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { // further folds and (X * C) + C2 is 'fma'. if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) { // (X + C1) * C --> (X * C) + (C * C1) - Constant *CC1 = ConstantExpr::getFMul(C, C1); - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + if (Constant *CC1 = ConstantFoldBinaryOpOperands( + Instruction::FMul, C, C1, DL)) { + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + } } if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) { // (C1 - X) * C --> (C * C1) - (X * C) - Constant *CC1 = ConstantExpr::getFMul(C, C1); - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + if (Constant *CC1 = ConstantFoldBinaryOpOperands( + Instruction::FMul, C, C1, DL)) { + Value *XC = Builder.CreateFMulFMF(X, C, &I); + return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + } } } @@ -1232,8 +1239,10 @@ static Instruction *foldFDivConstantDivisor(BinaryOperator &I) { // on all targets. // TODO: Use Intrinsic::canonicalize or let function attributes tell us that // denorms are flushed? 
- auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C); - if (!RecipC->isNormalFP()) + const DataLayout &DL = I.getModule()->getDataLayout(); + auto *RecipC = ConstantFoldBinaryOpOperands( + Instruction::FDiv, ConstantFP::get(I.getType(), 1.0), C, DL); + if (!RecipC || !RecipC->isNormalFP()) return nullptr; // X / C --> X * (1 / C) @@ -1256,12 +1265,13 @@ static Instruction *foldFDivConstantDividend(BinaryOperator &I) { // Try to reassociate C / X expressions where X includes another constant. Constant *C2, *NewC = nullptr; + const DataLayout &DL = I.getModule()->getDataLayout(); if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) { // C / (X * C2) --> (C / C2) / X - NewC = ConstantExpr::getFDiv(C, C2); + NewC = ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C2, DL); } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) { // C / (X / C2) --> (C * C2) / X - NewC = ConstantExpr::getFMul(C, C2); + NewC = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C2, DL); } // Disallow denormal constants because we don't know what would happen // on all targets. 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 9d4c01ac03e2..febd0f51d25f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -925,7 +925,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic( *II, DemandedMask, Known, KnownBitsComputed); if (V) - return V.getValue(); + return V.value(); break; } } @@ -1636,7 +1636,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, *II, DemandedElts, UndefElts, UndefElts2, UndefElts3, simplifyAndSetOp); if (V) - return V.getValue(); + return V.value(); break; } } // switch on IntrinsicID diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 22659a8e4951..b80c58183dd5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -228,8 +228,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { // truncate a subset of scalar bits of an insert op. if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) { Value *Scalar; + Value *Vec; uint64_t InsIndexC; - if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar), + if (!match(X, m_InsertElt(m_Value(Vec), m_Value(Scalar), m_ConstantInt(InsIndexC)))) return nullptr; @@ -239,8 +240,19 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { // of elements 4-7 of the bitcasted vector. unsigned NarrowingRatio = NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue(); - if (ExtIndexC / NarrowingRatio != InsIndexC) + + if (ExtIndexC / NarrowingRatio != InsIndexC) { + // Remove insertelement, if we don't use the inserted element. 
+ // extractelement (bitcast (insertelement (Vec, b)), a) -> + // extractelement (bitcast (Vec), a) + // FIXME: this should be removed to SimplifyDemandedVectorElts, + // once scale vectors are supported. + if (X->hasOneUse() && Ext.getVectorOperand()->hasOneUse()) { + Value *NewBC = Builder.CreateBitCast(Vec, Ext.getVectorOperandType()); + return ExtractElementInst::Create(NewBC, Ext.getIndexOperand()); + } return nullptr; + } // We are extracting part of the original scalar. How that scalar is // inserted into the vector depends on the endian-ness. Example: diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 0816a4a575d9..75520a0c8d5f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -523,11 +523,12 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" // if C1 and C2 are constants. Value *A, *B; - Constant *C1, *C2; + Constant *C1, *C2, *CRes; if (Op0 && Op1 && Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode && match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) && - match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) { + match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2)))) && + (CRes = ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL))) { bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0) && hasNoUnsignedWrap(*Op1); @@ -544,7 +545,7 @@ bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { InsertNewInstWith(NewBO, I); NewBO->takeName(Op1); replaceOperand(I, 0, NewBO); - replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2)); + replaceOperand(I, 1, CRes); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. 
ClearSubclassDataAfterReassociation(I); @@ -1324,6 +1325,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter)) return nullptr; + // Fold constants for the predecessor block with constant incoming values. + Constant *NewC = ConstantFoldBinaryOpOperands(BO.getOpcode(), C0, C1, DL); + if (!NewC) + return nullptr; + // Make a new binop in the predecessor block with the non-constant incoming // values. Builder.SetInsertPoint(PredBlockBranch); @@ -1333,9 +1339,6 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO)) NotFoldedNewBO->copyIRFlags(&BO); - // Fold constants for the predecessor block with constant incoming values. - Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1); - // Replace the binop with a phi of the new values. The old phis are dead. PHINode *NewPhi = PHINode::Create(BO.getType(), 2); NewPhi->addIncoming(NewBO, OtherBB); @@ -1774,9 +1777,10 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { // for target-independent shuffle creation. if (I >= SrcVecNumElts || ShMask[I] < 0) { Constant *MaybeUndef = - ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) - : ConstantExpr::get(Opcode, CElt, UndefScalar); - if (!match(MaybeUndef, m_Undef())) { + ConstOp1 + ? 
ConstantFoldBinaryOpOperands(Opcode, UndefScalar, CElt, DL) + : ConstantFoldBinaryOpOperands(Opcode, CElt, UndefScalar, DL); + if (!MaybeUndef || !match(MaybeUndef, m_Undef())) { MayChange = false; break; } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 7a5a74aa4fff..4fed4bd18fb1 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -101,6 +101,7 @@ static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL; static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000; static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44; static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52; +static const uint64_t kMIPS_ShadowOffsetN32 = 1ULL << 29; static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; @@ -476,6 +477,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, TargetTriple.getArch() == Triple::ppc64le; bool IsSystemZ = TargetTriple.getArch() == Triple::systemz; bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64; + bool IsMIPSN32ABI = TargetTriple.getEnvironment() == Triple::GNUABIN32; bool IsMIPS32 = TargetTriple.isMIPS32(); bool IsMIPS64 = TargetTriple.isMIPS64(); bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb(); @@ -496,6 +498,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, if (LongSize == 32) { if (IsAndroid) Mapping.Offset = kDynamicShadowSentinel; + else if (IsMIPSN32ABI) + Mapping.Offset = kMIPS_ShadowOffsetN32; else if (IsMIPS32) Mapping.Offset = kMIPS32_ShadowOffset32; else if (IsFreeBSD) diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp index b11b84d65d23..57c491436b93 100644 --- 
a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp +++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp @@ -39,7 +39,8 @@ addModuleFlags(Module &M, Nodes.push_back(MDNode::get(Context, Vals)); } - M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes)); + M.addModuleFlag(Module::Append, "CG Profile", + MDTuple::getDistinct(Context, Nodes)); return true; } diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 218b4bbfb6c0..b01c74320380 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -180,11 +180,31 @@ static cl::opt<bool> ClWithTls( "platforms that support this"), cl::Hidden, cl::init(true)); -static cl::opt<bool> - ClRecordStackHistory("hwasan-record-stack-history", - cl::desc("Record stack frames with tagged allocations " - "in a thread-local ring buffer"), - cl::Hidden, cl::init(true)); +// Mode for selecting how to insert frame record info into the stack ring +// buffer. +enum RecordStackHistoryMode { + // Do not record frame record info. + none, + + // Insert instructions into the prologue for storing into the stack ring + // buffer directly. + instr, + + // Add a call to __hwasan_add_frame_record in the runtime. 
+ libcall, +}; + +static cl::opt<RecordStackHistoryMode> ClRecordStackHistory( + "hwasan-record-stack-history", + cl::desc("Record stack frames with tagged allocations in a thread-local " + "ring buffer"), + cl::values(clEnumVal(none, "Do not record stack ring history"), + clEnumVal(instr, "Insert instructions into the prologue for " + "storing into the stack ring buffer directly"), + clEnumVal(libcall, "Add a call to __hwasan_add_frame_record for " + "storing into the stack ring buffer")), + cl::Hidden, cl::init(instr)); + static cl::opt<bool> ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics", cl::desc("instrument memory intrinsics"), @@ -313,6 +333,7 @@ public: Value *getPC(IRBuilder<> &IRB); Value *getSP(IRBuilder<> &IRB); + Value *getFrameRecordInfo(IRBuilder<> &IRB); void instrumentPersonalityFunctions(); @@ -378,6 +399,7 @@ private: FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; + FunctionCallee HwasanRecordFrameRecordFunc; Constant *ShadowGlobal; @@ -629,6 +651,9 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HwasanGenerateTagFunc = M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty); + HwasanRecordFrameRecordFunc = M.getOrInsertFunction( + "__hwasan_add_frame_record", IRB.getVoidTy(), Int64Ty); + ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow", ArrayType::get(IRB.getInt8Ty(), 0)); @@ -1132,6 +1157,21 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { return CachedSP; } +Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) { + // Prepare ring buffer data. + Value *PC = getPC(IRB); + Value *SP = getSP(IRB); + + // Mix SP and PC. 
+ // Assumptions: + // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) + // SP is 0xsssssssssssSSSS0 (4 lower bits are zero) + // We only really need ~20 lower non-zero bits (SSSS), so we mix like this: + // 0xSSSSPPPPPPPPPPPP + SP = IRB.CreateShl(SP, 44); + return IRB.CreateOr(PC, SP); +} + void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { if (!Mapping.InTls) ShadowBase = getShadowNonTls(IRB); @@ -1141,50 +1181,67 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { if (!WithFrameRecord && ShadowBase) return; - Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); - assert(SlotPtr); + Value *SlotPtr = nullptr; + Value *ThreadLong = nullptr; + Value *ThreadLongMaybeUntagged = nullptr; - Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. - Value *ThreadLongMaybeUntagged = - TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); + auto getThreadLongMaybeUntagged = [&]() { + if (!SlotPtr) + SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); + if (!ThreadLong) + ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); + // Extract the address field from ThreadLong. Unnecessary on AArch64 with + // TBI. + return TargetTriple.isAArch64() ? ThreadLong + : untagPointer(IRB, ThreadLong); + }; if (WithFrameRecord) { - StackBaseTag = IRB.CreateAShr(ThreadLong, 3); - - // Prepare ring buffer data. - Value *PC = getPC(IRB); - Value *SP = getSP(IRB); + switch (ClRecordStackHistory) { + case libcall: { + // Emit a runtime call into hwasan rather than emitting instructions for + // recording stack history. + Value *FrameRecordInfo = getFrameRecordInfo(IRB); + IRB.CreateCall(HwasanRecordFrameRecordFunc, {FrameRecordInfo}); + break; + } + case instr: { + ThreadLongMaybeUntagged = getThreadLongMaybeUntagged(); - // Mix SP and PC. 
- // Assumptions: - // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero) - // SP is 0xsssssssssssSSSS0 (4 lower bits are zero) - // We only really need ~20 lower non-zero bits (SSSS), so we mix like this: - // 0xSSSSPPPPPPPPPPPP - SP = IRB.CreateShl(SP, 44); + StackBaseTag = IRB.CreateAShr(ThreadLong, 3); - // Store data to ring buffer. - Value *RecordPtr = - IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0)); - IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr); + // Store data to ring buffer. + Value *FrameRecordInfo = getFrameRecordInfo(IRB); + Value *RecordPtr = IRB.CreateIntToPtr(ThreadLongMaybeUntagged, + IntptrTy->getPointerTo(0)); + IRB.CreateStore(FrameRecordInfo, RecordPtr); - // Update the ring buffer. Top byte of ThreadLong defines the size of the - // buffer in pages, it must be a power of two, and the start of the buffer - // must be aligned by twice that much. Therefore wrap around of the ring - // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). - // The use of AShr instead of LShr is due to - // https://bugs.llvm.org/show_bug.cgi?id=39030 - // Runtime library makes sure not to use the highest bit. - Value *WrapMask = IRB.CreateXor( - IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), - ConstantInt::get(IntptrTy, (uint64_t)-1)); - Value *ThreadLongNew = IRB.CreateAnd( - IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask); - IRB.CreateStore(ThreadLongNew, SlotPtr); + // Update the ring buffer. Top byte of ThreadLong defines the size of the + // buffer in pages, it must be a power of two, and the start of the buffer + // must be aligned by twice that much. Therefore wrap around of the ring + // buffer is simply Addr &= ~((ThreadLong >> 56) << 12). + // The use of AShr instead of LShr is due to + // https://bugs.llvm.org/show_bug.cgi?id=39030 + // Runtime library makes sure not to use the highest bit. 
+ Value *WrapMask = IRB.CreateXor( + IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true), + ConstantInt::get(IntptrTy, (uint64_t)-1)); + Value *ThreadLongNew = IRB.CreateAnd( + IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask); + IRB.CreateStore(ThreadLongNew, SlotPtr); + break; + } + case none: { + llvm_unreachable( + "A stack history recording mode should've been selected."); + } + } } if (!ShadowBase) { + if (!ThreadLongMaybeUntagged) + ThreadLongMaybeUntagged = getThreadLongMaybeUntagged(); + // Get shadow base address by aligning RecordPtr up. // Note: this is not correct if the pointer is already aligned. // Runtime library will make sure this never happens. @@ -1408,7 +1465,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F, Instruction *InsertPt = &*F.getEntryBlock().begin(); IRBuilder<> EntryIRB(InsertPt); emitPrologue(EntryIRB, - /*WithFrameRecord*/ ClRecordStackHistory && + /*WithFrameRecord*/ ClRecordStackHistory != none && Mapping.WithFrameRecord && !SInfo.AllocasToInstrument.empty()); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7843b1522830..3572cb3b50e2 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1244,6 +1244,7 @@ bool InstrProfiling::emitRuntimeHook() { auto *Var = new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. 
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index c33b1b3b1a5c..d4aa31db8337 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -486,7 +486,7 @@ static bool isTsanAtomic(const Instruction *I) { if (!SSID) return false; if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return SSID.getValue() != SyncScope::SingleThread; + return SSID.value() != SyncScope::SingleThread; return true; } diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 8a1761505d59..fe6f9486ab0c 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -611,9 +611,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, ConstCand->ConstInt->getValue()); if (Diff) { const InstructionCost ImmCosts = - TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty); + TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.value(), Ty); Cost -= ImmCosts; - LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " " + LLVM_DEBUG(dbgs() << "Offset " << Diff.value() << " " << "has penalty: " << ImmCosts << "\n" << "Adjusted cost: " << Cost << "\n"); } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 783301fe589e..b460637b7d88 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -748,14 +748,14 @@ void GVNPass::printPipeline( OS << "<"; if (Options.AllowPRE != None) - OS << (Options.AllowPRE.getValue() ? "" : "no-") << "pre;"; + OS << (Options.AllowPRE.value() ? "" : "no-") << "pre;"; if (Options.AllowLoadPRE != None) - OS << (Options.AllowLoadPRE.getValue() ? "" : "no-") << "load-pre;"; + OS << (Options.AllowLoadPRE.value() ? 
"" : "no-") << "load-pre;"; if (Options.AllowLoadPRESplitBackedge != None) - OS << (Options.AllowLoadPRESplitBackedge.getValue() ? "" : "no-") + OS << (Options.AllowLoadPRESplitBackedge.value() ? "" : "no-") << "split-backedge-load-pre;"; if (Options.AllowMemDep != None) - OS << (Options.AllowMemDep.getValue() ? "" : "no-") << "memdep"; + OS << (Options.AllowMemDep.value() ? "" : "no-") << "memdep"; OS << ">"; } @@ -1059,8 +1059,8 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo, if (DT->dominates(cast<Instruction>(OtherAccess), cast<Instruction>(U))) OtherAccess = U; else - assert(DT->dominates(cast<Instruction>(U), - cast<Instruction>(OtherAccess))); + assert(U == OtherAccess || DT->dominates(cast<Instruction>(U), + cast<Instruction>(OtherAccess))); } else OtherAccess = U; } @@ -1494,14 +1494,6 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, return false; } - // FIXME: Can we support the fallthrough edge? - if (isa<CallBrInst>(Pred->getTerminator())) { - LLVM_DEBUG( - dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '" - << Pred->getName() << "': " << *Load << '\n'); - return false; - } - if (LoadBB->isEHPad()) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" @@ -2875,11 +2867,6 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<IndirectBrInst>(PREPred->getTerminator())) return false; - // Don't do PRE across callbr. - // FIXME: Can we do this across the fallthrough edge? - if (isa<CallBrInst>(PREPred->getTerminator())) - return false; - // We can't do PRE safely on a critical edge, so instead we schedule // the edge to be split and perform the PRE the next time we iterate // on the function. 
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index e977dd18be9f..a9ca0bdc8f7b 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -106,13 +106,18 @@ static cl::opt<bool> VerifyIndvars( static cl::opt<ReplaceExitVal> ReplaceExitValue( "replexitval", cl::Hidden, cl::init(OnlyCheapRepl), cl::desc("Choose the strategy to replace exit value in IndVarSimplify"), - cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"), - clEnumValN(OnlyCheapRepl, "cheap", - "only replace exit value when the cost is cheap"), - clEnumValN(NoHardUse, "noharduse", - "only replace exit values when loop def likely dead"), - clEnumValN(AlwaysRepl, "always", - "always replace exit value whenever possible"))); + cl::values( + clEnumValN(NeverRepl, "never", "never replace exit value"), + clEnumValN(OnlyCheapRepl, "cheap", + "only replace exit value when the cost is cheap"), + clEnumValN( + UnusedIndVarInLoop, "unusedindvarinloop", + "only replace exit value when it is an unused " + "induction variable in the loop and has cheap replacement cost"), + clEnumValN(NoHardUse, "noharduse", + "only replace exit values when loop def likely dead"), + clEnumValN(AlwaysRepl, "always", + "always replace exit value whenever possible"))); static cl::opt<bool> UsePostIncrementRanges( "indvars-post-increment-ranges", cl::Hidden, @@ -1302,15 +1307,39 @@ static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken, } static void replaceLoopPHINodesWithPreheaderValues( - Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { + LoopInfo *LI, Loop *L, SmallVectorImpl<WeakTrackingVH> &DeadInsts) { assert(L->isLoopSimplifyForm() && "Should only do it in simplify form!"); auto *LoopPreheader = L->getLoopPreheader(); auto *LoopHeader = L->getHeader(); + SmallVector<Instruction *> Worklist; for (auto &PN : LoopHeader->phis()) { auto *PreheaderIncoming = 
PN.getIncomingValueForBlock(LoopPreheader); + for (User *U : PN.users()) + Worklist.push_back(cast<Instruction>(U)); PN.replaceAllUsesWith(PreheaderIncoming); DeadInsts.emplace_back(&PN); } + + // Replacing with the preheader value will often allow IV users to simplify + // (especially if the preheader value is a constant). + SmallPtrSet<Instruction *, 16> Visited; + while (!Worklist.empty()) { + auto *I = cast<Instruction>(Worklist.pop_back_val()); + if (!Visited.insert(I).second) + continue; + + // Don't simplify instructions outside the loop. + if (!L->contains(I)) + continue; + + Value *Res = simplifyInstruction(I, I->getModule()->getDataLayout()); + if (Res && LI->replacementPreservesLCSSAForm(I, Res)) { + for (User *U : I->users()) + Worklist.push_back(cast<Instruction>(U)); + I->replaceAllUsesWith(Res); + DeadInsts.emplace_back(I); + } + } } static void replaceWithInvariantCond( @@ -1549,14 +1578,19 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { if (!BI) return true; - // If already constant, nothing to do. - if (isa<Constant>(BI->getCondition())) - return true; - // Likewise, the loop latch must be dominated by the exiting BB. if (!DT->dominates(ExitingBB, L->getLoopLatch())) return true; + if (auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { + // If already constant, nothing to do. However, if this is an + // unconditional exit, we can still replace header phis with their + // preheader value. + if (!L->contains(BI->getSuccessor(CI->isNullValue()))) + replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts); + return true; + } + return false; }); @@ -1640,7 +1674,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { // the header PHIs with values coming from the preheader. 
if (ExitCount->isZero()) { foldExit(L, ExitingBB, true, DeadInsts); - replaceLoopPHINodesWithPreheaderValues(L, DeadInsts); + replaceLoopPHINodesWithPreheaderValues(LI, L, DeadInsts); Changed = true; continue; } diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 799669a19796..b54cf5e7cb20 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1710,7 +1710,7 @@ IntersectSignedRange(ScalarEvolution &SE, return None; if (!R1) return R2; - auto &R1Value = R1.getValue(); + auto &R1Value = R1.value(); // We never return empty ranges from this function, and R1 is supposed to be // a result of intersection. Thus, R1 is never empty. assert(!R1Value.isEmpty(SE, /* IsSigned */ true) && @@ -1739,7 +1739,7 @@ IntersectUnsignedRange(ScalarEvolution &SE, return None; if (!R1) return R2; - auto &R1Value = R1.getValue(); + auto &R1Value = R1.value(); // We never return empty ranges from this function, and R1 is supposed to be // a result of intersection. Thus, R1 is never empty. 
assert(!R1Value.isEmpty(SE, /* IsSigned */ false) && @@ -1950,13 +1950,12 @@ bool InductiveRangeCheckElimination::run( LS.IsSignedPredicate); if (Result) { auto MaybeSafeIterRange = - IntersectRange(SE, SafeIterRange, Result.getValue()); + IntersectRange(SE, SafeIterRange, Result.value()); if (MaybeSafeIterRange) { - assert( - !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && - "We should never return empty ranges!"); + assert(!MaybeSafeIterRange.value().isEmpty(SE, LS.IsSignedPredicate) && + "We should never return empty ranges!"); RangeChecksToEliminate.push_back(IRC); - SafeIterRange = MaybeSafeIterRange.getValue(); + SafeIterRange = MaybeSafeIterRange.value(); } } } @@ -1964,8 +1963,7 @@ bool InductiveRangeCheckElimination::run( if (!SafeIterRange) return false; - LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, - SafeIterRange.getValue()); + LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, SafeIterRange.value()); bool Changed = LC.run(); if (Changed) { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 5caefc422921..b31eab50c5ec 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1459,9 +1459,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { // If the predecessor is an indirect goto, we can't split the edge. - // Same for CallBr. - if (isa<IndirectBrInst>(P->getTerminator()) || - isa<CallBrInst>(P->getTerminator())) + if (isa<IndirectBrInst>(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) @@ -1685,9 +1683,8 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB, } // If the predecessor ends with an indirect goto, we can't change its - // destination. Same for CallBr. 
- if (isa<IndirectBrInst>(Pred->getTerminator()) || - isa<CallBrInst>(Pred->getTerminator())) + // destination. + if (isa<IndirectBrInst>(Pred->getTerminator())) continue; PredToDestList.emplace_back(Pred, DestBB); @@ -1924,10 +1921,9 @@ bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) { } // If any of predecessors end with an indirect goto, we can't change its - // destination. Same for CallBr. + // destination. if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { - return isa<IndirectBrInst>(Pred->getTerminator()) || - isa<CallBrInst>(Pred->getTerminator()); + return isa<IndirectBrInst>(Pred->getTerminator()); })) return false; @@ -2173,6 +2169,9 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, BasicBlock *ZeroPred = nullptr; BasicBlock *OnePred = nullptr; for (BasicBlock *P : predecessors(PredBB)) { + // If PredPred ends with IndirectBrInst, we can't handle it. + if (isa<IndirectBrInst>(P->getTerminator())) + continue; if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>( evaluateOnPredecessorEdge(BB, P, Cond))) { if (CI->isZero()) { diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 492f4e40395a..f54264b1dca6 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1508,8 +1508,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) return false; for (BasicBlock *BBPred : predecessors(BB)) { - if (isa<IndirectBrInst>(BBPred->getTerminator()) || - isa<CallBrInst>(BBPred->getTerminator())) + if (isa<IndirectBrInst>(BBPred->getTerminator())) return false; } return true; diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 03a10cb36bb6..b178bcae3b0e 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -602,7 +602,7 @@ private: : 
LLVMLoopDistributeFollowupCoincident}); if (PartitionID) { Loop *NewLoop = Part->getDistributedLoop(); - NewLoop->setLoopID(PartitionID.getValue()); + NewLoop->setLoopID(PartitionID.value()); } } }; @@ -826,7 +826,7 @@ public: {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback}, "llvm.loop.distribute.", true) - .getValue(); + .value(); LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 88d6a7aff3c9..d908c151d9f2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1483,7 +1483,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // anything where the alignment isn't at least the element size. assert((StoreAlign && LoadAlign) && "Expect unordered load/store to have align."); - if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize) + if (StoreAlign.value() < StoreSize || LoadAlign.value() < StoreSize) return Changed; // If the element.atomic memcpy is not lowered into explicit @@ -1497,7 +1497,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // Note that unordered atomic loads/stores are *required* by the spec to // have an alignment but non-atomic loads/stores may not. 
NewCall = Builder.CreateElementUnorderedAtomicMemCpy( - StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(), + StoreBasePtr, StoreAlign.value(), LoadBasePtr, LoadAlign.value(), NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 9959e408e2e2..4ef7809c6681 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5601,27 +5601,6 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, DeadInsts.emplace_back(OperandIsInstr); } -// Check if there are any loop exit values which are only used once within the -// loop which may potentially be optimized with a call to rewriteLoopExitValue. -static bool LoopExitValHasSingleUse(Loop *L) { - BasicBlock *ExitBB = L->getExitBlock(); - if (!ExitBB) - return false; - - for (PHINode &ExitPhi : ExitBB->phis()) { - if (ExitPhi.getNumIncomingValues() != 1) - break; - - BasicBlock *Pred = ExitPhi.getIncomingBlock(0); - Value *IVNext = ExitPhi.getIncomingValueForBlock(Pred); - // One use would be the exit phi node, and there should be only one other - // use for this to be considered. - if (IVNext->getNumUses() == 2) - return true; - } - return false; -} - /// Rewrite all the fixup locations with new values, following the chosen /// solution. void LSRInstance::ImplementSolution( @@ -6406,8 +6385,8 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, // less DWARF ops than an iteration count-based expression. 
if (Optional<APInt> Offset = SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) { - if (Offset.getValue().getMinSignedBits() <= 64) - SalvageExpr->createOffsetExpr(Offset.getValue().getSExtValue(), + if (Offset.value().getMinSignedBits() <= 64) + SalvageExpr->createOffsetExpr(Offset.value().getSExtValue(), LSRInductionVar); } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr, SE)) @@ -6627,12 +6606,12 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // When this is the case, if the exit value of the IV can be calculated using // SCEV, we can replace the exit block PHI with the final value of the IV and // skip the updates in each loop iteration. - if (L->isRecursivelyLCSSAForm(DT, LI) && LoopExitValHasSingleUse(L)) { + if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) { SmallVector<WeakTrackingVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Rewriter(SE, DL, "lsr", false); int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT, - OnlyCheapRepl, DeadInsts); + UnusedIndVarInLoop, DeadInsts); if (Rewrites) { Changed = true; RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 8c2868563227..64fcdfa15aa9 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -373,7 +373,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderInner}); if (NewInnerEpilogueLoopID) - SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue()); + SubLoop->setLoopID(NewInnerEpilogueLoopID.value()); // Find trip count and trip multiple BasicBlock *Latch = L->getLoopLatch(); @@ -403,14 +403,14 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, 
LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupRemainderOuter}); if (NewOuterEpilogueLoopID) - EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue()); + EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.value()); } Optional<MDNode *> NewInnerLoopID = makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupInner}); if (NewInnerLoopID) - SubLoop->setLoopID(NewInnerLoopID.getValue()); + SubLoop->setLoopID(NewInnerLoopID.value()); else SubLoop->setLoopID(OrigSubLoopID); @@ -419,7 +419,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter}); if (NewOuterLoopID) { - L->setLoopID(NewOuterLoopID.getValue()); + L->setLoopID(NewOuterLoopID.value()); // Do not setLoopAlreadyUnrolled if a followup was given. return UnrollResult; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index fda86afe5f9d..de5833f60adc 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1324,7 +1324,7 @@ static LoopUnrollResult tryToUnrollLoop( makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); if (RemainderLoopID) - RemainderLoop->setLoopID(RemainderLoopID.getValue()); + RemainderLoop->setLoopID(RemainderLoopID.value()); } if (UnrollResult != LoopUnrollResult::FullyUnrolled) { @@ -1332,7 +1332,7 @@ static LoopUnrollResult tryToUnrollLoop( makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupUnrolled}); if (NewLoopID) { - L->setLoopID(NewLoopID.getValue()); + L->setLoopID(NewLoopID.value()); // Do not setLoopAlreadyUnrolled if loop attributes have been specified // explicitly. 
@@ -1645,15 +1645,15 @@ void LoopUnrollPass::printPipeline( OS, MapClassName2PassName); OS << "<"; if (UnrollOpts.AllowPartial != None) - OS << (UnrollOpts.AllowPartial.getValue() ? "" : "no-") << "partial;"; + OS << (UnrollOpts.AllowPartial.value() ? "" : "no-") << "partial;"; if (UnrollOpts.AllowPeeling != None) - OS << (UnrollOpts.AllowPeeling.getValue() ? "" : "no-") << "peeling;"; + OS << (UnrollOpts.AllowPeeling.value() ? "" : "no-") << "peeling;"; if (UnrollOpts.AllowRuntime != None) - OS << (UnrollOpts.AllowRuntime.getValue() ? "" : "no-") << "runtime;"; + OS << (UnrollOpts.AllowRuntime.value() ? "" : "no-") << "runtime;"; if (UnrollOpts.AllowUpperBound != None) - OS << (UnrollOpts.AllowUpperBound.getValue() ? "" : "no-") << "upperbound;"; + OS << (UnrollOpts.AllowUpperBound.value() ? "" : "no-") << "upperbound;"; if (UnrollOpts.AllowProfileBasedPeeling != None) - OS << (UnrollOpts.AllowProfileBasedPeeling.getValue() ? "" : "no-") + OS << (UnrollOpts.AllowProfileBasedPeeling.value() ? "" : "no-") << "profile-peeling;"; if (UnrollOpts.FullUnrollMaxCount != None) OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ";"; diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index da1737979305..75f0896d4845 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" @@ -1929,11 +1930,23 @@ Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. 
+ const DataLayout &DL = I->getModule()->getDataLayout(); Constant *Cst = nullptr; unsigned Opcode = I->getOpcode(); - while (!Ops.empty() && isa<Constant>(Ops.back().Op)) { - Constant *C = cast<Constant>(Ops.pop_back_val().Op); - Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C; + while (!Ops.empty()) { + if (auto *C = dyn_cast<Constant>(Ops.back().Op)) { + if (!Cst) { + Ops.pop_back(); + Cst = C; + continue; + } + if (Constant *Res = ConstantFoldBinaryOpOperands(Opcode, C, Cst, DL)) { + Ops.pop_back(); + Cst = Res; + continue; + } + } + break; } // If there was nothing but constants then we are done. if (Ops.empty()) diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index e9983ff82176..079b2fc973b9 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -769,8 +769,7 @@ llvm::SplitAllCriticalEdges(Function &F, unsigned NumBroken = 0; for (BasicBlock &BB : F) { Instruction *TI = BB.getTerminator(); - if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) && - !isa<CallBrInst>(TI)) + if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (SplitCriticalEdge(TI, i, Options)) ++NumBroken; @@ -1132,9 +1131,7 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, // all BlockAddress uses would need to be updated. 
assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && "Cannot split an edge from an IndirectBrInst"); - assert(!isa<CallBrInst>(Preds[i]->getTerminator()) && - "Cannot split an edge from a CallBrInst"); - Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB); + Preds[i]->getTerminator()->replaceSuccessorWith(BB, NewBB); } // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 0b36e8708a03..9c595401ce29 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -129,8 +129,7 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, SmallVector<BasicBlock *, 4> LoopPreds; // Check if extra modifications will be required to preserve loop-simplify // form after splitting. If it would require splitting blocks with IndirectBr - // or CallBr terminators, bail out if preserving loop-simplify form is - // requested. + // terminators, bail out if preserving loop-simplify form is requested. if (LI) { if (Loop *TIL = LI->getLoopFor(TIBB)) { @@ -156,10 +155,7 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // Loop-simplify form can be preserved, if we can split all in-loop // predecessors. 
if (any_of(LoopPreds, [](BasicBlock *Pred) { - const Instruction *T = Pred->getTerminator(); - if (const auto *CBR = dyn_cast<CallBrInst>(T)) - return CBR->getDefaultDest() != Pred; - return isa<IndirectBrInst>(T); + return isa<IndirectBrInst>(Pred->getTerminator()); })) { if (Options.PreserveLoopSimplify) return nullptr; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f94d854f7ee8..421f1f329f07 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -927,6 +927,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::AlwaysInline: case Attribute::Cold: case Attribute::DisableSanitizerInstrumentation: + case Attribute::FnRetThunkExtern: case Attribute::Hot: case Attribute::NoRecurse: case Attribute::InlineHint: @@ -1777,7 +1778,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC, auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency()); if (Count) newFunction->setEntryCount( - ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME + ProfileCount(Count.value(), Function::PCT_Real)); // FIXME BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency()); } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 205f7a7d9ed2..24126b5ab67b 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -961,8 +961,13 @@ createDebugifyFunctionPass(enum DebugifyMode Mode, } PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - applyDebugifyMetadata(M, M.functions(), - "ModuleDebugify: ", /*ApplyToMF*/ nullptr); + if (Mode == DebugifyMode::SyntheticDebugInfo) + applyDebugifyMetadata(M, M.functions(), + "ModuleDebugify: ", /*ApplyToMF*/ nullptr); + else + collectDebugInfoMetadata(M, M.functions(), *DebugInfoBeforePass, + "ModuleDebugify (original debuginfo)", + NameOfWrappedPass); return 
PreservedAnalyses::all(); } @@ -992,8 +997,14 @@ FunctionPass *createCheckDebugifyFunctionPass( PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false, - nullptr); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, + "CheckModuleDebugify", Strip, StatsMap); + else + checkDebugInfoMetadata( + M, M.functions(), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", NameOfWrappedPass, + OrigDIVerifyBugsReportFilePath); return PreservedAnalyses::all(); } @@ -1006,13 +1017,15 @@ static bool isIgnoredPass(StringRef PassID) { void DebugifyEachInstrumentation::registerCallbacks( PassInstrumentationCallbacks &PIC) { - PIC.registerBeforeNonSkippedPassCallback([](StringRef P, Any IR) { + PIC.registerBeforeNonSkippedPassCallback([this](StringRef P, Any IR) { if (isIgnoredPass(P)) return; if (any_isa<const Function *>(IR)) - applyDebugify(*const_cast<Function *>(any_cast<const Function *>(IR))); + applyDebugify(*const_cast<Function *>(any_cast<const Function *>(IR)), + Mode, DebugInfoBeforePass, P); else if (any_isa<const Module *>(IR)) - applyDebugify(*const_cast<Module *>(any_cast<const Module *>(IR))); + applyDebugify(*const_cast<Module *>(any_cast<const Module *>(IR)), + Mode, DebugInfoBeforePass, P); }); PIC.registerAfterPassCallback([this](StringRef P, Any IR, const PreservedAnalyses &PassPA) { @@ -1022,12 +1035,24 @@ void DebugifyEachInstrumentation::registerCallbacks( auto &F = *const_cast<Function *>(any_cast<const Function *>(IR)); Module &M = *F.getParent(); auto It = F.getIterator(); - checkDebugifyMetadata(M, make_range(It, std::next(It)), P, - "CheckFunctionDebugify", /*Strip=*/true, &StatsMap); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, make_range(It, std::next(It)), P, + "CheckFunctionDebugify", /*Strip=*/true, DIStatsMap); + else + checkDebugInfoMetadata( + 
M, make_range(It, std::next(It)), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", + P, OrigDIVerifyBugsReportFilePath); } else if (any_isa<const Module *>(IR)) { auto &M = *const_cast<Module *>(any_cast<const Module *>(IR)); - checkDebugifyMetadata(M, M.functions(), P, "CheckModuleDebugify", - /*Strip=*/true, &StatsMap); + if (Mode == DebugifyMode::SyntheticDebugInfo) + checkDebugifyMetadata(M, M.functions(), P, "CheckModuleDebugify", + /*Strip=*/true, DIStatsMap); + else + checkDebugInfoMetadata( + M, M.functions(), *DebugInfoBeforePass, + "CheckModuleDebugify (original debuginfo)", + P, OrigDIVerifyBugsReportFilePath); } }); } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index cd3b6c1a095a..023a0afd329b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -402,7 +402,7 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder, Optional<MDNode *> NewLoopID = makeFollowupLoopID( LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); if (NewLoopID) { - NewLoop->setLoopID(NewLoopID.getValue()); + NewLoop->setLoopID(NewLoopID.value()); // Do not setLoopAlreadyUnrolled if loop attributes have been defined // explicitly. diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index ec898c463574..82f993b4ceab 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -75,9 +75,6 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, if (isa<IndirectBrInst>(PredBB->getTerminator())) // We cannot rewrite exiting edges from an indirectbr. return false; - if (isa<CallBrInst>(PredBB->getTerminator())) - // We cannot rewrite exiting edges from a callbr. 
- return false; InLoopPredecessors.push_back(PredBB); } else { @@ -359,7 +356,7 @@ TransformationMode llvm::hasUnrollTransformation(const Loop *L) { Optional<int> Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count"); if (Count) - return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + return Count.value() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable")) return TM_ForcedByUser; @@ -380,7 +377,7 @@ TransformationMode llvm::hasUnrollAndJamTransformation(const Loop *L) { Optional<int> Count = getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count"); if (Count) - return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; + return Count.value() == 1 ? TM_SuppressedByUser : TM_ForcedByUser; if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable")) return TM_ForcedByUser; @@ -1246,6 +1243,20 @@ static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) return true; } +/// Checks if it is safe to call InductionDescriptor::isInductionPHI for \p Phi, +/// and returns true if this Phi is an induction phi in the loop. When +/// isInductionPHI returns true, \p ID will be also be set by isInductionPHI. 
+static bool checkIsIndPhi(PHINode *Phi, Loop *L, ScalarEvolution *SE, + InductionDescriptor &ID) { + if (!Phi) + return false; + if (!L->getLoopPreheader()) + return false; + if (Phi->getParent() != L->getHeader()) + return false; + return InductionDescriptor::isInductionPHI(Phi, L, SE, ID); +} + int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, @@ -1297,6 +1308,46 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, if (!L->contains(Inst)) continue; + // Find exit values which are induction variables in the loop, and are + // unused in the loop, with the only use being the exit block PhiNode, + // and the induction variable update binary operator. + // The exit value can be replaced with the final value when it is cheap + // to do so. + if (ReplaceExitValue == UnusedIndVarInLoop) { + InductionDescriptor ID; + PHINode *IndPhi = dyn_cast<PHINode>(Inst); + if (IndPhi) { + if (!checkIsIndPhi(IndPhi, L, SE, ID)) + continue; + // This is an induction PHI. Check that the only users are PHI + // nodes, and induction variable update binary operators. + if (llvm::any_of(Inst->users(), [&](User *U) { + if (!isa<PHINode>(U) && !isa<BinaryOperator>(U)) + return true; + BinaryOperator *B = dyn_cast<BinaryOperator>(U); + if (B && B != ID.getInductionBinOp()) + return true; + return false; + })) + continue; + } else { + // If it is not an induction phi, it must be an induction update + // binary operator with an induction phi user. + BinaryOperator *B = dyn_cast<BinaryOperator>(Inst); + if (!B) + continue; + if (llvm::any_of(Inst->users(), [&](User *U) { + PHINode *Phi = dyn_cast<PHINode>(U); + if (Phi != PN && !checkIsIndPhi(Phi, L, SE, ID)) + return true; + return false; + })) + continue; + if (B != ID.getInductionBinOp()) + continue; + } + } + // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. 
Evaluate the value it // contains when the loop exits, if possible. We prefer to start with @@ -1362,7 +1413,9 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, // Only do the rewrite when the ExitValue can be expanded cheaply. // If LoopCanBeDel is true, rewrite exit value aggressively. - if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) + if ((ReplaceExitValue == OnlyCheapRepl || + ReplaceExitValue == UnusedIndVarInLoop) && + !LoopCanBeDel && Phi.HighCost) continue; Value *ExitVal = Rewriter.expandCodeFor( diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp index 8641581c8039..9914a5ca6c5e 100644 --- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -74,6 +74,10 @@ Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op, return Builder.CreateFAdd(Loaded, Inc, "new"); case AtomicRMWInst::FSub: return Builder.CreateFSub(Loaded, Inc, "new"); + case AtomicRMWInst::FMax: + return Builder.CreateMaxNum(Loaded, Inc); + case AtomicRMWInst::FMin: + return Builder.CreateMinNum(Loaded, Inc); default: llvm_unreachable("Unknown atomic op"); } diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp index b73d68ebec7c..4414b04c7264 100644 --- a/llvm/lib/Transforms/Utils/MisExpect.cpp +++ b/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -221,7 +221,7 @@ void checkBackendInstrumentation(Instruction &I, auto ExpectedWeightsOpt = extractWeights(&I, I.getContext()); if (!ExpectedWeightsOpt) return; - auto ExpectedWeights = ExpectedWeightsOpt.getValue(); + auto ExpectedWeights = ExpectedWeightsOpt.value(); verifyMisExpect(I, RealWeights, ExpectedWeights); } @@ -230,7 +230,7 @@ void checkFrontendInstrumentation(Instruction &I, auto RealWeightsOpt = extractWeights(&I, I.getContext()); if (!RealWeightsOpt) return; - auto RealWeights = RealWeightsOpt.getValue(); + auto RealWeights = RealWeightsOpt.value(); 
verifyMisExpect(I, RealWeights, ExpectedWeights); } diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 5120ade70e16..9e1492b97a86 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -255,7 +255,7 @@ void VFABI::setVectorVariantNames(CallInst *CI, LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n"); Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M); assert(VI && "Cannot add an invalid VFABI name."); - assert(M->getNamedValue(VI.getValue().VectorName) && + assert(M->getNamedValue(VI.value().VectorName) && "Cannot add variant to attribute: " "vector function declaration is missing."); } @@ -275,5 +275,13 @@ void llvm::embedBufferInModule(Module &M, MemoryBufferRef Buf, GV->setSection(SectionName); GV->setAlignment(Alignment); + LLVMContext &Ctx = M.getContext(); + NamedMDNode *MD = M.getOrInsertNamedMetadata("llvm.embedded.objects"); + Metadata *MDVals[] = {ConstantAsMetadata::get(GV), + MDString::get(Ctx, SectionName)}; + + MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); + GV->setMetadata(LLVMContext::MD_exclude, llvm::MDNode::get(Ctx, {})); + appendToCompilerUsed(M, GV); } diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index aff692b36288..bec1db896efb 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -488,31 +488,33 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoresByIndex, std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)), less_first()); + Value *ReplVal; if (I == StoresByIndex.begin()) { if (StoresByIndex.empty()) // If there are no stores, the load takes the undef value. 
- LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + ReplVal = UndefValue::get(LI->getType()); else // There is no store before this load, bail out (load may be affected // by the following stores - see main comment). return false; } else { - // Otherwise, there was a store before this load, the load takes its value. - // Note, if the load was marked as nonnull we don't want to lose that - // information when we erase it. So we preserve it with an assume. - Value *ReplVal = std::prev(I)->second->getOperand(0); - if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && - !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT)) - addAssumeNonNull(AC, LI); + // Otherwise, there was a store before this load, the load takes its + // value. + ReplVal = std::prev(I)->second->getOperand(0); + } - // If the replacement value is the load, this must occur in unreachable - // code. - if (ReplVal == LI) - ReplVal = PoisonValue::get(LI->getType()); + // Note, if the load was marked as nonnull we don't want to lose that + // information when we erase it. So we preserve it with an assume. + if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && + !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT)) + addAssumeNonNull(AC, LI); - LI->replaceAllUsesWith(ReplVal); - } + // If the replacement value is the load, this must occur in unreachable + // code. + if (ReplVal == LI) + ReplVal = PoisonValue::get(LI->getType()); + LI->replaceAllUsesWith(ReplVal); LI->eraseFromParent(); LBI.deleteValue(LI); } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index eee91e70292e..09a83f1ea094 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -208,8 +208,6 @@ private: if (!Elt) LV.markOverdefined(); // Unknown sort of constant. - else if (isa<UndefValue>(Elt)) - ; // Undef values remain unknown. else LV.markConstant(Elt); // Constants are constant. 
} @@ -356,8 +354,7 @@ public: // We only track the contents of scalar globals. if (GV->getValueType()->isSingleValueType()) { ValueLatticeElement &IV = TrackedGlobals[GV]; - if (!isa<UndefValue>(GV->getInitializer())) - IV.markConstant(GV->getInitializer()); + IV.markConstant(GV->getInitializer()); } } @@ -822,9 +819,6 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) { if (Constant *OpC = getConstant(OpSt)) { // Fold the constant as we build. Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL); - if (isa<UndefValue>(C)) - return; - // Propagate constant value markConstant(&I, C); } else if (I.getDestTy()->isIntegerTy()) { auto &LV = getValueState(&I); @@ -959,19 +953,15 @@ void SCCPInstVisitor::visitUnaryOperator(Instruction &I) { if (isOverdefined(IV)) return (void)markOverdefined(&I); - if (isConstant(V0State)) { - Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State)); - - // op Y -> undef. - if (isa<UndefValue>(C)) - return; - return (void)markConstant(IV, &I, C); - } - - // If something is undef, wait for it to resolve. - if (!isOverdefined(V0State)) + // If something is unknown/undef, wait for it to resolve. + if (V0State.isUnknownOrUndef()) return; + if (isConstant(V0State)) + if (Constant *C = ConstantFoldUnaryOpOperand(I.getOpcode(), + getConstant(V0State), DL)) + return (void)markConstant(IV, &I, C); + markOverdefined(&I); } @@ -999,9 +989,6 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL)); auto *C = dyn_cast_or_null<Constant>(R); if (C) { - // X op Y -> undef. - if (isa<UndefValue>(C)) - return; // Conservatively assume that the result may be based on operands that may // be undef. 
Note that we use mergeInValue to combine the constant with // the existing lattice value for I, as different constants might be found @@ -1050,6 +1037,7 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State); if (C) { + // TODO: getCompare() currently has incorrect handling for unknown/undef. if (isa<UndefValue>(C)) return; ValueLatticeElement CV; @@ -1095,8 +1083,6 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end()); Constant *C = ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices); - if (isa<UndefValue>(C)) - return; markConstant(&I, C); } @@ -1174,11 +1160,8 @@ void SCCPInstVisitor::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) { - if (isa<UndefValue>(C)) - return; + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) return (void)markConstant(IV, &I, C); - } } // Fall back to metadata. @@ -1223,12 +1206,8 @@ void SCCPInstVisitor::handleCallOverdefined(CallBase &CB) { // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) { - // call -> undef. - if (isa<UndefValue>(C)) - return; + if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) return (void)markConstant(&CB, C); - } } // Fall back to metadata. diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 401f1ee5a55d..0c8bf3827256 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -220,7 +220,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, // Fold a binop with constant operands. 
if (Constant *CLHS = dyn_cast<Constant>(LHS)) if (Constant *CRHS = dyn_cast<Constant>(RHS)) - return ConstantExpr::get(Opcode, CLHS, CRHS); + if (Constant *Res = ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, DL)) + return Res; // Do a quick scan to see if we have this binop nearby. If so, reuse it. unsigned ScanLimit = 6; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 567b866f7777..4b5ade99767b 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -377,18 +377,12 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, /// expensive. static InstructionCost computeSpeculationCost(const User *I, const TargetTransformInfo &TTI) { - assert(isSafeToSpeculativelyExecute(I) && + assert((!isa<Instruction>(I) || + isSafeToSpeculativelyExecute(cast<Instruction>(I))) && "Instruction is not safe to speculatively execute!"); return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency); } -/// Check whether this is a potentially trapping constant. -static bool canTrap(const Value *V) { - if (auto *C = dyn_cast<Constant>(V)) - return C->canTrap(); - return false; -} - /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -421,9 +415,9 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *I = dyn_cast<Instruction>(V); if (!I) { - // Non-instructions all dominate instructions, but not all constantexprs - // can be executed unconditionally. - return !canTrap(V); + // Non-instructions dominate all instructions and can be executed + // unconditionally. 
+ return true; } BasicBlock *PBB = I->getParent(); @@ -1473,10 +1467,7 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI, while (isa<DbgInfoIntrinsic>(I2)) I2 = &*BB2_Itr++; } - // FIXME: Can we define a safety predicate for CallBr? - if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) || - (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) || - isa<CallBrInst>(I1)) + if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2)) return false; BasicBlock *BIParent = BI->getParent(); @@ -1609,11 +1600,6 @@ HoistTerminator: if (passingValueIsAlwaysUndefined(BB1V, &PN) || passingValueIsAlwaysUndefined(BB2V, &PN)) return Changed; - - if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) - return Changed; - if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) - return Changed; } } @@ -2679,9 +2665,6 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB, passingValueIsAlwaysUndefined(ThenV, &PN)) return false; - if (canTrap(OrigV) || canTrap(ThenV)) - return false; - HaveRewritablePHIs = true; ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV); ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV); @@ -2979,10 +2962,8 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { return true; } -static ConstantInt * -getKnownValueOnEdge(Value *V, BasicBlock *From, BasicBlock *To, - SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, - ConstantInt *> &Visited) { +static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From, + BasicBlock *To) { // Don't look past the block defining the value, we might get the value from // a previous loop iteration. auto *I = dyn_cast<Instruction>(V); @@ -2996,23 +2977,7 @@ getKnownValueOnEdge(Value *V, BasicBlock *From, BasicBlock *To, return BI->getSuccessor(0) == To ? ConstantInt::getTrue(BI->getContext()) : ConstantInt::getFalse(BI->getContext()); - // Limit the amount of blocks we inspect. 
- if (Visited.size() >= 8) - return nullptr; - - auto Pair = Visited.try_emplace({From, To}, nullptr); - if (!Pair.second) - return Pair.first->second; - - // Check whether the known value is the same for all predecessors. - ConstantInt *Common = nullptr; - for (BasicBlock *Pred : predecessors(From)) { - ConstantInt *C = getKnownValueOnEdge(V, Pred, From, Visited); - if (!C || (Common && Common != C)) - return nullptr; - Common = C; - } - return Visited[{From, To}] = Common; + return nullptr; } /// If we have a conditional branch on something for which we know the constant @@ -3022,7 +2987,7 @@ static Optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL, AssumptionCache *AC) { - SmallMapVector<BasicBlock *, ConstantInt *, 8> KnownValues; + SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues; BasicBlock *BB = BI->getParent(); Value *Cond = BI->getCondition(); PHINode *PN = dyn_cast<PHINode>(Cond); @@ -3035,12 +3000,11 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, for (Use &U : PN->incoming_values()) if (auto *CB = dyn_cast<ConstantInt>(U)) - KnownValues.insert({PN->getIncomingBlock(U), CB}); + KnownValues[CB].insert(PN->getIncomingBlock(U)); } else { - SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, ConstantInt *> Visited; for (BasicBlock *Pred : predecessors(BB)) { - if (ConstantInt *CB = getKnownValueOnEdge(Cond, Pred, BB, Visited)) - KnownValues.insert({Pred, CB}); + if (ConstantInt *CB = getKnownValueOnEdge(Cond, Pred, BB)) + KnownValues[CB].insert(Pred); } } @@ -3056,29 +3020,34 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, for (const auto &Pair : KnownValues) { // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. 
- ConstantInt *CB = Pair.second; - BasicBlock *PredBB = Pair.first; + ConstantInt *CB = Pair.first; + ArrayRef<BasicBlock *> PredBBs = Pair.second.getArrayRef(); BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); if (RealDest == BB) continue; // Skip self loops. + // Skip if the predecessor's terminator is an indirect branch. - if (isa<IndirectBrInst>(PredBB->getTerminator())) + if (any_of(PredBBs, [](BasicBlock *PredBB) { + return isa<IndirectBrInst>(PredBB->getTerminator()); + })) continue; - SmallVector<DominatorTree::UpdateType, 3> Updates; + LLVM_DEBUG({ + dbgs() << "Condition " << *Cond << " in " << BB->getName() + << " has value " << *Pair.first << " in predecessors:\n"; + for (const BasicBlock *PredBB : Pair.second) + dbgs() << " " << PredBB->getName() << "\n"; + dbgs() << "Threading to destination " << RealDest->getName() << ".\n"; + }); + + // Split the predecessors we are threading into a new edge block. We'll + // clone the instructions into this block, and then redirect it to RealDest. + BasicBlock *EdgeBB = SplitBlockPredecessors(BB, PredBBs, ".critedge", DTU); - // The dest block might have PHI nodes, other predecessors and other - // difficult cases. Instead of being smart about this, just insert a new - // block that jumps to the destination block, effectively splitting - // the edge we are about to create. - BasicBlock *EdgeBB = - BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge", - RealDest->getParent(), RealDest); - BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB); - if (DTU) - Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest}); - CritEdgeBranch->setDebugLoc(BI->getDebugLoc()); + // TODO: These just exist to reduce test diff, we can drop them if we like. + EdgeBB->setName(RealDest->getName() + ".critedge"); + EdgeBB->moveBefore(RealDest); // Update PHI nodes. 
AddPredecessorToBlock(RealDest, EdgeBB, BB); @@ -3086,12 +3055,12 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, // BB may have instructions that are being threaded over. Clone these // instructions into EdgeBB. We know that there will be no uses of the // cloned instructions outside of EdgeBB. - BasicBlock::iterator InsertPt = EdgeBB->begin(); + BasicBlock::iterator InsertPt = EdgeBB->getFirstInsertionPt(); DenseMap<Value *, Value *> TranslateMap; // Track translated values. - TranslateMap[Cond] = Pair.second; + TranslateMap[Cond] = CB; for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (PHINode *PN = dyn_cast<PHINode>(BBI)) { - TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB); + TranslateMap[PN] = PN->getIncomingValueForBlock(EdgeBB); continue; } // Clone the instruction. @@ -3129,19 +3098,15 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, } } - // Loop over all of the edges from PredBB to BB, changing them to branch - // to EdgeBB instead. 
- Instruction *PredBBTI = PredBB->getTerminator(); - for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i) - if (PredBBTI->getSuccessor(i) == BB) { - BB->removePredecessor(PredBB); - PredBBTI->setSuccessor(i, EdgeBB); - } + BB->removePredecessor(EdgeBB); + BranchInst *EdgeBI = cast<BranchInst>(EdgeBB->getTerminator()); + EdgeBI->setSuccessor(0, RealDest); + EdgeBI->setDebugLoc(BI->getDebugLoc()); if (DTU) { - Updates.push_back({DominatorTree::Insert, PredBB, EdgeBB}); - Updates.push_back({DominatorTree::Delete, PredBB, BB}); - + SmallVector<DominatorTree::UpdateType, 2> Updates; + Updates.push_back({DominatorTree::Delete, EdgeBB, BB}); + Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest}); DTU->applyUpdates(Updates); } @@ -3599,13 +3564,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, Cond->getParent() != BB || !Cond->hasOneUse()) return false; - // Cond is known to be a compare or binary operator. Check to make sure that - // neither operand is a potentially-trapping constant expression. - if (canTrap(Cond->getOperand(0))) - return false; - if (canTrap(Cond->getOperand(1))) - return false; - // Finally, don't infinitely unroll conditional loops. if (is_contained(successors(BB), BB)) return false; @@ -4113,9 +4071,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (tryWidenCondBranchToCondBranch(PBI, BI, DTU)) return true; - if (canTrap(BI->getCondition())) - return false; - // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. @@ -4157,10 +4112,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // insertion of a large number of select instructions. For targets // without predication/cmovs, this is a big pessimization. 
- // Also do not perform this transformation if any phi node in the common - // destination block can trap when reached by BB or PBB (PR17073). In that - // case, it would be unsafe to hoist the operation into a select instruction. - BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); BasicBlock *RemovedDest = PBI->getSuccessor(PBIOp ^ 1); unsigned NumPhis = 0; @@ -4168,16 +4119,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, ++II, ++NumPhis) { if (NumPhis > 2) // Disable this xform. return false; - - PHINode *PN = cast<PHINode>(II); - Value *BIV = PN->getIncomingValueForBlock(BB); - if (canTrap(BIV)) - return false; - - unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent()); - Value *PBIV = PN->getIncomingValue(PBBIdx); - if (canTrap(PBIV)) - return false; } // Finally, if everything is ok, fold the branches to logical ops. @@ -6174,6 +6115,23 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, return isSwitchDense(SI->getNumCases(), TableSize); } +static bool ShouldUseSwitchConditionAsTableIndex( + ConstantInt &MinCaseVal, const ConstantInt &MaxCaseVal, + bool HasDefaultResults, const SmallDenseMap<PHINode *, Type *> &ResultTypes, + const DataLayout &DL, const TargetTransformInfo &TTI) { + if (MinCaseVal.isNullValue()) + return true; + if (MinCaseVal.isNegative() || + MaxCaseVal.getLimitedValue() == std::numeric_limits<uint64_t>::max() || + !HasDefaultResults) + return false; + return all_of(ResultTypes, [&](const auto &KV) { + return SwitchLookupTable::WouldFitInRegister( + DL, MaxCaseVal.getLimitedValue() + 1 /* TableSize */, + KV.second /* ResultType */); + }); +} + /// Try to reuse the switch table index compare. 
Following pattern: /// \code /// if (idx < tablesize) @@ -6329,9 +6287,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, } uint64_t NumResults = ResultLists[PHIs[0]].size(); - APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); - uint64_t TableSize = RangeSpread.getLimitedValue() + 1; - bool TableHasHoles = (NumResults < TableSize); // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. @@ -6340,6 +6295,22 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL, TTI); + for (const auto &I : DefaultResultsList) { + PHINode *PHI = I.first; + Constant *Result = I.second; + DefaultResults[PHI] = Result; + } + + bool UseSwitchConditionAsTableIndex = ShouldUseSwitchConditionAsTableIndex( + *MinCaseVal, *MaxCaseVal, HasDefaultResults, ResultTypes, DL, TTI); + uint64_t TableSize; + if (UseSwitchConditionAsTableIndex) + TableSize = MaxCaseVal->getLimitedValue() + 1; + else + TableSize = + (MaxCaseVal->getValue() - MinCaseVal->getValue()).getLimitedValue() + 1; + + bool TableHasHoles = (NumResults < TableSize); bool NeedMask = (TableHasHoles && !HasDefaultResults); if (NeedMask) { // As an extra penalty for the validity test we require more cases. @@ -6349,12 +6320,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, return false; } - for (const auto &I : DefaultResultsList) { - PHINode *PHI = I.first; - Constant *Result = I.second; - DefaultResults[PHI] = Result; - } - if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; @@ -6368,11 +6333,15 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Compute the table index value. 
Builder.SetInsertPoint(SI); Value *TableIndex; - if (MinCaseVal->isNullValue()) + ConstantInt *TableIndexOffset; + if (UseSwitchConditionAsTableIndex) { + TableIndexOffset = ConstantInt::get(MaxCaseVal->getType(), 0); TableIndex = SI->getCondition(); - else - TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal, - "switch.tableidx"); + } else { + TableIndexOffset = MinCaseVal; + TableIndex = + Builder.CreateSub(SI->getCondition(), TableIndexOffset, "switch.tableidx"); + } // Compute the maximum table size representable by the integer type we are // switching upon. @@ -6424,7 +6393,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Build bitmask; fill in a 1 bit for every case. const ResultListTy &ResultList = ResultLists[PHIs[0]]; for (size_t I = 0, E = ResultList.size(); I != E; ++I) { - uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue()) + uint64_t Idx = (ResultList[I].first->getValue() - TableIndexOffset->getValue()) .getLimitedValue(); MaskInt |= One << Idx; } @@ -6463,8 +6432,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If using a bitmask, use any value to fill the lookup table holes. Constant *DV = NeedMask ? 
ResultLists[PHI][0].second : DefaultResults[PHI]; StringRef FuncName = Fn->getName(); - SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL, - FuncName); + SwitchLookupTable Table(Mod, TableSize, TableIndexOffset, ResultList, DV, + DL, FuncName); Value *Result = Table.BuildLookup(TableIndex, Builder); diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index dbef1ff2e739..af15e0c31b75 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -79,21 +79,23 @@ namespace { bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); bool replaceIVUserWithLoopInvariant(Instruction *UseInst); + bool replaceFloatIVWithIntegerIV(Instruction *UseInst); bool eliminateOverflowIntrinsic(WithOverflowInst *WO); bool eliminateSaturatingIntrinsic(SaturatingInst *SI); bool eliminateTrunc(TruncInst *TI); bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand); - void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand, + bool makeIVComparisonInvariant(ICmpInst *ICmp, Instruction *IVOperand); + void eliminateIVComparison(ICmpInst *ICmp, Instruction *IVOperand); + void simplifyIVRemainder(BinaryOperator *Rem, Instruction *IVOperand, bool IsSigned); void replaceRemWithNumerator(BinaryOperator *Rem); void replaceRemWithNumeratorOrZero(BinaryOperator *Rem); void replaceSRemWithURem(BinaryOperator *Rem); bool eliminateSDiv(BinaryOperator *SDiv); - bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand); - bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand); + bool strengthenOverflowingOperation(BinaryOperator *OBO, + Instruction *IVOperand); + bool strengthenRightShift(BinaryOperator *BO, Instruction *IVOperand); }; } @@ -192,7 +194,7 @@ Value 
*SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) } bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp, - Value *IVOperand) { + Instruction *IVOperand) { unsigned IVOperIdx = 0; ICmpInst::Predicate Pred = ICmp->getPredicate(); if (IVOperand != ICmp->getOperand(0)) { @@ -261,7 +263,8 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp, /// SimplifyIVUsers helper for eliminating useless /// comparisons against an induction variable. -void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { +void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, + Instruction *IVOperand) { unsigned IVOperIdx = 0; ICmpInst::Predicate Pred = ICmp->getPredicate(); ICmpInst::Predicate OriginalPred = Pred; @@ -372,7 +375,8 @@ void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) { /// SimplifyIVUsers helper for eliminating useless remainder operations /// operating on an induction variable or replacing srem by urem. -void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand, +void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, + Instruction *IVOperand, bool IsSigned) { auto *NValue = Rem->getOperand(0); auto *DValue = Rem->getOperand(1); @@ -673,6 +677,35 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) { return true; } +/// Eliminate redundant type cast between integer and float. +bool SimplifyIndvar::replaceFloatIVWithIntegerIV(Instruction *UseInst) { + if (UseInst->getOpcode() != CastInst::SIToFP) + return false; + + Value *IVOperand = UseInst->getOperand(0); + // Get the symbolic expression for this instruction. + ConstantRange IVRange = SE->getSignedRange(SE->getSCEV(IVOperand)); + unsigned DestNumSigBits = UseInst->getType()->getFPMantissaWidth(); + if (IVRange.getActiveBits() <= DestNumSigBits) { + for (User *U : UseInst->users()) { + // Match for fptosi of sitofp and with same type. 
+ auto *CI = dyn_cast<FPToSIInst>(U); + if (!CI || IVOperand->getType() != CI->getType()) + continue; + + CI->replaceAllUsesWith(IVOperand); + DeadInsts.push_back(CI); + LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *CI + << " with: " << *IVOperand << '\n'); + + ++NumFoldedUser; + Changed = true; + } + } + + return Changed; +} + /// Eliminate any operation that SCEV can prove is an identity function. bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand) { @@ -718,18 +751,16 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, /// Annotate BO with nsw / nuw if it provably does not signed-overflow / /// unsigned-overflow. Returns true if anything changed, false otherwise. bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, - Value *IVOperand) { - SCEV::NoWrapFlags Flags; - bool Deduced; - std::tie(Flags, Deduced) = SE->getStrengthenedNoWrapFlagsFromBinOp( + Instruction *IVOperand) { + auto Flags = SE->getStrengthenedNoWrapFlagsFromBinOp( cast<OverflowingBinaryOperator>(BO)); - if (!Deduced) - return Deduced; + if (!Flags) + return false; - BO->setHasNoUnsignedWrap(ScalarEvolution::maskFlags(Flags, SCEV::FlagNUW) == + BO->setHasNoUnsignedWrap(ScalarEvolution::maskFlags(*Flags, SCEV::FlagNUW) == SCEV::FlagNUW); - BO->setHasNoSignedWrap(ScalarEvolution::maskFlags(Flags, SCEV::FlagNSW) == + BO->setHasNoSignedWrap(ScalarEvolution::maskFlags(*Flags, SCEV::FlagNSW) == SCEV::FlagNSW); // The getStrengthenedNoWrapFlagsFromBinOp() check inferred additional nowrap @@ -737,14 +768,14 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, // forgetValue() here to make sure those flags also propagate to any other // SCEV expressions based on the addrec. However, this can have pathological // compile-time impact, see https://bugs.llvm.org/show_bug.cgi?id=50384. 
- return Deduced; + return true; } /// Annotate the Shr in (X << IVOperand) >> C as exact using the /// information from the IV's range. Returns true if anything changed, false /// otherwise. bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO, - Value *IVOperand) { + Instruction *IVOperand) { using namespace llvm::PatternMatch; if (BO->getOpcode() == Instruction::Shl) { @@ -896,6 +927,13 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { } } + // Try to use integer induction for FPToSI of float induction directly. + if (replaceFloatIVWithIntegerIV(UseInst)) { + // Re-queue the potentially new direct uses of IVOperand. + pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers); + continue; + } + CastInst *Cast = dyn_cast<CastInst>(UseInst); if (V && Cast) { V->visitCast(Cast); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index f4306bb43dfd..b359717424a6 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -75,7 +75,8 @@ static bool callHasFP128Argument(const CallInst *CI) { }); } -static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { +static Value *convertStrToNumber(CallInst *CI, StringRef &Str, Value *EndPtr, + int64_t Base, IRBuilderBase &B) { if (Base < 2 || Base > 36) // handle special zero base if (Base != 0) @@ -97,6 +98,15 @@ static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result)) return nullptr; + if (EndPtr) { + // Store the pointer to the end. 
+ uint64_t ILen = End - nptr.c_str(); + Value *Off = B.getInt64(ILen); + Value *StrBeg = CI->getArgOperand(0); + Value *StrEnd = B.CreateInBoundsGEP(B.getInt8Ty(), StrBeg, Off, "endptr"); + B.CreateStore(StrEnd, EndPtr); + } + return ConstantInt::get(CI->getType(), Result); } @@ -295,31 +305,69 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B)); } +// Helper to transform memchr(S, C, N) == S to N && *S == C and, when +// NBytes is null, strchr(S, C) to *S == C. A precondition of the function +// is that either S is dereferenceable or the value of N is nonzero. +static Value* memChrToCharCompare(CallInst *CI, Value *NBytes, + IRBuilderBase &B, const DataLayout &DL) +{ + Value *Src = CI->getArgOperand(0); + Value *CharVal = CI->getArgOperand(1); + + // Fold memchr(A, C, N) == A to N && *A == C. + Type *CharTy = B.getInt8Ty(); + Value *Char0 = B.CreateLoad(CharTy, Src); + CharVal = B.CreateTrunc(CharVal, CharTy); + Value *Cmp = B.CreateICmpEQ(Char0, CharVal, "char0cmp"); + + if (NBytes) { + Value *Zero = ConstantInt::get(NBytes->getType(), 0); + Value *And = B.CreateICmpNE(NBytes, Zero); + Cmp = B.CreateLogicalAnd(And, Cmp); + } + + Value *NullPtr = Constant::getNullValue(CI->getType()); + return B.CreateSelect(Cmp, Src, NullPtr); +} + Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); + Value *CharVal = CI->getArgOperand(1); annotateNonNullNoUndefBasedOnAccess(CI, 0); + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + return memChrToCharCompare(CI, nullptr, B, DL); + // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. 
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CharC = dyn_cast<ConstantInt>(CharVal); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); if (Len) annotateDereferenceableBytes(CI, 0, Len); else return nullptr; + + Function *Callee = CI->getCalledFunction(); + FunctionType *FT = Callee->getFunctionType(); if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return copyFlags( *CI, - emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + emitMemChr(SrcStr, CharVal, // include nul. ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, DL, TLI)); } + if (CharC->isZero()) { + Value *NullPtr = Constant::getNullValue(CI->getType()); + if (isOnlyUsedInEqualityComparison(CI, NullPtr)) + // Pre-empt the transformation to strlen below and fold + // strchr(A, '\0') == null to false. + return B.CreateIntToPtr(B.getTrue(), CI->getType()); + } + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. StringRef Str; @@ -1008,8 +1056,12 @@ Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, DL)) + + if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, 0); + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + return memChrToCharCompare(CI, Size, B, DL); + } Value *CharVal = CI->getArgOperand(1); ConstantInt *CharC = dyn_cast<ConstantInt>(CharVal); @@ -1099,9 +1151,16 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { return B.CreateSelect(And, SrcStr, Sel1, "memchr.sel2"); } - if (!LenC) + if (!LenC) { + if (isOnlyUsedInEqualityComparison(CI, SrcStr)) + // S is dereferenceable so it's safe to load from it and fold + // memchr(S, C, N) == S to N && *S == C for any C and N. 
+ // TODO: This is safe even even for nonconstant S. + return memChrToCharCompare(CI, Size, B, DL); + // From now on we need a constant length and constant array. return nullptr; + } // If the char is variable but the input str and length are not we can turn // this memchr call into a simple bit field test. Of course this only works @@ -1589,31 +1648,6 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, return nullptr; } -static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) { - // Multiplications calculated using Addition Chains. - // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html - - assert(Exp != 0 && "Incorrect exponent 0 not handled"); - - if (InnerChain[Exp]) - return InnerChain[Exp]; - - static const unsigned AddChain[33][2] = { - {0, 0}, // Unused. - {0, 0}, // Unused (base case = pow1). - {1, 1}, // Unused (pre-computed). - {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, - {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, - {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, - {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, - {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, - }; - - InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), - getPow(InnerChain, AddChain[Exp][1], B)); - return InnerChain[Exp]; -} - // Return a properly extended integer (DstWidth bits wide) if the operation is // an itofp. static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { @@ -1914,70 +1948,52 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { if (Value *Sqrt = replacePowWithSqrt(Pow, B)) return Sqrt; - // pow(x, n) -> x * x * x * ... + // pow(x, n) -> powi(x, n) * sqrt(x) if n has exactly a 0.5 fraction const APFloat *ExpoF; - if (AllowApprox && match(Expo, m_APFloat(ExpoF)) && - !ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)) { - // We limit to a max of 7 multiplications, thus the maximum exponent is 32. 
- // If the exponent is an integer+0.5 we generate a call to sqrt and an - // additional fmul. - // TODO: This whole transformation should be backend specific (e.g. some - // backends might prefer libcalls or the limit for the exponent might - // be different) and it should also consider optimizing for size. - APFloat LimF(ExpoF->getSemantics(), 33), - ExpoA(abs(*ExpoF)); - if (ExpoA < LimF) { - // This transformation applies to integer or integer+0.5 exponents only. - // For integer+0.5, we create a sqrt(Base) call. - Value *Sqrt = nullptr; - if (!ExpoA.isInteger()) { - APFloat Expo2 = ExpoA; - // To check if ExpoA is an integer + 0.5, we add it to itself. If there - // is no floating point exception and the result is an integer, then - // ExpoA == integer + 0.5 - if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK) - return nullptr; - - if (!Expo2.isInteger()) - return nullptr; - - Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(), - Pow->doesNotAccessMemory(), M, B, TLI); - if (!Sqrt) - return nullptr; - } - - // We will memoize intermediate products of the Addition Chain. - Value *InnerChain[33] = {nullptr}; - InnerChain[1] = Base; - InnerChain[2] = B.CreateFMul(Base, Base, "square"); - - // We cannot readily convert a non-double type (like float) to a double. - // So we first convert it to something which could be converted to double. - ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored); - Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B); + if (match(Expo, m_APFloat(ExpoF)) && !ExpoF->isExactlyValue(0.5) && + !ExpoF->isExactlyValue(-0.5)) { + APFloat ExpoA(abs(*ExpoF)); + APFloat ExpoI(*ExpoF); + Value *Sqrt = nullptr; + if (AllowApprox && !ExpoA.isInteger()) { + APFloat Expo2 = ExpoA; + // To check if ExpoA is an integer + 0.5, we add it to itself. 
If there + // is no floating point exception and the result is an integer, then + // ExpoA == integer + 0.5 + if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK) + return nullptr; - // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x). - if (Sqrt) - FMul = B.CreateFMul(FMul, Sqrt); + if (!Expo2.isInteger()) + return nullptr; - // If the exponent is negative, then get the reciprocal. - if (ExpoF->isNegative()) - FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal"); + if (ExpoI.roundToIntegral(APFloat::rmTowardNegative) != + APFloat::opInexact) + return nullptr; + if (!ExpoI.isInteger()) + return nullptr; + ExpoF = &ExpoI; - return FMul; + Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(), + Pow->doesNotAccessMemory(), M, B, TLI); + if (!Sqrt) + return nullptr; } + // pow(x, n) -> powi(x, n) if n is a constant signed integer value APSInt IntExpo(TLI->getIntSize(), /*isUnsigned=*/false); - // powf(x, n) -> powi(x, n) if n is a constant signed integer value if (ExpoF->isInteger() && ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) == APFloat::opOK) { - return copyFlags( + Value *PowI = copyFlags( *Pow, createPowWithIntegerExponent( Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B)); + + if (PowI && Sqrt) + return B.CreateFMul(PowI, Sqrt); + + return PowI; } } @@ -2517,7 +2533,7 @@ Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; - return convertStrToNumber(CI, Str, 10); + return convertStrToNumber(CI, Str, nullptr, 10, B); } Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { @@ -2525,11 +2541,14 @@ Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return nullptr; - if (!isa<ConstantPointerNull>(CI->getArgOperand(1))) + Value *EndPtr = CI->getArgOperand(1); + if (isa<ConstantPointerNull>(EndPtr)) 
+ EndPtr = nullptr; + else if (!isKnownNonZero(EndPtr, DL)) return nullptr; if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) { - return convertStrToNumber(CI, Str, CInt->getSExtValue()); + return convertStrToNumber(CI, Str, EndPtr, CInt->getSExtValue(), B); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6242d9a93fc1..183ba86abcb4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -386,20 +386,6 @@ static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { return true; } -/// Check whether it is safe to if-convert this phi node. -/// -/// Phi nodes with constant expressions that can trap are not safe to if -/// convert. -static bool canIfConvertPHINodes(BasicBlock *BB) { - for (PHINode &Phi : BB->phis()) { - for (Value *V : Phi.incoming_values()) - if (auto *C = dyn_cast<Constant>(V)) - if (C->canTrap()) - return false; - } - return true; -} - static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -993,7 +979,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } } - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getPredicate()); return true; } @@ -1098,13 +1083,6 @@ bool LoopVectorizationLegality::blockCanBePredicated( SmallPtrSetImpl<const Instruction *> &MaskedOp, SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { for (Instruction &I : *BB) { - // Check that we don't have a constant expression that can trap as operand. - for (Value *Operand : I.operands()) { - if (auto *C = dyn_cast<Constant>(Operand)) - if (C->canTrap()) - return false; - } - // We can predicate blocks with calls to assume, as long as we drop them in // case we flatten the CFG via predication. 
if (match(&I, m_Intrinsic<Intrinsic::assume>())) { @@ -1190,7 +1168,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } // Collect the blocks that need predication. - BasicBlock *Header = TheLoop->getHeader(); for (BasicBlock *BB : TheLoop->blocks()) { // We don't support switch statements inside loops. if (!isa<BranchInst>(BB->getTerminator())) { @@ -1212,13 +1189,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { BB->getTerminator()); return false; } - } else if (BB != Header && !canIfConvertPHINodes(BB)) { - reportVectorizationFailure( - "Control flow cannot be substituted for a select", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); - return false; } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 0cb2032fa45a..2e9a9fe0640e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -33,7 +33,6 @@ class LoopInfo; class LoopVectorizationLegality; class LoopVectorizationCostModel; class PredicatedScalarEvolution; -class LoopVectorizationRequirements; class LoopVectorizeHints; class OptimizationRemarkEmitter; class TargetTransformInfo; @@ -46,8 +45,9 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands, DebugLoc DL) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); + ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name); if (BB) BB->insert(Instr, InsertPt); return Instr; @@ -55,8 +55,8 @@ class VPBuilder { VPInstruction *createInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); + DebugLoc 
DL, const Twine &Name = "") { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name); } public: @@ -124,34 +124,37 @@ public: /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as /// its underlying Instruction. VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, - Instruction *Inst = nullptr) { + Instruction *Inst = nullptr, const Twine &Name = "") { DebugLoc DL; if (Inst) DL = Inst->getDebugLoc(); - VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, - DebugLoc DL) { - return createInstruction(Opcode, Operands, DL); + DebugLoc DL, const Twine &Name = "") { + return createInstruction(Opcode, Operands, DL, Name); } - VPValue *createNot(VPValue *Operand, DebugLoc DL) { - return createInstruction(VPInstruction::Not, {Operand}, DL); + VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") { + return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL, + const Twine &Name = "") { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); } - VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL, + const Twine &Name = "") { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name); } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); + DebugLoc DL, const Twine &Name = "") { + return 
createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, + Name); } //===--------------------------------------------------------------------===// @@ -191,6 +194,10 @@ struct VectorizationFactor { /// Cost of the scalar loop. InstructionCost ScalarCost; + /// The minimum trip count required to make vectorization profitable, e.g. due + /// to runtime checks. + ElementCount MinProfitableTripCount; + VectorizationFactor(ElementCount Width, InstructionCost Cost, InstructionCost ScalarCost) : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} @@ -268,8 +275,6 @@ class LoopVectorizationPlanner { const LoopVectorizeHints &Hints; - LoopVectorizationRequirements &Requirements; - OptimizationRemarkEmitter *ORE; SmallVector<VPlanPtr, 4> VPlans; @@ -285,10 +290,9 @@ public: InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements, OptimizationRemarkEmitter *ORE) : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {} + PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. @@ -332,11 +336,6 @@ public: bool requiresTooManyRuntimeChecks() const; protected: - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b637b2d5ddae..0777a1385916 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +static cl::opt<unsigned> VectorizeMemoryCheckThreshold( + "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks")); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the @@ -442,6 +441,7 @@ public: const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) @@ -453,6 +453,11 @@ public: // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + + if (MinProfitableTripCount.isZero()) + this->MinProfitableTripCount = VecWidth; + else + this->MinProfitableTripCount = MinProfitableTripCount; } virtual ~InnerLoopVectorizer() = default; @@ -656,6 +661,8 @@ protected: /// vector elements. ElementCount VF; + ElementCount MinProfitableTripCount; + /// The vectorization unroll factor to use. 
Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -735,6 +742,7 @@ public: LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} @@ -783,8 +791,8 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, - Checks), + EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, + CM, BFI, PSI, Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || isa<VPScalarIVStepsRecipe>(CurRec) || - isa<VPCanonicalIVPHIRecipe>(CurRec)) + isa<VPCanonicalIVPHIRecipe>(CurRec) || + isa<VPActiveLaneMaskPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1503,6 +1512,13 @@ public: /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns true if were tail-folding and want to use the active lane mask + /// for vector loop control flow. + bool useActiveLaneMaskForControlFlow() const { + return FoldTailByMasking && + TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. 
@@ -1551,14 +1567,14 @@ public: Scalars.clear(); } -private: - unsigned NumPredStores = 0; - /// Convenience function that returns the value of vscale_range iff /// vscale_range.min == vscale_range.max or otherwise returns the value /// returned by the corresponding TLI method. Optional<unsigned> getVScaleForTuning() const; +private: + unsigned NumPredStores = 0; + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is @@ -1661,7 +1677,8 @@ private: /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. - SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> + PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at /// least once. This may be needed as a fallback loop in case runtime @@ -1849,14 +1866,17 @@ class GeneratedRTChecks { DominatorTree *DT; LoopInfo *LI; + TargetTransformInfo *TTI; SCEVExpander SCEVExp; SCEVExpander MemCheckExp; + bool CostTooHigh = false; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - const DataLayout &DL) - : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), + TargetTransformInfo *TTI, const DataLayout &DL) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), MemCheckExp(SE, DL, "scev.check") {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can @@ -1867,6 +1887,15 @@ public: void Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + // Hard cutoff to limit compile-time increase in case a very large number of + // runtime checks needs to be generated. + // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to + // profile info. 
+ CostTooHigh = + LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; + if (CostTooHigh) + return; + BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -1938,6 +1967,44 @@ public: } } + InstructionCost getCost() { + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); + + if (CostTooHigh) { + InstructionCost Cost; + Cost.setInvalid(); + LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); + return Cost; + } + + InstructionCost RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost + << "\n"); + + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - Value *Step = createStepForVF(Builder, CountTy, VF, UF); + auto CreateStep = [&]() { + // Create step with max(MinProTripCount, UF * VF). 
+ if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) + return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + return createStepForVF(Builder, CountTy, VF, UF); + }; + if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + CheckMinIters = + Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an @@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); } + // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { MaxVScale = TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + MaxVScale ? 
(MaxSafeElements / MaxVScale.value()) : 0); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable( unsigned EstimatedWidthB = B.Width.getKnownMinValue(); if (Optional<unsigned> VScale = getVScaleForTuning()) { if (A.Width.isScalable()) - EstimatedWidthA *= VScale.getValue(); + EstimatedWidthA *= VScale.value(); if (B.Width.isScalable()) - EstimatedWidthB *= VScale.getValue(); + EstimatedWidthB *= VScale.value(); } // Assume vscale may be larger than 1 (or the value being tuned for), @@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; - return TTI.getRegUsageForType(VectorType::get(Ty, VF)); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // map will indicate that we've analyzed it already. ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + PredicatedBBsAfterVectorization[VF].clear(); + // Find all the instructions that are scalar with predication in the loop and // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. @@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. 
- PredicatedBBsAfterVectorization.insert(BB); + PredicatedBBsAfterVectorization[VF].insert(BB); } } } @@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF.isVector() && BI->isConditional() && - (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { @@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - return (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold && - !Hints.allowReordering()) || - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; -} - Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. 
- return CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) - L->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.value()); else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -void LoopVectorizationPlanner::collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions) { - - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector<BasicBlock*> ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. - // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa<TruncInst>(Op) && Op->hasOneUse()) - DeadInstructions.insert(cast<Instruction>(Op)); - } - } - - // We create new "steps" for induction variable updates to which the original - // induction variables map. 
An original update instruction will be dead if - // all its users except the induction variable are dead. - auto *Latch = OrigLoop->getLoopLatch(); - for (auto &Induction : Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - - // If the tail is to be folded by masking, the primary induction variable, - // if exists, isn't dead: it will be used for masking. Don't kill it. - if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) - continue; - - if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); - })) - DeadInstructions.insert(IndUpdate); - } -} - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } //===--------------------------------------------------------------------===// @@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. 
- assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != PredicationStyle::None) { VPValue *TC = Plan->getOrCreateTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); } else { VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); @@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return RegSucc; } -VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe, - VPlanPtr &Plan) { +VPRegionBlock *VPRecipeBuilder::createReplicateRegion( + Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) { // Instructions marked for predication are replicated and placed under an // if-then construct to prevent side-effects. @@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); auto *PHIRecipe = Instr->getType()->isVoidTy() ? 
nullptr - : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + : new VPPredInstPHIRecipe(PredRecipe); if (PHIRecipe) { Plan->removeVPValueFor(Instr); Plan->addVPValue(Instr, PHIRecipe); @@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. + SmallPtrSet<Instruction *, 4> DeadInstructions; auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); @@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a -// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a -// BranchOnCount VPInstruction to the latch. +// Add the necessary canonical IV and branch recipes required to control the +// loop. 
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW) { + bool HasNUW, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); + // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); + // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar + // IV by VF * UF. auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL); + {CanonicalIVPHI}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + if (UseLaneMaskForLoopControlFlow) { + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {StartV}, DL, "index.part.next"); + Preheader->appendRecipe(CanonicalIVIncrementParts); + + // Create the ActiveLaneMask instruction using the correct start values. 
+ VPValue *TC = Plan.getOrCreateTripCount(); + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.entry"); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + + // Create the active lane mask for the next iteration of the loop. + CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.next"); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + // We have to invert the mask here because a true condition means jumping + // to the exit block. + auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); + EB->appendRecipe(NotMask); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); + EB->appendRecipe(BranchBack); + } else { + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); + } } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking()); + !CM.foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); // Fold Exit block into its predecessor if possible. @@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true); + true, CM.useActiveLaneMaskForControlFlow()); return Plan; } @@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); - WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + // Append the recipe to the end of the VPBasicBlock because we need to + // ensure that it comes after all of it's inputs, including CondOp. + WidenRecipe->getParent()->appendRecipe(RedRecipe); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); @@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { *this, State); } -void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. 
- // Instcombine will make this a no-op. - auto *InvarCond = - InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); - Value *Op0 = State.get(getOperand(1), Part); - Value *Op1 = State.get(getOperand(2), Part); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel, Part); - State.addMetadata(Sel, &I); - } -} - -void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); - auto &Builder = State.Builder; - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) { - VecOp->copyIRFlags(&I); - - // If the instruction is vectorized and was in a basic block that needed - // predication, we can't propagate poison-generating flags (nuw/nsw, - // exact, etc.). 
The control flow has been linearized and the - // instruction is no longer guarded by the predicate, which could make - // the flag properties to no longer hold. - if (State.MayGeneratePoisonRecipes.contains(this)) - VecOp->dropPoisonGeneratingFlags(); - } - - // Use this vector value for all users of the original instruction. - State.set(this, V, Part); - State.addMetadata(V, &I); - } - - break; - } - case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Op = State.get(getOperand(0), Part); - - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze, Part); - } - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(this, C, Part); - State.addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - State.setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = (State.VF.isScalar()) - ? 
CI->getType() - : VectorType::get(CI->getType(), State.VF); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(this, Cast, Part); - State.addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - -void VPWidenGEPRecipe::execute(VPTransformState &State) { - auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. 
- auto *Clone = State.Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); - State.set(this, EntryPart, Part); - State.addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < State.UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? State.get(getOperand(0), VPIteration(0, 0)) - : State.get(getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // If the GEP instruction is vectorized and was in a basic block that - // needed predication, we can't propagate the poison-generating 'inbounds' - // flag. The control flow has been linearized and the GEP is no longer - // guarded by the predicate, which could make the 'inbounds' properties to - // no longer hold. - bool IsInBounds = - GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. 
- auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, - Indices, "", IsInBounds); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP, Part); - State.addMetadata(NewGEP, GEP); - } - } -} - void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); @@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { } } -void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // SELECT(Mask1, In1, - // In0))) - // Note that Mask0 is never used: lanes for which no path reaches this phi and - // are essentially undef are taken from In0. - InnerLoopVectorizer::VectorParts Entry(State.UF); - for (unsigned In = 0; In < NumIncoming; ++In) { - for (unsigned Part = 0; Part < State.UF; ++Part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), Part); - if (In == 0) - Entry[Part] = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. 
- Value *Cond = State.get(getMask(In), Part); - Entry[Part] = - State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); - } - } - } - for (unsigned Part = 0; Part < State.UF; ++Part) - State.set(this, Entry[Part], Part); -} - void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), @@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State); } -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); - - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane.getKnownLane(); - - Value *ConditionBit = nullptr; - VPValue *BlockInMask = getMask(); - if (BlockInMask) { - ConditionBit = State.get(BlockInMask, Part); - if (ConditionBit->getType()->isVectorTy()) - ConditionBit = State.Builder.CreateExtractElement( - ConditionBit, State.Builder.getInt32(Lane)); - } else // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. - auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); -} - void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = @@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { } } +static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, + VectorizationFactor &VF, + Optional<unsigned> VScale, Loop *L, + ScalarEvolution &SE) { + InstructionCost CheckCost = Checks.getCost(); + if (!CheckCost.isValid()) + return false; + + // When interleaving only scalar and vector cost will be equal, which in turn + // would lead to a divide by 0. Fall back to hard threshold. + if (VF.Width.isScalar()) { + if (CheckCost > VectorizeMemoryCheckThreshold) { + LLVM_DEBUG( + dbgs() + << "LV: Interleaving only is not profitable due to runtime checks\n"); + return false; + } + return true; + } + + // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 
+ double ScalarC = *VF.ScalarCost.getValue(); + if (ScalarC == 0) + return true; + + // First, compute the minimum iteration count required so that the vector + // loop outperforms the scalar loop. + // The total cost of the scalar loop is + // ScalarC * TC + // where + // * TC is the actual trip count of the loop. + // * ScalarC is the cost of a single scalar iteration. + // + // The total cost of the vector loop is + // RtC + VecC * (TC / VF) + EpiC + // where + // * RtC is the cost of the generated runtime checks + // * VecC is the cost of a single vector iteration. + // * TC is the actual trip count of the loop + // * VF is the vectorization factor + // * EpiCost is the cost of the generated epilogue, including the cost + // of the remaining scalar operations. + // + // Vectorization is profitable once the total vector cost is less than the + // total scalar cost: + // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC + // + // Now we can compute the minimum required trip count TC as + // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // + // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that + // the computations are performed on doubles, not integers and the result + // is rounded up, hence we get an upper estimate of the TC. + unsigned IntVF = VF.Width.getKnownMinValue(); + if (VF.Width.isScalable()) { + unsigned AssumedMinimumVscale = 1; + if (VScale) + AssumedMinimumVscale = *VScale; + IntVF *= AssumedMinimumVscale; + } + double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; + double RtC = *CheckCost.getValue(); + double MinTC1 = RtC / (ScalarC - VecCOverVF); + + // Second, compute a minimum iteration count so that the cost of the + // runtime checks is only a fraction of the total scalar loop cost. This + // adds a loop-dependent bound on the overhead incurred if the runtime + // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC + // * TC. 
To bound the runtime check to be a fraction 1/X of the scalar + // cost, compute + // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC + double MinTC2 = RtC * 10 / ScalarC; + + // Now pick the larger minimum. If it is not a multiple of VF, choose the + // next closest multiple of VF. This should partly compensate for ignoring + // the epilogue cost. + uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + + LLVM_DEBUG( + dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" + << VF.MinProfitableTripCount << "\n"); + + // Skip vectorization if the expected trip count is less than the minimum + // required trip count. + if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), + VF.MinProfitableTripCount)) { + LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " + "trip count < minimum profitable VF (" + << *ExpectedTC << " < " << VF.MinProfitableTripCount + << ")\n"); + + return false; + } + } + return true; +} + LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || !EnableLoopInterleaving), @@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectElementTypesForWidening(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. 
ElementCount UserVF = Hints.getWidth(); @@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); if (MaybeVF) { - if (LVP.requiresTooManyRuntimeChecks()) { + VF = *MaybeVF; + // Select the interleave count. + IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. + if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + + // Check if it is profitable to vectorize with runtime checks. + bool ForceVectorization = + Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (!ForceVectorization && + !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.emitRemarkWithHints(); return false; } - VF = *MaybeVF; - // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); - - unsigned SelectedIC = std::max(IC, UserIC); - // Optimistically generate runtime checks if they are needed. Drop them if - // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. 
@@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, + PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); @@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); if (RemainderLoopID) { - L->setLoopID(RemainderLoopID.getValue()); + L->setLoopID(RemainderLoopID.value()); } else { if (DisableRuntimeUnroll) AddRuntimeUnrollDisableMetaData(L); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 019a09665a67..e136cd9aedac 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2637,7 +2637,7 @@ private: AliasCacheKey key = std::make_pair(Inst1, Inst2); Optional<bool> &result = AliasCache[key]; if (result) { - return result.getValue(); + return result.value(); } bool aliased = true; if (Loc1.Ptr && isSimple(Inst1)) @@ -4592,7 +4592,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, }; InstructionsState S = getSameOpcode(VL); - if (Depth == RecursionMaxDepth) { + + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of + // a load), in which case peek through to include it in the tree, without + // ballooning over-budget. 
+ if (Depth >= RecursionMaxDepth && + !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp && + VL.size() >= 4 && + (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { + return match(I, + m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && + cast<Instruction>(I)->getOpcode() == + cast<Instruction>(S.MainOp)->getOpcode(); + })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -11217,7 +11229,7 @@ public: return OptimizationRemarkMissed( SV_NAME, "HorSLPNotBeneficial", ReducedValsToOps.find(VL[0])->second.front()) - << "Vectorizing horizontal reduction is possible" + << "Vectorizing horizontal reduction is possible " << "but not beneficial with cost " << ore::NV("Cost", Cost) << " and threshold " << ore::NV("Threshold", -SLPCostThreshold); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 97f2b1a93815..c7949c42c03e 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -159,7 +159,8 @@ public: /// Create a replicating region for instruction \p I that requires /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. 
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, + VPRegionBlock *createReplicateRegion(Instruction *I, + VPReplicateRecipe *PredRecipe, VPlanPtr &Plan); /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4d709097c306..30032dda7f60 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -248,25 +248,27 @@ void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { } void VPTransformState::setDebugLocFromInst(const Value *V) { - if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { - const DILocation *DIL = Inst->getDebugLoc(); + const Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) { + Builder.SetCurrentDebugLocation(DebugLoc()); + return; + } - // When a FSDiscriminator is enabled, we don't need to add the multiply - // factors to the discriminators. - if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && - !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { - // FIXME: For scalable vectors, assume vscale=1. - auto NewDIL = - DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); - if (NewDIL) - Builder.SetCurrentDebugLocation(*NewDIL); - else - LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " - << DIL->getFilename() << " Line: " << DIL->getLine()); - } else - Builder.SetCurrentDebugLocation(DIL); + const DILocation *DIL = Inst->getDebugLoc(); + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. 
+ auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); } else - Builder.SetCurrentDebugLocation(DebugLoc()); + Builder.SetCurrentDebugLocation(DIL); } BasicBlock * @@ -566,6 +568,24 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa<VPActiveLaneMaskPHIRecipe>(&R)) + return cast<VPActiveLaneMaskPHIRecipe>(&R); + } + return nullptr; +} + +static bool canSimplifyBranchOnCond(VPInstruction *Term) { + VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0)); + if (!Not || Not->getOpcode() != VPInstruction::Not) + return false; + + VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0)); + return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask; +} + void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, @@ -573,11 +593,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); - // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when - // preparing to execute the plan for the main vector loop. - if (!IsEpilogueVectorization && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && - isa<ConstantInt>(TripCountV)) { + // Try to simplify the branch condition if TC <= VF * UF when preparing to + // execute the plan for the main vector loop. We only do this if the + // terminator is: + // 1. BranchOnCount, or + // 2. BranchOnCond where the input is Not(ActiveLaneMask). 
+ if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) && + (Term->getOpcode() == VPInstruction::BranchOnCount || + (Term->getOpcode() == VPInstruction::BranchOnCond && + canSimplifyBranchOnCond(Term)))) { ConstantInt *C = cast<ConstantInt>(TripCountV); uint64_t TCVal = C->getZExtValue(); if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { @@ -697,7 +721,8 @@ void VPlan::execute(VPTransformState *State) { // generated. bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || - cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 09da4a545d0d..f009a7ee6b4b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -784,6 +784,10 @@ public: ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, BranchOnCond }; @@ -794,6 +798,9 @@ private: FastMathFlags FMF; DebugLoc DL; + /// An optional name that can be used for the generated IR instruction. + const std::string Name; + /// Utility method serving execute(): generates a single instance of the /// modeled instruction. 
void generateInstruction(VPTransformState &State, unsigned Part); @@ -802,14 +809,15 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), - DL(DL) {} + DL(DL), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +826,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL); + return new VPInstruction(Opcode, Operands, DL, Name); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -897,6 +905,8 @@ public: case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: return true; }; @@ -1125,6 +1135,7 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. 
static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1132,6 +1143,7 @@ public: } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1861,6 +1873,42 @@ public: } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +/// remove VPActiveLaneMaskPHIRecipe. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2656,6 +2704,10 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there + /// be only one at most. If there isn't one, then return nullptr. + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void clearLiveOuts() { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 92422b17457c..fdd901a4a70d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -26,13 +26,19 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> using namespace llvm; +using VectorParts = SmallVector<Value *, 2>; + extern cl::opt<bool> EnableVPlanNativePath; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { @@ -186,7 +192,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + Value *V = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); State.set(this, V, Part); return; } @@ -194,14 +201,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, switch (getOpcode()) { case VPInstruction::Not: { Value *A = 
State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); + Value *V = Builder.CreateNot(A, Name); State.set(this, V, Part); break; } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); + Value *V = Builder.CreateICmpULE(IV, TC, Name); State.set(this, V, Part); break; } @@ -209,7 +216,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); + Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); State.set(this, V, Part); break; } @@ -223,7 +230,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + {VIVElem0, ScalarTC}, nullptr, Name); State.set(this, Call, Part); break; } @@ -247,7 +254,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, PartMinus1, Part); } else { Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), + Part); } break; } @@ -261,7 +269,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, // elements) times the unroll factor (num of SIMD instructions). 
Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); } @@ -269,6 +277,23 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, Next, Part); break; } + + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + if (Part == 0) { + State.set(this, IV, Part); + break; + } + + // The canonical IV is incremented by the vectorization factor (num of SIMD + // elements) times the unroll part. + Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); + State.set(this, Next, Part); + break; + } case VPInstruction::BranchOnCond: { if (Part != 0) break; @@ -370,6 +395,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; @@ -431,7 +462,158 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(2)->printAsOperand(O, SlotTracker); O << (InvariantCond ? " (condition is loop invariant)" : ""); } +#endif + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. 
+ // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.addMetadata(Sel, &I); + } +} + +void VPWidenRecipe::execute(VPTransformState &State) { + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). 
The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.contains(this)) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.addMetadata(V, &I); + } + + break; + } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; @@ -487,7 +669,82 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "= SCALAR-STEPS "; printOperands(O, SlotTracker); } +#endif + +void VPWidenGEPRecipe::execute(VPTransformState &State) { + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. 
We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. 
Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.addMetadata(NewGEP, GEP); + } + } +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -501,7 +758,48 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, O << " = getelementptr "; printOperands(O, SlotTracker); } +#endif + +void VPBlendRecipe::execute(VPTransformState &State) { + State.setDebugLocFromInst(Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. + VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; ++In) { + for (unsigned Part = 0; Part < State.UF; ++Part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = State.get(getIncomingValue(In), Part); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. 
+ Value *Cond = State.get(getMask(In), Part); + Entry[Part] = + State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); + } + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.set(this, Entry[Part], Part); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; @@ -566,7 +864,35 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, if (AlsoPack) O << " (S->V)"; } +#endif + +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane.getKnownLane(); + + Value *ConditionBit = nullptr; + VPValue *BlockInMask = getMask(); + if (BlockInMask) { + ConditionBit = State.get(BlockInMask, Part); + if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. 
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; @@ -838,3 +1164,28 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +// remove VPActiveLaneMaskPHIRecipe. +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "ACTIVE-LANE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5fc676834331..c99fae1b2ab4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ public: // Phi-like VPValues. Need to be kept together. 
VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ public: // Phi-like recipes. Need to be kept together. VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index f917883145c0..3501de6ab38e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -133,32 +133,48 @@ void VPlanVerifier::verifyHierarchicalCFG( verifyRegionRec(TopRegion); } -bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); - for (const VPBasicBlock *VPBB : - VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { - // Verify that phi-like recipes are at the beginning of the block, with no - // other recipes in between. - auto RecipeI = VPBB->begin(); - auto End = VPBB->end(); - while (RecipeI != End && RecipeI->isPhi()) - RecipeI++; +static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) { + // Verify that phi-like recipes are at the beginning of the block, with no + // other recipes in between. 
+ auto RecipeI = VPBB->begin(); + auto End = VPBB->end(); + unsigned NumActiveLaneMaskPhiRecipes = 0; + while (RecipeI != End && RecipeI->isPhi()) { + if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI)) + NumActiveLaneMaskPhiRecipes++; + RecipeI++; + } - while (RecipeI != End) { - if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { - errs() << "Found phi-like recipe after non-phi recipe"; + if (NumActiveLaneMaskPhiRecipes > 1) { + errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe"; + return false; + } + + while (RecipeI != End) { + if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { + errs() << "Found phi-like recipe after non-phi recipe"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - errs() << ": "; - RecipeI->dump(); - errs() << "after\n"; - std::prev(RecipeI)->dump(); + errs() << ": "; + RecipeI->dump(); + errs() << "after\n"; + std::prev(RecipeI)->dump(); #endif - return false; - } - RecipeI++; + return false; } + RecipeI++; + } + + return true; +} + +bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); + for (const VPBasicBlock *VPBB : + VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { + if (!verifyVPBasicBlock(VPBB)) + return false; } const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); @@ -181,15 +197,16 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount " - "VPInstruction but is empty\n"; + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; return false; } auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || (LastInst->getOpcode() != 
VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 90598937affc..d12624ffb824 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -414,6 +414,10 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilder<> &Builder) { + // Shufflevectors can only be created for fixed-width vectors. + if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) + return nullptr; + // If the extract can be constant-folded, this code is unsimplified. Defer // to other passes to handle that. Value *X = ExtElt->getVectorOperand(); @@ -1249,14 +1253,20 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || VT != Op0->getType()) return false; - auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0)); - auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1)); - auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0)); - auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1)); + auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); + auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); + auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); + auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); + SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); auto checkSVNonOpUses = [&](Instruction *I) { if (!I || I->getOperand(0)->getType() != VT) return true; - return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + return any_of(I->users(), [&](User *U) { + return U != Op0 && U 
!= Op1 && + !(isa<ShuffleVectorInst>(U) && + (InputShuffles.contains(cast<Instruction>(U)) || + isInstructionTriviallyDead(cast<Instruction>(U)))); + }); }; if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) @@ -1271,6 +1281,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SV = dyn_cast<ShuffleVectorInst>(U); if (!SV || SV->getType() != VT) return false; + if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || + (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) + return false; if (!llvm::is_contained(Shuffles, SV)) Shuffles.push_back(SV); } @@ -1283,13 +1296,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (FromReduction && Shuffles.size() > 1) return false; + // Add any shuffle uses for the shuffles we have found, to include them in our + // cost calculations. + if (!FromReduction) { + for (ShuffleVectorInst *SV : Shuffles) { + for (auto U : SV->users()) { + ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); + if (SSV && isa<UndefValue>(SSV->getOperand(1))) + Shuffles.push_back(SSV); + } + } + } + // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the // end. If the binops are legalized to smaller vectors, this may reduce total // number of binops. We compute the ReconstructMask mask needed to convert // back to the original lane order. - SmallVector<int> V1, V2; - SmallVector<SmallVector<int>> ReconstructMasks; + SmallVector<std::pair<int, int>> V1, V2; + SmallVector<SmallVector<int>> OrigReconstructMasks; int MaxV1Elt = 0, MaxV2Elt = 0; unsigned NumElts = VT->getNumElements(); for (ShuffleVectorInst *SVN : Shuffles) { @@ -1300,6 +1325,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // case we need to commute the mask). 
Value *SVOp0 = SVN->getOperand(0); Value *SVOp1 = SVN->getOperand(1); + if (isa<UndefValue>(SVOp1)) { + auto *SSV = cast<ShuffleVectorInst>(SVOp0); + SVOp0 = SSV->getOperand(0); + SVOp1 = SSV->getOperand(1); + for (unsigned I = 0, E = Mask.size(); I != E; I++) { + if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) + return false; + Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); + } + } if (SVOp0 == Op1 && SVOp1 == Op0) { std::swap(SVOp0, SVOp1); ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); @@ -1316,21 +1351,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { ReconstructMask.push_back(-1); } else if (Mask[I] < static_cast<int>(NumElts)) { MaxV1Elt = std::max(MaxV1Elt, Mask[I]); - auto It = find(V1, Mask[I]); + auto It = find_if(V1, [&](const std::pair<int, int> &A) { + return Mask[I] == A.first; + }); if (It != V1.end()) ReconstructMask.push_back(It - V1.begin()); else { ReconstructMask.push_back(V1.size()); - V1.push_back(Mask[I]); + V1.emplace_back(Mask[I], V1.size()); } } else { MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); - auto It = find(V2, Mask[I] - NumElts); + auto It = find_if(V2, [&](const std::pair<int, int> &A) { + return Mask[I] - static_cast<int>(NumElts) == A.first; + }); if (It != V2.end()) ReconstructMask.push_back(NumElts + It - V2.begin()); else { ReconstructMask.push_back(NumElts + V2.size()); - V2.push_back(Mask[I] - NumElts); + V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); } } } @@ -1339,7 +1378,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // result. In-order can help simplify the shuffle away. 
if (FromReduction) sort(ReconstructMask); - ReconstructMasks.push_back(ReconstructMask); + OrigReconstructMasks.push_back(std::move(ReconstructMask)); } // If the Maximum element used from V1 and V2 are not larger than the new @@ -1351,16 +1390,68 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { MaxV2Elt == static_cast<int>(V2.size()) - 1)) return false; + // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a + // shuffle of another shuffle, or not a shuffle (that is treated like a + // identity shuffle). + auto GetBaseMaskValue = [&](Instruction *I, int M) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return M; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getMaskValue(SV->getMaskValue(M)); + return SV->getMaskValue(M); + }; + + // Attempt to sort the inputs my ascending mask values to make simpler input + // shuffles and push complex shuffles down to the uses. We sort on the first + // of the two input shuffle orders, to try and get at least one input into a + // nice order. + auto SortBase = [&](Instruction *A, std::pair<int, int> X, + std::pair<int, int> Y) { + int MXA = GetBaseMaskValue(A, X.first); + int MYA = GetBaseMaskValue(A, Y.first); + return MXA < MYA; + }; + stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI0A, A, B); + }); + stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI1A, A, B); + }); + // Calculate our ReconstructMasks from the OrigReconstructMasks and the + // modified order of the input shuffles. 
+ SmallVector<SmallVector<int>> ReconstructMasks; + for (auto Mask : OrigReconstructMasks) { + SmallVector<int> ReconstructMask; + for (int M : Mask) { + auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { + auto It = find_if(V, [M](auto A) { return A.second == M; }); + assert(It != V.end() && "Expected all entries in Mask"); + return std::distance(V.begin(), It); + }; + if (M < 0) + ReconstructMask.push_back(-1); + else if (M < static_cast<int>(NumElts)) { + ReconstructMask.push_back(FindIndex(V1, M)); + } else { + ReconstructMask.push_back(NumElts + FindIndex(V2, M)); + } + } + ReconstructMasks.push_back(std::move(ReconstructMask)); + } + // Calculate the masks needed for the new input shuffles, which get padded // with undef SmallVector<int> V1A, V1B, V2A, V2B; for (unsigned I = 0; I < V1.size(); I++) { - V1A.push_back(SVI0A->getMaskValue(V1[I])); - V1B.push_back(SVI0B->getMaskValue(V1[I])); + V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); + V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); } for (unsigned I = 0; I < V2.size(); I++) { - V2A.push_back(SVI1A->getMaskValue(V2[I])); - V2B.push_back(SVI1B->getMaskValue(V2[I])); + V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); + V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); } while (V1A.size() < NumElts) { V1A.push_back(UndefMaskElem); @@ -1371,9 +1462,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(UndefMaskElem); } - auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { - return C + - TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) + ? 
TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + VT, SV->getShuffleMask()); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); @@ -1386,9 +1482,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - // This set helps us only cost each unique shuffle once. - SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles( - {SVI0A, SVI0B, SVI1A, SVI1B}); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), InstructionCost(0), AddShuffleCost); @@ -1408,22 +1501,35 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), InstructionCost(0), AddShuffleMaskCost); + LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); + LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore + << " vs CostAfter: " << CostAfter << "\n"); if (CostBefore <= CostAfter) return false; // The cost model has passed, create the new instructions. 
- Builder.SetInsertPoint(SVI0A); - Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), - SVI0A->getOperand(1), V1A); - Builder.SetInsertPoint(SVI0B); - Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), - SVI0B->getOperand(1), V1B); - Builder.SetInsertPoint(SVI1A); - Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), - SVI1A->getOperand(1), V2A); - Builder.SetInsertPoint(SVI1B); - Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), - SVI1B->getOperand(1), V2B); + auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return I; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getOperand(Op); + return SV->getOperand(Op); + }; + Builder.SetInsertPoint(SVI0A->getNextNode()); + Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), + GetShuffleOperand(SVI0A, 1), V1A); + Builder.SetInsertPoint(SVI0B->getNextNode()); + Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), + GetShuffleOperand(SVI0B, 1), V1B); + Builder.SetInsertPoint(SVI1A->getNextNode()); + Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), + GetShuffleOperand(SVI1A, 1), V2A); + Builder.SetInsertPoint(SVI1B->getNextNode()); + Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), + GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), NSV0A, NSV0B); |
