Diffstat (limited to 'contrib/llvm-project/llvm/lib'): 360 files changed, 8762 insertions, 4120 deletions
diff --git a/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 33fdc8b628c5..856d7e90acb2 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -104,12 +104,67 @@ static const uint32_t LBH_NONTAKEN_WEIGHT = 4; /// All reachable probability will proportionally share the remaining part. static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1); +/// Heuristics and lookup tables for non-loop branches: +/// Pointer Heuristics (PH) static const uint32_t PH_TAKEN_WEIGHT = 20; static const uint32_t PH_NONTAKEN_WEIGHT = 12; +static const BranchProbability + PtrTakenProb(PH_TAKEN_WEIGHT, PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); +static const BranchProbability + PtrUntakenProb(PH_NONTAKEN_WEIGHT, PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); + +using ProbabilityList = SmallVector<BranchProbability>; +using ProbabilityTable = std::map<CmpInst::Predicate, ProbabilityList>; + +/// Pointer comparisons: +static const ProbabilityTable PointerTable{ + {ICmpInst::ICMP_NE, {PtrTakenProb, PtrUntakenProb}}, /// p != q -> Likely + {ICmpInst::ICMP_EQ, {PtrUntakenProb, PtrTakenProb}}, /// p == q -> Unlikely +}; +/// Zero Heuristics (ZH) static const uint32_t ZH_TAKEN_WEIGHT = 20; static const uint32_t ZH_NONTAKEN_WEIGHT = 12; +static const BranchProbability + ZeroTakenProb(ZH_TAKEN_WEIGHT, ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); +static const BranchProbability + ZeroUntakenProb(ZH_NONTAKEN_WEIGHT, ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); + +/// Integer compares with 0: +static const ProbabilityTable ICmpWithZeroTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, /// X == 0 -> Unlikely + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, /// X != 0 -> Likely + {CmpInst::ICMP_SLT, {ZeroUntakenProb, ZeroTakenProb}}, /// X < 0 -> Unlikely + {CmpInst::ICMP_SGT, {ZeroTakenProb, ZeroUntakenProb}}, /// X > 0 -> Likely +}; + +/// Integer compares with -1: +static const ProbabilityTable ICmpWithMinusOneTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, /// X == -1 -> Unlikely + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, /// X != -1 -> Likely + // InstCombine canonicalizes X >= 0 into X > -1 + {CmpInst::ICMP_SGT, {ZeroTakenProb, ZeroUntakenProb}}, /// X >= 0 -> Likely +}; + +/// Integer compares with 1: +static const ProbabilityTable ICmpWithOneTable{ + // InstCombine canonicalizes X <= 0 into X < 1 + {CmpInst::ICMP_SLT, {ZeroUntakenProb, ZeroTakenProb}}, /// X <= 0 -> Unlikely +}; + +/// strcmp and similar functions return zero, negative, or positive, if the +/// first string is equal, less, or greater than the second. We consider it +/// likely that the strings are not equal, so a comparison with zero is +/// probably false, but also a comparison with any other number is also +/// probably false given that what exactly is returned for nonzero values is +/// not specified. Any kind of comparison other than equality we know +/// nothing about. +static const ProbabilityTable ICmpWithLibCallTable{ + {CmpInst::ICMP_EQ, {ZeroUntakenProb, ZeroTakenProb}}, + {CmpInst::ICMP_NE, {ZeroTakenProb, ZeroUntakenProb}}, +}; +// Floating-Point Heuristics (FPH) static const uint32_t FPH_TAKEN_WEIGHT = 20; static const uint32_t FPH_NONTAKEN_WEIGHT = 12; @@ -120,6 +175,21 @@ static const uint32_t FPH_ORD_WEIGHT = 1024 * 1024 - 1; /// exceptional case, so the result is unlikely. 
static const uint32_t FPH_UNO_WEIGHT = 1; +static const BranchProbability FPOrdTakenProb(FPH_ORD_WEIGHT, + FPH_ORD_WEIGHT + FPH_UNO_WEIGHT); +static const BranchProbability + FPOrdUntakenProb(FPH_UNO_WEIGHT, FPH_ORD_WEIGHT + FPH_UNO_WEIGHT); +static const BranchProbability + FPTakenProb(FPH_TAKEN_WEIGHT, FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); +static const BranchProbability + FPUntakenProb(FPH_NONTAKEN_WEIGHT, FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); + +/// Floating-Point compares: +static const ProbabilityTable FCmpTable{ + {FCmpInst::FCMP_ORD, {FPOrdTakenProb, FPOrdUntakenProb}}, /// !isnan -> Likely + {FCmpInst::FCMP_UNO, {FPOrdUntakenProb, FPOrdTakenProb}}, /// isnan -> Unlikely +}; + /// Set of dedicated "absolute" execution weights for a block. These weights are /// meaningful relative to each other and their derivatives only. enum class BlockExecWeight : std::uint32_t { @@ -468,21 +538,10 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) { assert(CI->getOperand(1)->getType()->isPointerTy()); - BranchProbability TakenProb(PH_TAKEN_WEIGHT, - PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); - BranchProbability UntakenProb(PH_NONTAKEN_WEIGHT, - PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); - - // p != 0 -> isProb = true - // p == 0 -> isProb = false - // p != q -> isProb = true - // p == q -> isProb = false; - bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE; - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + auto Search = PointerTable.find(CI->getPredicate()); + if (Search == PointerTable.end()) + return false; + setEdgeProbability(BB, Search->second); return true; } @@ -949,86 +1008,33 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB, if (Function *CalledFn = Call->getCalledFunction()) TLI->getLibFunc(*CalledFn, Func); - bool isProb; + ProbabilityTable::const_iterator Search; if (Func == LibFunc_strcasecmp || Func == LibFunc_strcmp || Func == LibFunc_strncasecmp || Func == LibFunc_strncmp || Func == LibFunc_memcmp || Func == LibFunc_bcmp) { - // strcmp and similar functions return zero, negative, or positive, if the - // first string is equal, less, or greater than the second. We consider it - // likely that the strings are not equal, so a comparison with zero is - // probably false, but also a comparison with any other number is also - // probably false given that what exactly is returned for nonzero values is - // not specified. Any kind of comparison other than equality we know - // nothing about. 
- switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - isProb = false; - break; - case CmpInst::ICMP_NE: - isProb = true; - break; - default: + Search = ICmpWithLibCallTable.find(CI->getPredicate()); + if (Search == ICmpWithLibCallTable.end()) return false; - } } else if (CV->isZero()) { - switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - // X == 0 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_NE: - // X != 0 -> Likely - isProb = true; - break; - case CmpInst::ICMP_SLT: - // X < 0 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_SGT: - // X > 0 -> Likely - isProb = true; - break; - default: + Search = ICmpWithZeroTable.find(CI->getPredicate()); + if (Search == ICmpWithZeroTable.end()) + return false; + } else if (CV->isOne()) { + Search = ICmpWithOneTable.find(CI->getPredicate()); + if (Search == ICmpWithOneTable.end()) return false; - } - } else if (CV->isOne() && CI->getPredicate() == CmpInst::ICMP_SLT) { - // InstCombine canonicalizes X <= 0 into X < 1. - // X <= 0 -> Unlikely - isProb = false; } else if (CV->isMinusOne()) { - switch (CI->getPredicate()) { - case CmpInst::ICMP_EQ: - // X == -1 -> Unlikely - isProb = false; - break; - case CmpInst::ICMP_NE: - // X != -1 -> Likely - isProb = true; - break; - case CmpInst::ICMP_SGT: - // InstCombine canonicalizes X >= 0 into X > -1. - // X >= 0 -> Likely - isProb = true; - break; - default: + Search = ICmpWithMinusOneTable.find(CI->getPredicate()); + if (Search == ICmpWithMinusOneTable.end()) return false; - } } else { return false; } - BranchProbability TakenProb(ZH_TAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); - BranchProbability UntakenProb(ZH_NONTAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + setEdgeProbability(BB, Search->second); return true; } @@ -1042,34 +1048,21 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) { if (!FCmp) return false; - uint32_t TakenWeight = FPH_TAKEN_WEIGHT; - uint32_t NontakenWeight = FPH_NONTAKEN_WEIGHT; - bool isProb; + ProbabilityList ProbList; if (FCmp->isEquality()) { - // f1 == f2 -> Unlikely - // f1 != f2 -> Likely - isProb = !FCmp->isTrueWhenEqual(); - } else if (FCmp->getPredicate() == FCmpInst::FCMP_ORD) { - // !isnan -> Likely - isProb = true; - TakenWeight = FPH_ORD_WEIGHT; - NontakenWeight = FPH_UNO_WEIGHT; - } else if (FCmp->getPredicate() == FCmpInst::FCMP_UNO) { - // isnan -> Unlikely - isProb = false; - TakenWeight = FPH_ORD_WEIGHT; - NontakenWeight = FPH_UNO_WEIGHT; + ProbList = !FCmp->isTrueWhenEqual() ? 
+ // f1 == f2 -> Unlikely + ProbabilityList({FPTakenProb, FPUntakenProb}) : + // f1 != f2 -> Likely + ProbabilityList({FPUntakenProb, FPTakenProb}); } else { - return false; + auto Search = FCmpTable.find(FCmp->getPredicate()); + if (Search == FCmpTable.end()) + return false; + ProbList = Search->second; } - BranchProbability TakenProb(TakenWeight, TakenWeight + NontakenWeight); - BranchProbability UntakenProb(NontakenWeight, TakenWeight + NontakenWeight); - if (!isProb) - std::swap(TakenProb, UntakenProb); - - setEdgeProbability( - BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb})); + setEdgeProbability(BB, ProbList); return true; } diff --git a/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp index 3634526370f5..7426d0c07592 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/DivergenceAnalysis.cpp @@ -24,12 +24,12 @@ // divergent can help the compiler to selectively run these optimizations. // // This implementation is derived from the Vectorization Analysis of the -// Region Vectorizer (RV). That implementation in turn is based on the approach -// described in +// Region Vectorizer (RV). The analysis is based on the approach described in // -// Improving Performance of OpenCL on CPUs -// Ralf Karrenberg and Sebastian Hack -// CC '12 +// An abstract interpretation for SPMD divergence +// on reducible control flow graphs. +// Julian Rosemann, Simon Moll and Sebastian Hack +// POPL '21 // // This implementation is generic in the sense that it does // not itself identify original sources of divergence. diff --git a/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index f22c6aa04f5e..2ec6cbeabda2 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -820,7 +820,7 @@ void IRSimilarityIdentifier::populateMapper( /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from /// the IRInstructionData in subsequence. /// -/// \param [in] Mapper - The instruction mapper for sanity checks. +/// \param [in] Mapper - The instruction mapper for basic correctness checks. /// \param [in] InstrList - The vector that holds the instruction data. /// \param [in] IntegerMapping - The vector that holds the mapped integers. /// \param [out] CandsForRepSubstring - The vector to store the generated diff --git a/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp b/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp index c4b7239b43ab..cfe910df4e91 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IVDescriptors.cpp @@ -81,6 +81,7 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurKind Kind) { case RecurKind::Mul: case RecurKind::FAdd: case RecurKind::FMul: + case RecurKind::FMulAdd: return true; } return false; @@ -194,21 +195,28 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit, // vectorizing floating point operations without unsafe math. static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst, Instruction *Exit, PHINode *Phi) { - // Currently only FAdd is supported - if (Kind != RecurKind::FAdd) + // Currently only FAdd and FMulAdd are supported. 
+ if (Kind != RecurKind::FAdd && Kind != RecurKind::FMulAdd) return false; - // Ensure the exit instruction is an FAdd, and that it only has one user - // other than the reduction PHI - if (Exit->getOpcode() != Instruction::FAdd || Exit->hasNUsesOrMore(3) || - Exit != ExactFPMathInst) + if (Kind == RecurKind::FAdd && Exit->getOpcode() != Instruction::FAdd) + return false; + + if (Kind == RecurKind::FMulAdd && + !RecurrenceDescriptor::isFMulAddIntrinsic(Exit)) + return false; + + // Ensure the exit instruction has only one user other than the reduction PHI + if (Exit != ExactFPMathInst || Exit->hasNUsesOrMore(3)) return false; // The only pattern accepted is the one in which the reduction PHI // is used as one of the operands of the exit instruction - auto *LHS = Exit->getOperand(0); - auto *RHS = Exit->getOperand(1); - if (LHS != Phi && RHS != Phi) + auto *Op0 = Exit->getOperand(0); + auto *Op1 = Exit->getOperand(1); + if (Kind == RecurKind::FAdd && Op0 != Phi && Op1 != Phi) + return false; + if (Kind == RecurKind::FMulAdd && Exit->getOperand(2) != Phi) return false; LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi @@ -389,6 +397,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind, for (User *U : Cur->users()) { Instruction *UI = cast<Instruction>(U); + // If the user is a call to llvm.fmuladd then the instruction can only be + // the final operand. + if (isFMulAddIntrinsic(UI)) + if (Cur == UI->getOperand(0) || Cur == UI->getOperand(1)) + return false; + // Check if we found the exit user. BasicBlock *Parent = UI->getParent(); if (!TheLoop->contains(Parent)) { @@ -710,6 +724,9 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi, I->hasNoSignedZeros())) && isFPMinMaxRecurrenceKind(Kind))) return isMinMaxPattern(I, Kind, Prev); + else if (isFMulAddIntrinsic(I)) + return InstDesc(Kind == RecurKind::FMulAdd, I, + I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); } } @@ -804,6 +821,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << " PHI." << *Phi << "\n"); return true; } + if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, + DT)) { + LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -927,6 +949,7 @@ Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, case RecurKind::FMul: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); + case RecurKind::FMulAdd: case RecurKind::FAdd: // Adding zero to a number does not change it. // FIXME: Ideally we should not need to check FMF for FAdd and should always @@ -974,6 +997,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { return Instruction::Xor; case RecurKind::FMul: return Instruction::FMul; + case RecurKind::FMulAdd: case RecurKind::FAdd: return Instruction::FAdd; case RecurKind::SMax: @@ -1032,6 +1056,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { return SelectPatternResult::isMinOrMax( matchSelectPattern(Cur, LHS, RHS).Flavor); } + // Recognize a call to the llvm.fmuladd intrinsic. 
+ if (isFMulAddIntrinsic(Cur)) + return true; + return Cur->getOpcode() == RedOp; }; diff --git a/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp index 864eeea4f8bf..22d2ce11cc90 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2180,6 +2180,55 @@ Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); } +static Value *simplifyOrLogic(Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Expected same type for 'or' ops"); + Type *Ty = X->getType(); + + // X | ~X --> -1 + if (match(Y, m_Not(m_Specific(X)))) + return ConstantInt::getAllOnesValue(Ty); + + // X | ~(X & ?) = -1 + if (match(Y, m_Not(m_c_And(m_Specific(X), m_Value())))) + return ConstantInt::getAllOnesValue(Ty); + + // X | (X & ?) --> X + if (match(Y, m_c_And(m_Specific(X), m_Value()))) + return X; + + Value *A, *B; + + // (A & ~B) | (A ^ B) --> A ^ B + // (~B & A) | (A ^ B) --> A ^ B + // (A & ~B) | (B ^ A) --> B ^ A + // (~B & A) | (B ^ A) --> B ^ A + if (match(X, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Y, m_c_Xor(m_Specific(A), m_Specific(B)))) + return Y; + + // (~A ^ B) | (A & B) --> ~A ^ B + // (B ^ ~A) | (A & B) --> B ^ ~A + // (~A ^ B) | (B & A) --> ~A ^ B + // (B ^ ~A) | (B & A) --> B ^ ~A + if (match(X, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return X; + + // (A ^ B) | (A | B) --> A | B + // (A ^ B) | (B | A) --> B | A + if (match(X, m_Xor(m_Value(A), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return Y; + + // ~(A ^ B) | (A | B) --> -1 + // ~(A ^ B) | (B | A) --> -1 + if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return ConstantInt::getAllOnesValue(Ty); + + return nullptr; +} + /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, @@ -2202,81 +2251,15 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Op0 == Op1 || match(Op1, m_Zero())) return Op0; - // A | ~A = ~A | A = -1 - if (match(Op0, m_Not(m_Specific(Op1))) || - match(Op1, m_Not(m_Specific(Op0)))) - return Constant::getAllOnesValue(Op0->getType()); - - // (A & ?) | A = A - if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) - return Op1; - - // A | (A & ?) = A - if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) - return Op0; - - // ~(A & ?) | A = -1 - if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) - return Constant::getAllOnesValue(Op1->getType()); - - // A | ~(A & ?) = -1 - if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value())))) - return Constant::getAllOnesValue(Op0->getType()); + if (Value *R = simplifyOrLogic(Op0, Op1)) + return R; + if (Value *R = simplifyOrLogic(Op1, Op0)) + return R; if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Or)) return V; Value *A, *B, *NotA; - // (A & ~B) | (A ^ B) -> (A ^ B) - // (~B & A) | (A ^ B) -> (A ^ B) - // (A & ~B) | (B ^ A) -> (B ^ A) - // (~B & A) | (B ^ A) -> (B ^ A) - if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && - (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) - return Op1; - - // Commute the 'or' operands. 
- // (A ^ B) | (A & ~B) -> (A ^ B) - // (A ^ B) | (~B & A) -> (A ^ B) - // (B ^ A) | (A & ~B) -> (B ^ A) - // (B ^ A) | (~B & A) -> (B ^ A) - if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && - (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) - return Op0; - - // (A & B) | (~A ^ B) -> (~A ^ B) - // (B & A) | (~A ^ B) -> (~A ^ B) - // (A & B) | (B ^ ~A) -> (B ^ ~A) - // (B & A) | (B ^ ~A) -> (B ^ ~A) - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) - return Op1; - - // Commute the 'or' operands. - // (~A ^ B) | (A & B) -> (~A ^ B) - // (~A ^ B) | (B & A) -> (~A ^ B) - // (B ^ ~A) | (A & B) -> (B ^ ~A) - // (B ^ ~A) | (B & A) -> (B ^ ~A) - if (match(Op1, m_And(m_Value(A), m_Value(B))) && - (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || - match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) - return Op0; - - // (A | B) | (A ^ B) --> A | B - // (B | A) | (A ^ B) --> B | A - if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && - match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) - return Op0; - - // Commute the outer 'or' operands. - // (A ^ B) | (A | B) --> A | B - // (A ^ B) | (B | A) --> B | A - if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && - match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) - return Op1; // (~A & B) | ~(A | B) --> ~A // (~A & B) | ~(B | A) --> ~A @@ -2414,6 +2397,30 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); + auto foldAndOrNot = [](Value *X, Value *Y) -> Value * { + Value *A, *B; + // (~A & B) ^ (A | B) --> A -- There are 8 commuted variants. + if (match(X, m_c_And(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return A; + + // (~A | B) ^ (A & B) --> ~A -- There are 8 commuted variants. + // The 'not' op must contain a complete -1 operand (no undef elements for + // vector) for the transform to be safe. + Value *NotA; + if (match(X, + m_c_Or(m_CombineAnd(m_NotForbidUndef(m_Value(A)), m_Value(NotA)), + m_Value(B))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return NotA; + + return nullptr; + }; + if (Value *R = foldAndOrNot(Op0, Op1)) + return R; + if (Value *R = foldAndOrNot(Op1, Op0)) + return R; + if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Xor)) return V; @@ -2935,8 +2942,10 @@ static Value *simplifyICmpWithBinOpOnLHS( return getFalse(ITy); } - // x >> y <=u x - // x udiv y <=u x. + // x >>u y <=u x --> true. + // x >>u y >u x --> false. + // x udiv y <=u x --> true. + // x udiv y >u x --> false. if (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || match(LBO, m_UDiv(m_Specific(RHS), m_Value()))) { // icmp pred (X op Y), X @@ -2946,6 +2955,37 @@ static Value *simplifyICmpWithBinOpOnLHS( return getTrue(ITy); } + // If x is nonzero: + // x >>u C <u x --> true for C != 0. + // x >>u C != x --> true for C != 0. + // x >>u C >=u x --> false for C != 0. + // x >>u C == x --> false for C != 0. + // x udiv C <u x --> true for C != 1. + // x udiv C != x --> true for C != 1. + // x udiv C >=u x --> false for C != 1. + // x udiv C == x --> false for C != 1. 
+ // TODO: allow non-constant shift amount/divisor + const APInt *C; + if ((match(LBO, m_LShr(m_Specific(RHS), m_APInt(C))) && *C != 0) || + (match(LBO, m_UDiv(m_Specific(RHS), m_APInt(C))) && *C != 1)) { + if (isKnownNonZero(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) { + switch (Pred) { + default: + break; + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_UGE: + return getFalse(ITy); + case ICmpInst::ICMP_NE: + case ICmpInst::ICMP_ULT: + return getTrue(ITy); + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_ULE: + // UGT/ULE are handled by the more general case just above + llvm_unreachable("Unexpected UGT/ULE, should have been handled"); + } + } + } + // (x*C1)/C2 <= x for C1 <= C2. // This holds even if the multiplication overflows: Assume that x != 0 and // arithmetic is modulo M. For overflow to occur we must have C1 >= M/x and diff --git a/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp b/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp index 23ff4fd6f85e..d9620fd405bc 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/IntervalPartition.cpp @@ -36,16 +36,16 @@ INITIALIZE_PASS(IntervalPartition, "intervals", // releaseMemory - Reset state back to before function was analyzed void IntervalPartition::releaseMemory() { - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - delete Intervals[i]; + for (Interval *I : Intervals) + delete I; IntervalMap.clear(); Intervals.clear(); RootInterval = nullptr; } void IntervalPartition::print(raw_ostream &O, const Module*) const { - for(unsigned i = 0, e = Intervals.size(); i != e; ++i) - Intervals[i]->print(O); + for (const Interval *I : Intervals) + I->print(O); } // addIntervalToPartition - Add an interval to the internal list of intervals, @@ -87,8 +87,8 @@ bool IntervalPartition::runOnFunction(Function &F) { // Now that we know all of the successor information, propagate this to the // predecessors for each block. - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - updatePredecessors(Intervals[i]); + for (Interval *I : Intervals) + updatePredecessors(I); return false; } @@ -113,6 +113,6 @@ IntervalPartition::IntervalPartition(IntervalPartition &IP, bool) // Now that we know all of the successor information, propagate this to the // predecessors for each block. - for (unsigned i = 0, e = Intervals.size(); i != e; ++i) - updatePredecessors(Intervals[i]); + for (Interval *I : Intervals) + updatePredecessors(I); } diff --git a/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp index 50fa169c2081..5b5d48bf6fe5 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1095,7 +1095,8 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, if (!Ty->isIntegerTy()) return ValueLatticeElement::getOverdefined(); - APInt Offset(Ty->getScalarSizeInBits(), 0); + unsigned BitWidth = Ty->getScalarSizeInBits(); + APInt Offset(BitWidth, 0); if (matchICmpOperand(Offset, LHS, Val, EdgePred)) return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset); @@ -1118,13 +1119,23 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI, // If (Val & Mask) != 0 then the value must be larger than the lowest set // bit of Mask. 
if (EdgePred == ICmpInst::ICMP_NE && !Mask->isZero() && C->isZero()) { - unsigned BitWidth = Ty->getIntegerBitWidth(); return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( APInt::getOneBitSet(BitWidth, Mask->countTrailingZeros()), APInt::getZero(BitWidth))); } } + // If (X urem Modulus) >= C, then X >= C. + // TODO: An upper bound could be computed as well. + if (match(LHS, m_URem(m_Specific(Val), m_Value())) && + match(RHS, m_APInt(C))) { + // Use the icmp region so we don't have to deal with different predicates. + ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C); + if (!CR.isEmptySet()) + return ValueLatticeElement::getRange(ConstantRange::getNonEmpty( + CR.getUnsignedMin(), APInt(BitWidth, 0))); + } + return ValueLatticeElement::getOverdefined(); } diff --git a/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f9bd7167317f..19a24ac6a484 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -666,6 +666,29 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, return false; } +static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, + function_ref<void(Value *)> AddPointer) { + SmallPtrSet<Value *, 8> Visited; + SmallVector<Value *> WorkList; + WorkList.push_back(StartPtr); + + while (!WorkList.empty()) { + Value *Ptr = WorkList.pop_back_val(); + if (!Visited.insert(Ptr).second) + continue; + auto *PN = dyn_cast<PHINode>(Ptr); + // SCEV does not look through non-header PHIs inside the loop. Such phis + // can be analyzed by adding separate accesses for each incoming pointer + // value. + if (PN && InnermostLoop.contains(PN->getParent()) && + PN->getParent() != InnermostLoop.getHeader()) { + for (const Use &Inc : PN->incoming_values()) + WorkList.push_back(Inc); + } else + AddPointer(Ptr); + } +} + bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, MemAccessInfo Access, const ValueToValueMap &StridesMap, @@ -1032,13 +1055,11 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, bool ShouldCheckWrap) { Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); - unsigned AddrSpace = Ty->getPointerAddressSpace(); + assert(!AccessTy->isAggregateType() && "Bad stride - Not a pointer to a scalar type"); - // Make sure we're not accessing an aggregate type. - // TODO: Why? This doesn't make any sense. - if (AccessTy->isAggregateType()) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" - << *Ptr << "\n"); + if (isa<ScalableVectorType>(AccessTy)) { + LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy + << "\n"); return 0; } @@ -1068,6 +1089,7 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, // An getelementptr without an inbounds attribute and unit stride would have // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. 
+ unsigned AddrSpace = Ty->getPointerAddressSpace(); bool IsInBoundsGEP = isInBoundsGep(Ptr); bool IsNoWrapAddRec = !ShouldCheckWrap || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) || @@ -1101,7 +1123,8 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, } auto &DL = Lp->getHeader()->getModule()->getDataLayout(); - int64_t Size = DL.getTypeAllocSize(AccessTy); + TypeSize AllocSize = DL.getTypeAllocSize(AccessTy); + int64_t Size = AllocSize.getFixedSize(); const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. @@ -1263,29 +1286,6 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, return Diff && *Diff == 1; } -static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, - function_ref<void(Value *)> AddPointer) { - SmallPtrSet<Value *, 8> Visited; - SmallVector<Value *> WorkList; - WorkList.push_back(StartPtr); - - while (!WorkList.empty()) { - Value *Ptr = WorkList.pop_back_val(); - if (!Visited.insert(Ptr).second) - continue; - auto *PN = dyn_cast<PHINode>(Ptr); - // SCEV does not look through non-header PHIs inside the loop. Such phis - // can be analyzed by adding separate accesses for each incoming pointer - // value. - if (PN && InnermostLoop.contains(PN->getParent()) && - PN->getParent() != InnermostLoop.getHeader()) { - for (const Use &Inc : PN->incoming_values()) - WorkList.push_back(Inc); - } else - AddPointer(Ptr); - } -} - void MemoryDepChecker::addAccess(StoreInst *SI) { visitPointers(SI->getPointerOperand(), *InnermostLoop, [this, SI](Value *Ptr) { diff --git a/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index b44d15e71556..da6bb4c49cba 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -1481,11 +1481,11 @@ void MemoryDependenceResults::removeCachedNonLocalPointerDependencies( // instructions from the reverse map. NonLocalDepInfo &PInfo = It->second.NonLocalDeps; - for (unsigned i = 0, e = PInfo.size(); i != e; ++i) { - Instruction *Target = PInfo[i].getResult().getInst(); + for (const NonLocalDepEntry &DE : PInfo) { + Instruction *Target = DE.getResult().getInst(); if (!Target) continue; // Ignore non-local dep results. - assert(Target->getParent() == PInfo[i].getBB()); + assert(Target->getParent() == DE.getBB()); // Eliminating the dirty entry from 'Cache', so update the reverse info. RemoveFromReverseMap(ReverseNonLocalPtrDeps, Target, P); diff --git a/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp b/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp index 7f2d04c49565..854ba83bd34a 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/MemoryLocation.cpp @@ -213,6 +213,28 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, LibFunc F; if (TLI && TLI->getLibFunc(*Call, F) && TLI->has(F)) { switch (F) { + case LibFunc_memset_chk: { + assert(ArgIdx == 0 && "Invalid argument index for memset_chk"); + LocationSize Size = LocationSize::afterPointer(); + if (const auto *Len = dyn_cast<ConstantInt>(Call->getArgOperand(2))) { + // memset_chk writes at most Len bytes. It may write less, if Len + // exceeds the specified max size and aborts. 
+ Size = LocationSize::upperBound(Len->getZExtValue()); + } + return MemoryLocation(Arg, Size, AATags); + } + case LibFunc_strncpy: { + assert((ArgIdx == 0 || ArgIdx == 1) && + "Invalid argument index for strncpy"); + LocationSize Size = LocationSize::afterPointer(); + if (const auto *Len = dyn_cast<ConstantInt>(Call->getArgOperand(2))) { + // strncpy is guaranteed to write Len bytes, but only reads up to Len + // bytes. + Size = ArgIdx == 0 ? LocationSize::precise(Len->getZExtValue()) + : LocationSize::upperBound(Len->getZExtValue()); + } + return MemoryLocation(Arg, Size, AATags); + } case LibFunc_memset_pattern16: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memset_pattern16"); diff --git a/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp index c73e1fd82915..4c80f6743411 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/PHITransAddr.cpp @@ -69,7 +69,7 @@ static bool VerifySubExpr(Value *Expr, } // If it isn't in the InstInputs list it is a subexpr incorporated into the - // address. Sanity check that it is phi translatable. + // address. Validate that it is phi translatable. if (!CanPHITrans(I)) { errs() << "Instruction in PHITransAddr is not phi-translatable:\n"; errs() << *I << '\n'; diff --git a/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp b/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp index a73607dbef61..c20ecff5f912 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/RegionPass.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" +#include "llvm/IR/PrintPasses.h" #include "llvm/IR/StructuralHash.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" @@ -187,6 +188,8 @@ public: } bool runOnRegion(Region *R, RGPassManager &RGM) override { + if (!isFunctionInPrintList(R->getEntry()->getParent()->getName())) + return false; Out << Banner; for (const auto *BB : R->blocks()) { if (BB) diff --git a/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp index f7c22cfb0310..7dc7f9904c70 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2915,8 +2915,8 @@ ScalarEvolution::getOrCreateAddRecExpr(ArrayRef<const SCEV *> Ops, const Loop *L, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scAddRecExpr); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); ID.AddPointer(L); void *IP = nullptr; SCEVAddRecExpr *S = @@ -2939,8 +2939,8 @@ ScalarEvolution::getOrCreateMulExpr(ArrayRef<const SCEV *> Ops, SCEV::NoWrapFlags Flags) { FoldingSetNodeID ID; ID.AddInteger(scMulExpr); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); void *IP = nullptr; SCEVMulExpr *S = static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); @@ -3708,8 +3708,8 @@ SCEV *ScalarEvolution::findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops) { FoldingSetNodeID ID; ID.AddInteger(SCEVType); - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - ID.AddPointer(Ops[i]); + for (const SCEV *Op : Ops) + ID.AddPointer(Op); void *IP = nullptr; return UniqueSCEVs.FindNodeOrInsertPos(ID, IP); 
} @@ -4094,6 +4094,17 @@ void ScalarEvolution::eraseValueFromMap(Value *V) { } } +void ScalarEvolution::insertValueToMap(Value *V, const SCEV *S) { + // A recursive query may have already computed the SCEV. It should be + // equivalent, but may not necessarily be exactly the same, e.g. due to lazily + // inferred nowrap flags. + auto It = ValueExprMap.find_as(V); + if (It == ValueExprMap.end()) { + ValueExprMap.insert({SCEVCallbackVH(V, this), S}); + ExprValueMap[S].insert({V, nullptr}); + } +} + /// Return an existing SCEV if it exists, otherwise analyze the expression and /// create a new one. const SCEV *ScalarEvolution::getSCEV(Value *V) { @@ -4134,10 +4145,9 @@ const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; - if (checkValidity(S)) - return S; - eraseValueFromMap(V); - forgetMemoizedResults(S); + assert(checkValidity(S) && + "existing SCEV has not been properly invalidated"); + return S; } return nullptr; } @@ -4430,44 +4440,6 @@ static void PushDefUseChildren(Instruction *I, } } -void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) { - SmallVector<Instruction *, 16> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<const SCEV *, 8> ToForget; - Visited.insert(PN); - Worklist.push_back(PN); - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - auto It = ValueExprMap.find_as(static_cast<Value *>(I)); - if (It != ValueExprMap.end()) { - const SCEV *Old = It->second; - - // Short-circuit the def-use traversal if the symbolic name - // ceases to appear in expressions. - if (Old != SymName && !hasOperand(Old, SymName)) - continue; - - // SCEVUnknown for a PHI either means that it has an unrecognized - // structure, it's a PHI that's in the progress of being computed - // by createNodeForPHI, or it's a single-value PHI. In the first case, - // additional loop trip count information isn't going to change anything. - // In the second case, createNodeForPHI will perform the necessary - // updates on its own when it gets to that point. In the third, we do - // want to forget the SCEVUnknown. - if (!isa<PHINode>(I) || - !isa<SCEVUnknown>(Old) || - (I != PN && Old == SymName)) { - eraseValueFromMap(It->first); - ToForget.push_back(Old); - } - } - - PushDefUseChildren(I, Worklist, Visited); - } - forgetMemoizedResults(ToForget); -} - namespace { /// Takes SCEV S and Loop L. For each AddRec sub-expression, use its start @@ -5335,15 +5307,17 @@ const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN, const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); - - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + insertValueToMap(PN, PHISCEV); // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to // overflow. - if (auto *BEInst = dyn_cast<Instruction>(BEValueV)) - if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) + if (auto *BEInst = dyn_cast<Instruction>(BEValueV)) { + assert(isLoopInvariant(Accum, L) && + "Accum is defined outside L, but is not invariant?"); + if (isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); + } return PHISCEV; } @@ -5386,7 +5360,7 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Handle PHI node value symbolically. 
const SCEV *SymbolicName = getUnknown(PN); - ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName}); + insertValueToMap(PN, SymbolicName); // Using this symbolic name for the PHI, analyze the value coming around // the back-edge. @@ -5457,8 +5431,8 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. - forgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + forgetMemoizedResults(SymbolicName); + insertValueToMap(PN, PHISCEV); // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to @@ -5489,8 +5463,8 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. - forgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted; + forgetMemoizedResults(SymbolicName); + insertValueToMap(PN, Shifted); return Shifted; } } @@ -7598,62 +7572,19 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { // Now that we know more about the trip count for this loop, forget any // existing SCEV values for PHI nodes in this loop since they are only // conservative estimates made without the benefit of trip count - // information. This is similar to the code in forgetLoop, except that - // it handles SCEVUnknown PHI nodes specially. + // information. This invalidation is not necessary for correctness, and is + // only done to produce more precise results. if (Result.hasAnyInfo()) { - SmallVector<Instruction *, 16> Worklist; - SmallPtrSet<Instruction *, 8> Discovered; + // Invalidate any expression using an addrec in this loop. SmallVector<const SCEV *, 8> ToForget; - PushLoopPHIs(L, Worklist, Discovered); - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - ValueExprMapType::iterator It = - ValueExprMap.find_as(static_cast<Value *>(I)); - if (It != ValueExprMap.end()) { - const SCEV *Old = It->second; - - // SCEVUnknown for a PHI either means that it has an unrecognized - // structure, or it's a PHI that's in the progress of being computed - // by createNodeForPHI. In the former case, additional loop trip - // count information isn't going to change anything. In the later - // case, createNodeForPHI will perform the necessary updates on its - // own when it gets to that point. - if (!isa<PHINode>(I) || !isa<SCEVUnknown>(Old)) { - eraseValueFromMap(It->first); - ToForget.push_back(Old); - } - if (PHINode *PN = dyn_cast<PHINode>(I)) - ConstantEvolutionLoopExitValue.erase(PN); - } - - // Since we don't need to invalidate anything for correctness and we're - // only invalidating to make SCEV's results more precise, we get to stop - // early to avoid invalidating too much. This is especially important in - // cases like: - // - // %v = f(pn0, pn1) // pn0 and pn1 used through some other phi node - // loop0: - // %pn0 = phi - // ... - // loop1: - // %pn1 = phi - // ... - // - // where both loop0 and loop1's backedge taken count uses the SCEV - // expression for %v. 
If we don't have the early stop below then in cases - // like the above, getBackedgeTakenInfo(loop1) will clear out the trip - // count for loop0 and getBackedgeTakenInfo(loop0) will clear out the trip - // count for loop1, effectively nullifying SCEV's trip count cache. - for (auto *U : I->users()) - if (auto *I = dyn_cast<Instruction>(U)) { - auto *LoopForUser = LI.getLoopFor(I->getParent()); - if (LoopForUser && L->contains(LoopForUser) && - Discovered.insert(I).second) - Worklist.push_back(I); - } - } + auto LoopUsersIt = LoopUsers.find(L); + if (LoopUsersIt != LoopUsers.end()) + append_range(ToForget, LoopUsersIt->second); forgetMemoizedResults(ToForget); + + // Invalidate constant-evolved loop header phis. + for (PHINode &PN : L->getHeader()->phis()) + ConstantEvolutionLoopExitValue.erase(&PN); } // Re-lookup the insert position, since the call to @@ -7672,10 +7603,12 @@ void ScalarEvolution::forgetAllLoops() { // result. BackedgeTakenCounts.clear(); PredicatedBackedgeTakenCounts.clear(); + BECountUsers.clear(); LoopPropertiesCache.clear(); ConstantEvolutionLoopExitValue.clear(); ValueExprMap.clear(); ValuesAtScopes.clear(); + ValuesAtScopesUsers.clear(); LoopDispositions.clear(); BlockDispositions.clear(); UnsignedRanges.clear(); @@ -7697,8 +7630,8 @@ void ScalarEvolution::forgetLoop(const Loop *L) { auto *CurrL = LoopWorklist.pop_back_val(); // Drop any stored trip count value. - BackedgeTakenCounts.erase(CurrL); - PredicatedBackedgeTakenCounts.erase(CurrL); + forgetBackedgeTakenCounts(CurrL, /* Predicated */ false); + forgetBackedgeTakenCounts(CurrL, /* Predicated */ true); // Drop information about predicated SCEV rewrites for this loop. for (auto I = PredicatedSCEVRewrites.begin(); @@ -7872,10 +7805,6 @@ bool ScalarEvolution::BackedgeTakenInfo::isConstantMaxOrZero( return MaxOrZero && !any_of(ExitNotTaken, PredicateNotAlwaysTrue); } -bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S) const { - return Operands.contains(S); -} - ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) : ExitLimit(E, E, false, None) { } @@ -7916,19 +7845,6 @@ ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M, : ExitLimit(E, M, MaxOrZero, None) { } -class SCEVRecordOperands { - SmallPtrSetImpl<const SCEV *> &Operands; - -public: - SCEVRecordOperands(SmallPtrSetImpl<const SCEV *> &Operands) - : Operands(Operands) {} - bool follow(const SCEV *S) { - Operands.insert(S); - return true; - } - bool isDone() { return false; } -}; - /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( @@ -7957,14 +7873,6 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( assert((isa<SCEVCouldNotCompute>(ConstantMax) || isa<SCEVConstant>(ConstantMax)) && "No point in having a non-constant max backedge taken count!"); - - SCEVRecordOperands RecordOperands(Operands); - SCEVTraversal<SCEVRecordOperands> ST(RecordOperands); - if (!isa<SCEVCouldNotCompute>(ConstantMax)) - ST.visitAll(ConstantMax); - for (auto &ENT : ExitNotTaken) - if (!isa<SCEVCouldNotCompute>(ENT.ExactNotTaken)) - ST.visitAll(ENT.ExactNotTaken); } /// Compute the number of times the backedge of the specified loop will execute. @@ -8046,6 +7954,13 @@ ScalarEvolution::computeBackedgeTakenCount(const Loop *L, // The loop backedge will be taken the maximum or zero times if there's // a single exit that must be taken the maximum or zero times. 
bool MaxOrZero = (MustExitMaxOrZero && ExitingBlocks.size() == 1); + + // Remember which SCEVs are used in exit limits for invalidation purposes. + // We only care about non-constant SCEVs here, so we can ignore EL.MaxNotTaken + // and MaxBECount, which must be SCEVConstant. + for (const auto &Pair : ExitCounts) + if (!isa<SCEVConstant>(Pair.second.ExactNotTaken)) + BECountUsers[Pair.second.ExactNotTaken].insert({L, AllowPredicates}); return BackedgeTakenInfo(std::move(ExitCounts), CouldComputeBECount, MaxBECount, MaxOrZero); } @@ -8916,6 +8831,9 @@ const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { LS.second = C; break; } + + if (!isa<SCEVConstant>(C)) + ValuesAtScopesUsers[C].push_back({L, V}); return C; } @@ -12387,7 +12305,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range, if (Range.contains(Val->getValue())) return SE.getCouldNotCompute(); // Something strange happened - // Ensure that the previous value is in the range. This is a sanity check. + // Ensure that the previous value is in the range. assert(Range.contains( EvaluateConstantChrecAtConstant(this, ConstantInt::get(SE.getContext(), ExitVal - 1), SE)->getValue()) && @@ -12531,9 +12449,11 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), PredicatedBackedgeTakenCounts( std::move(Arg.PredicatedBackedgeTakenCounts)), + BECountUsers(std::move(Arg.BECountUsers)), ConstantEvolutionLoopExitValue( std::move(Arg.ConstantEvolutionLoopExitValue)), ValuesAtScopes(std::move(Arg.ValuesAtScopes)), + ValuesAtScopesUsers(std::move(Arg.ValuesAtScopesUsers)), LoopDispositions(std::move(Arg.LoopDispositions)), LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)), BlockDispositions(std::move(Arg.BlockDispositions)), @@ -12946,6 +12866,23 @@ bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { return SCEVExprContains(S, [&](const SCEV *Expr) { return Expr == Op; }); } +void ScalarEvolution::forgetBackedgeTakenCounts(const Loop *L, + bool Predicated) { + auto &BECounts = + Predicated ? 
PredicatedBackedgeTakenCounts : BackedgeTakenCounts; + auto It = BECounts.find(L); + if (It != BECounts.end()) { + for (const ExitNotTakenInfo &ENT : It->second.ExitNotTaken) { + if (!isa<SCEVConstant>(ENT.ExactNotTaken)) { + auto UserIt = BECountUsers.find(ENT.ExactNotTaken); + assert(UserIt != BECountUsers.end()); + UserIt->second.erase({L, Predicated}); + } + } + BECounts.erase(It); + } +} + void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) { SmallPtrSet<const SCEV *, 8> ToForget(SCEVs.begin(), SCEVs.end()); SmallVector<const SCEV *, 8> Worklist(ToForget.begin(), ToForget.end()); @@ -12970,32 +12907,52 @@ void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) { else ++I; } - - auto RemoveSCEVFromBackedgeMap = [&ToForget]( - DenseMap<const Loop *, BackedgeTakenInfo> &Map) { - for (auto I = Map.begin(), E = Map.end(); I != E;) { - BackedgeTakenInfo &BEInfo = I->second; - if (any_of(ToForget, - [&BEInfo](const SCEV *S) { return BEInfo.hasOperand(S); })) - Map.erase(I++); - else - ++I; - } - }; - - RemoveSCEVFromBackedgeMap(BackedgeTakenCounts); - RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts); } void ScalarEvolution::forgetMemoizedResultsImpl(const SCEV *S) { - ValuesAtScopes.erase(S); LoopDispositions.erase(S); BlockDispositions.erase(S); UnsignedRanges.erase(S); SignedRanges.erase(S); - ExprValueMap.erase(S); HasRecMap.erase(S); MinTrailingZerosCache.erase(S); + + auto ExprIt = ExprValueMap.find(S); + if (ExprIt != ExprValueMap.end()) { + for (auto &ValueAndOffset : ExprIt->second) { + if (ValueAndOffset.second == nullptr) { + auto ValueIt = ValueExprMap.find_as(ValueAndOffset.first); + if (ValueIt != ValueExprMap.end()) + ValueExprMap.erase(ValueIt); + } + } + ExprValueMap.erase(ExprIt); + } + + auto ScopeIt = ValuesAtScopes.find(S); + if (ScopeIt != ValuesAtScopes.end()) { + for (const auto &Pair : ScopeIt->second) + if (!isa_and_nonnull<SCEVConstant>(Pair.second)) + erase_value(ValuesAtScopesUsers[Pair.second], + std::make_pair(Pair.first, S)); + ValuesAtScopes.erase(ScopeIt); + } + + auto ScopeUserIt = ValuesAtScopesUsers.find(S); + if (ScopeUserIt != ValuesAtScopesUsers.end()) { + for (const auto &Pair : ScopeUserIt->second) + erase_value(ValuesAtScopes[Pair.second], std::make_pair(Pair.first, S)); + ValuesAtScopesUsers.erase(ScopeUserIt); + } + + auto BEUsersIt = BECountUsers.find(S); + if (BEUsersIt != BECountUsers.end()) { + // Work on a copy, as forgetBackedgeTakenCounts() will modify the original. + auto Copy = BEUsersIt->second; + for (const auto &Pair : Copy) + forgetBackedgeTakenCounts(Pair.getPointer(), Pair.getInt()); + BECountUsers.erase(BEUsersIt); + } } void @@ -13100,16 +13057,43 @@ void ScalarEvolution::verify() const { ValidLoops.insert(L); Worklist.append(L->begin(), L->end()); } - // Check for SCEV expressions referencing invalid/deleted loops. for (auto &KV : ValueExprMap) { - auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second); - if (!AR) - continue; - assert(ValidLoops.contains(AR->getLoop()) && - "AddRec references invalid loop"); + // Check for SCEV expressions referencing invalid/deleted loops. + if (auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second)) { + assert(ValidLoops.contains(AR->getLoop()) && + "AddRec references invalid loop"); + } + + // Check that the value is also part of the reverse map. 
+ auto It = ExprValueMap.find(KV.second); + if (It == ExprValueMap.end() || !It->second.contains({KV.first, nullptr})) { + dbgs() << "Value " << *KV.first + << " is in ValueExprMap but not in ExprValueMap\n"; + std::abort(); + } + } + + for (const auto &KV : ExprValueMap) { + for (const auto &ValueAndOffset : KV.second) { + if (ValueAndOffset.second != nullptr) + continue; + + auto It = ValueExprMap.find_as(ValueAndOffset.first); + if (It == ValueExprMap.end()) { + dbgs() << "Value " << *ValueAndOffset.first + << " is in ExprValueMap but not in ValueExprMap\n"; + std::abort(); + } + if (It->second != KV.first) { + dbgs() << "Value " << *ValueAndOffset.first + << " mapped to " << *It->second + << " rather than " << *KV.first << "\n"; + std::abort(); + } + } } - // Verify intergity of SCEV users. + // Verify integrity of SCEV users. for (const auto &S : UniqueSCEVs) { SmallVector<const SCEV *, 4> Ops; collectUniqueOps(&S, Ops); @@ -13125,6 +13109,61 @@ void ScalarEvolution::verify() const { std::abort(); } } + + // Verify integrity of ValuesAtScopes users. + for (const auto &ValueAndVec : ValuesAtScopes) { + const SCEV *Value = ValueAndVec.first; + for (const auto &LoopAndValueAtScope : ValueAndVec.second) { + const Loop *L = LoopAndValueAtScope.first; + const SCEV *ValueAtScope = LoopAndValueAtScope.second; + if (!isa<SCEVConstant>(ValueAtScope)) { + auto It = ValuesAtScopesUsers.find(ValueAtScope); + if (It != ValuesAtScopesUsers.end() && + is_contained(It->second, std::make_pair(L, Value))) + continue; + dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " + << ValueAtScope << " missing in ValuesAtScopesUsers\n"; + std::abort(); + } + } + } + + for (const auto &ValueAtScopeAndVec : ValuesAtScopesUsers) { + const SCEV *ValueAtScope = ValueAtScopeAndVec.first; + for (const auto &LoopAndValue : ValueAtScopeAndVec.second) { + const Loop *L = LoopAndValue.first; + const SCEV *Value = LoopAndValue.second; + assert(!isa<SCEVConstant>(Value)); + auto It = ValuesAtScopes.find(Value); + if (It != ValuesAtScopes.end() && + is_contained(It->second, std::make_pair(L, ValueAtScope))) + continue; + dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " + << ValueAtScope << " missing in ValuesAtScopes\n"; + std::abort(); + } + } + + // Verify integrity of BECountUsers. + auto VerifyBECountUsers = [&](bool Predicated) { + auto &BECounts = + Predicated ? 
PredicatedBackedgeTakenCounts : BackedgeTakenCounts; + for (const auto &LoopAndBEInfo : BECounts) { + for (const ExitNotTakenInfo &ENT : LoopAndBEInfo.second.ExitNotTaken) { + if (!isa<SCEVConstant>(ENT.ExactNotTaken)) { + auto UserIt = BECountUsers.find(ENT.ExactNotTaken); + if (UserIt != BECountUsers.end() && + UserIt->second.contains({ LoopAndBEInfo.first, Predicated })) + continue; + dbgs() << "Value " << *ENT.ExactNotTaken << " for loop " + << *LoopAndBEInfo.first << " missing from BECountUsers\n"; + std::abort(); + } + } + } + }; + VerifyBECountUsers(/* Predicated */ false); + VerifyBECountUsers(/* Predicated */ true); } bool ScalarEvolution::invalidate( diff --git a/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp index 74cc39b7f2c0..54f3605ee033 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -14,12 +14,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -117,7 +119,7 @@ template <typename CalleeTy> struct UseInfo { // Access range if the address (alloca or parameters). // It is allowed to be empty-set when there are no known accesses. ConstantRange Range; - std::map<const Instruction *, ConstantRange> Accesses; + std::set<const Instruction *> UnsafeAccesses; // List of calls which pass address as an argument. 
// Value is offset range of address from base address (alloca or calling @@ -131,10 +133,9 @@ template <typename CalleeTy> struct UseInfo { UseInfo(unsigned PointerSize) : Range{PointerSize, false} {} void updateRange(const ConstantRange &R) { Range = unionNoWrap(Range, R); } - void addRange(const Instruction *I, const ConstantRange &R) { - auto Ins = Accesses.emplace(I, R); - if (!Ins.second) - Ins.first->second = unionNoWrap(Ins.first->second, R); + void addRange(const Instruction *I, const ConstantRange &R, bool IsSafe) { + if (!IsSafe) + UnsafeAccesses.insert(I); updateRange(R); } }; @@ -230,7 +231,7 @@ struct StackSafetyInfo::InfoTy { struct StackSafetyGlobalInfo::InfoTy { GVToSSI Info; SmallPtrSet<const AllocaInst *, 8> SafeAllocas; - std::map<const Instruction *, bool> AccessIsUnsafe; + std::set<const Instruction *> UnsafeAccesses; }; namespace { @@ -253,6 +254,11 @@ class StackSafetyLocalAnalysis { void analyzeAllUses(Value *Ptr, UseInfo<GlobalValue> &AS, const StackLifetime &SL); + + bool isSafeAccess(const Use &U, AllocaInst *AI, const SCEV *AccessSize); + bool isSafeAccess(const Use &U, AllocaInst *AI, Value *V); + bool isSafeAccess(const Use &U, AllocaInst *AI, TypeSize AccessSize); + public: StackSafetyLocalAnalysis(Function &F, ScalarEvolution &SE) : F(F), DL(F.getParent()->getDataLayout()), SE(SE), @@ -333,6 +339,56 @@ ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange( return getAccessRange(U, Base, SizeRange); } +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + Value *V) { + return isSafeAccess(U, AI, SE.getSCEV(V)); +} + +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + TypeSize TS) { + if (TS.isScalable()) + return false; + auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize); + const SCEV *SV = SE.getConstant(CalculationTy, TS.getFixedSize()); + return isSafeAccess(U, AI, SV); +} + +bool StackSafetyLocalAnalysis::isSafeAccess(const Use &U, AllocaInst *AI, + const SCEV *AccessSize) { + + if (!AI) + return true; + if (isa<SCEVCouldNotCompute>(AccessSize)) + return false; + + const auto *I = cast<Instruction>(U.getUser()); + + auto ToCharPtr = [&](const SCEV *V) { + auto *PtrTy = IntegerType::getInt8PtrTy(SE.getContext()); + return SE.getTruncateOrZeroExtend(V, PtrTy); + }; + + const SCEV *AddrExp = ToCharPtr(SE.getSCEV(U.get())); + const SCEV *BaseExp = ToCharPtr(SE.getSCEV(AI)); + const SCEV *Diff = SE.getMinusSCEV(AddrExp, BaseExp); + if (isa<SCEVCouldNotCompute>(Diff)) + return false; + + auto Size = getStaticAllocaSizeRange(*AI); + + auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize); + auto ToDiffTy = [&](const SCEV *V) { + return SE.getTruncateOrZeroExtend(V, CalculationTy); + }; + const SCEV *Min = ToDiffTy(SE.getConstant(Size.getLower())); + const SCEV *Max = SE.getMinusSCEV(ToDiffTy(SE.getConstant(Size.getUpper())), + ToDiffTy(AccessSize)); + return SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SGE, Diff, Min, I) + .getValueOr(false) && + SE.evaluatePredicateAt(ICmpInst::Predicate::ICMP_SLE, Diff, Max, I) + .getValueOr(false); +} + /// The function analyzes all local uses of Ptr (alloca or argument) and /// calculates local access range and all function calls where it was used. 
void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, @@ -341,7 +397,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, SmallPtrSet<const Value *, 16> Visited; SmallVector<const Value *, 8> WorkList; WorkList.push_back(Ptr); - const AllocaInst *AI = dyn_cast<AllocaInst>(Ptr); + AllocaInst *AI = dyn_cast<AllocaInst>(Ptr); // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. while (!WorkList.empty()) { @@ -356,11 +412,13 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, switch (I->getOpcode()) { case Instruction::Load: { if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - US.addRange(I, - getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType()))); + auto TypeSize = DL.getTypeStoreSize(I->getType()); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -370,16 +428,17 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, case Instruction::Store: { if (V == I->getOperand(0)) { // Stored the pointer - conservatively assume it may be unsafe. - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - US.addRange( - I, getAccessRange( - UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType()))); + auto TypeSize = DL.getTypeStoreSize(I->getOperand(0)->getType()); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -387,7 +446,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, // Information leak. // FIXME: Process parameters correctly. This is a leak only if we return // alloca. 
- US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; case Instruction::Call: @@ -396,12 +455,20 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, break; if (AI && !SL.isAliveAfter(AI, I)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } - if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { - US.addRange(I, getMemIntrinsicAccessRange(MI, UI, Ptr)); + auto AccessRange = getMemIntrinsicAccessRange(MI, UI, Ptr); + bool Safe = false; + if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) { + if (MTI->getRawSource() != UI && MTI->getRawDest() != UI) + Safe = true; + } else if (MI->getRawDest() != UI) { + Safe = true; + } + Safe = Safe || isSafeAccess(UI, AI, MI->getLength()); + US.addRange(I, AccessRange, Safe); break; } @@ -412,15 +479,16 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, } if (!CB.isArgOperand(&UI)) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } unsigned ArgNo = CB.getArgOperandNo(&UI); if (CB.isByValArgument(ArgNo)) { - US.addRange(I, getAccessRange( - UI, Ptr, - DL.getTypeStoreSize(CB.getParamByValType(ArgNo)))); + auto TypeSize = DL.getTypeStoreSize(CB.getParamByValType(ArgNo)); + auto AccessRange = getAccessRange(UI, Ptr, TypeSize); + bool Safe = isSafeAccess(UI, AI, TypeSize); + US.addRange(I, AccessRange, Safe); break; } @@ -430,7 +498,7 @@ void StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr, const GlobalValue *Callee = dyn_cast<GlobalValue>(CB.getCalledOperand()->stripPointerCasts()); if (!Callee) { - US.addRange(I, UnknownRange); + US.addRange(I, UnknownRange, /*IsSafe=*/false); break; } @@ -827,8 +895,8 @@ const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const { Info->SafeAllocas.insert(AI); ++NumAllocaStackSafe; } - for (const auto &A : KV.second.Accesses) - Info->AccessIsUnsafe[A.first] |= !AIRange.contains(A.second); + Info->UnsafeAccesses.insert(KV.second.UnsafeAccesses.begin(), + KV.second.UnsafeAccesses.end()); } } @@ -903,11 +971,7 @@ bool StackSafetyGlobalInfo::isSafe(const AllocaInst &AI) const { bool StackSafetyGlobalInfo::stackAccessIsSafe(const Instruction &I) const { const auto &Info = getInfo(); - auto It = Info.AccessIsUnsafe.find(&I); - if (It == Info.AccessIsUnsafe.end()) { - return true; - } - return !It->second; + return Info.UnsafeAccesses.find(&I) == Info.UnsafeAccesses.end(); } void StackSafetyGlobalInfo::print(raw_ostream &O) const { diff --git a/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp index 59582cd3a198..ff833b55bbce 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/SyncDependenceAnalysis.cpp @@ -15,21 +15,18 @@ // The SyncDependenceAnalysis is used in the DivergenceAnalysis to model // control-induced divergence in phi nodes. // -// -- Summary -- -// The SyncDependenceAnalysis lazily computes sync dependences [3]. -// The analysis evaluates the disjoint path criterion [2] by a reduction -// to SSA construction. The SSA construction algorithm is implemented as -// a simple data-flow analysis [1]. 
// -// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy -// [2] "Efficiently Computing Static Single Assignment Form -// and the Control Dependence Graph", TOPLAS '91, -// Cytron, Ferrante, Rosen, Wegman and Zadeck -// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack -// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// -- Reference -- +// The algorithm is presented in Section 5 of +// +// An abstract interpretation for SPMD divergence +// on reducible control flow graphs. +// Julian Rosemann, Simon Moll and Sebastian Hack +// POPL '21 +// // // -- Sync dependence -- -// Sync dependence [4] characterizes the control flow aspect of the +// Sync dependence characterizes the control flow aspect of the // propagation of branch divergence. For example, // // %cond = icmp slt i32 %tid, 10 @@ -46,9 +43,10 @@ // because the branch "br i1 %cond" depends on %tid and affects which value %a // is assigned to. // +// // -- Reduction to SSA construction -- // There are two disjoint paths from A to X, if a certain variant of SSA -// construction places a phi node in X under the following set-up scheme [2]. +// construction places a phi node in X under the following set-up scheme. // // This variant of SSA construction ignores incoming undef values. // That is paths from the entry without a definition do not result in @@ -63,6 +61,7 @@ // D E // \ / // F +// // Assume that A contains a divergent branch. We are interested // in the set of all blocks where each block is reachable from A // via two disjoint paths. This would be the set {D, F} in this @@ -70,6 +69,7 @@ // To generally reduce this query to SSA construction we introduce // a virtual variable x and assign to x different values in each // successor block of A. +// // entry // / \ // A \ @@ -79,23 +79,41 @@ // D E // \ / // F +// // Our flavor of SSA construction for x will construct the following +// // entry // / \ // A \ // / \ Y // x0 = 0 x1 = 1 / // \ / \ / -// x2=phi E +// x2 = phi E // \ / -// x3=phi +// x3 = phi +// // The blocks D and F contain phi nodes and are thus each reachable // by two disjoins paths from A. // // -- Remarks -- -// In case of loop exits we need to check the disjoint path criterion for loops -// [2]. To this end, we check whether the definition of x differs between the -// loop exit and the loop header (_after_ SSA construction). +// * In case of loop exits we need to check the disjoint path criterion for loops. +// To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +// -- Known Limitations & Future Work -- +// * The algorithm requires reducible loops because the implementation +// implicitly performs a single iteration of the underlying data flow analysis. +// This was done for pragmatism, simplicity and speed. +// +// Relevant related work for extending the algorithm to irreducible control: +// A simple algorithm for global data flow analysis problems. +// Matthew S. Hecht and Jeffrey D. Ullman. +// SIAM Journal on Computing, 4(4):519–532, December 1975. +// +// * Another reason for requiring reducible loops is that points of +// synchronization in irreducible loops aren't 'obvious' - there is no unique +// header where threads 'should' synchronize when entering or coming back +// around from the latch. 
// //===----------------------------------------------------------------------===// #include "llvm/Analysis/SyncDependenceAnalysis.h" @@ -128,8 +146,9 @@ using namespace llvm; // // We cannot use the vanilla (R)PO computation of LLVM because: // * We (virtually) modify the CFG. -// * We want a loop-compact block enumeration, that is the numbers assigned by -// the traveral to the blocks of a loop are an interval. +// * We want a loop-compact block enumeration, that is the numbers assigned to +// blocks of a loop form an interval +// using POCB = std::function<void(const BasicBlock &)>; using VisitedSet = std::set<const BasicBlock *>; using BlockStack = std::vector<const BasicBlock *>; diff --git a/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp index 7326ba74c071..72fbd5ad3f68 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -166,8 +166,8 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, return; } - // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later. - // All versions of watchOS support it. + // memset_pattern{4,8,16} is only available on iOS 3.0 and Mac OS X 10.5 and + // later. All versions of watchOS support it. if (T.isMacOSX()) { // available IO unlocked variants on Mac OS X TLI.setAvailable(LibFunc_getc_unlocked); @@ -175,12 +175,20 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setAvailable(LibFunc_putc_unlocked); TLI.setAvailable(LibFunc_putchar_unlocked); - if (T.isMacOSXVersionLT(10, 5)) + if (T.isMacOSXVersionLT(10, 5)) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); + } } else if (T.isiOS()) { - if (T.isOSVersionLT(3, 0)) + if (T.isOSVersionLT(3, 0)) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); + } } else if (!T.isWatchOS()) { + TLI.setUnavailable(LibFunc_memset_pattern4); + TLI.setUnavailable(LibFunc_memset_pattern8); TLI.setUnavailable(LibFunc_memset_pattern16); } @@ -684,7 +692,6 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_strcat_chk); TLI.setUnavailable(LibFunc_strcpy_chk); TLI.setUnavailable(LibFunc_strlcat_chk); - TLI.setUnavailable(LibFunc_strlcat_chk); TLI.setUnavailable(LibFunc_strlcpy_chk); TLI.setUnavailable(LibFunc_strlen_chk); TLI.setUnavailable(LibFunc_strncat_chk); @@ -1523,6 +1530,8 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, FTy.getParamType(2)->isPointerTy() && FTy.getParamType(3)->isIntegerTy()); + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: return (!FTy.isVarArg() && NumParams == 3 && FTy.getParamType(0)->isPointerTy() && diff --git a/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp b/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp index 8a34a34eb307..7573975a3dd3 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/VFABIDemangling.cpp @@ -445,7 +445,6 @@ Optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName, VF = EC.getKnownMinValue(); } - // Sanity checks. // 1. We don't accept a zero lanes vectorization factor. // 2. 
We don't accept the demangling if the vector function is not // present in the module. diff --git a/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp index 5bce1eaa59a0..5feabd876e3a 100644 --- a/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp +++ b/contrib/llvm-project/llvm/lib/AsmParser/LLParser.cpp @@ -124,8 +124,8 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) { std::make_pair(I.first, std::make_pair(I.second, LocTy()))); } -/// validateEndOfModule - Do final validity and sanity checks at the end of the -/// module. +/// validateEndOfModule - Do final validity and basic correctness checks at the +/// end of the module. bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { if (!M) return false; @@ -271,7 +271,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { return false; } -/// Do final validity and sanity checks at the end of the index. +/// Do final validity and basic correctness checks at the end of the index. bool LLParser::validateEndOfIndex() { if (!Index) return false; @@ -2989,9 +2989,10 @@ BasicBlock *LLParser::PerFunctionState::defineBB(const std::string &Name, /// parseValID - parse an abstract value that doesn't necessarily have a /// type implied. For example, if we parse "4" we don't know what integer type /// it has. The value will later be combined with its type and checked for -/// sanity. PFS is used to convert function-local operands of metadata (since -/// metadata operands are not just parsed here but also converted to values). -/// PFS can be null when we are not parsing metadata values inside a function. +/// basic correctness. PFS is used to convert function-local operands of +/// metadata (since metadata operands are not just parsed here but also +/// converted to values). PFS can be null when we are not parsing metadata +/// values inside a function. bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { ID.Loc = Lex.getLoc(); switch (Lex.getKind()) { diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 2723105b092f..d7bcb0d7f575 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -957,8 +957,8 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel, O->OS.write_escaped(Blob, /*hex=*/true) << "'"; } else { bool BlobIsPrintable = true; - for (unsigned i = 0, e = Blob.size(); i != e; ++i) - if (!isPrint(static_cast<unsigned char>(Blob[i]))) { + for (char C : Blob) + if (!isPrint(static_cast<unsigned char>(C))) { BlobIsPrintable = false; break; } diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index c568461e62b0..993cb1de8c02 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3996,8 +3996,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // See if anything took the address of blocks in this function. 
auto BBFRI = BasicBlockFwdRefs.find(F); if (BBFRI == BasicBlockFwdRefs.end()) { - for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i) - FunctionBBs[i] = BasicBlock::Create(Context, "", F); + for (BasicBlock *&BB : FunctionBBs) + BB = BasicBlock::Create(Context, "", F); } else { auto &BBRefs = BBFRI->second; // Check for invalid basic block references. @@ -4605,9 +4605,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { CaseVals.push_back(ConstantInt::get(Context, Low)); } BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]); - for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(), - cve = CaseVals.end(); cvi != cve; ++cvi) - SI->addCase(*cvi, DestBB); + for (ConstantInt *Cst : CaseVals) + SI->addCase(Cst, DestBB); } I = SI; break; diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 6df5a4a64d51..60530d7f7a00 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -444,7 +444,8 @@ class MetadataLoader::MetadataLoaderImpl { uint64_t GlobalDeclAttachmentPos = 0; #ifndef NDEBUG - /// Sanity check that we end up parsing all of the global decl attachments. + /// Basic correctness check that we end up parsing all of the global decl + /// attachments. unsigned NumGlobalDeclAttachSkipped = 0; unsigned NumGlobalDeclAttachParsed = 0; #endif @@ -917,7 +918,7 @@ Expected<bool> MetadataLoader::MetadataLoaderImpl::loadGlobalDeclAttachments() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Sanity check that we parsed them all. + // Check that we parsed them all. assert(NumGlobalDeclAttachSkipped == NumGlobalDeclAttachParsed); return true; case BitstreamEntry::Record: @@ -929,7 +930,7 @@ Expected<bool> MetadataLoader::MetadataLoaderImpl::loadGlobalDeclAttachments() { return MaybeCode.takeError(); if (MaybeCode.get() != bitc::METADATA_GLOBAL_DECL_ATTACHMENT) { // Anything other than a global decl attachment signals the end of - // these records. Check that we parsed them all. + // these records. Check that we parsed them all. assert(NumGlobalDeclAttachSkipped == NumGlobalDeclAttachParsed); return true; } diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1e9a9197aed7..e2354c40844a 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -596,10 +596,10 @@ static void writeStringRecord(BitstreamWriter &Stream, unsigned Code, SmallVector<unsigned, 64> Vals; // Code: [strchar x N] - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(Str[i])) + for (char C : Str) { + if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(C)) AbbrevToUse = 0; - Vals.push_back(Str[i]); + Vals.push_back(C); } // Emit the finished record. @@ -914,8 +914,7 @@ void ModuleBitcodeWriter::writeTypeTable() { TypeVals.clear(); // Loop over all of the types, emitting each in turn. - for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { - Type *T = TypeList[i]; + for (Type *T : TypeList) { int AbbrevToUse = 0; unsigned Code = 0; @@ -3343,19 +3342,18 @@ void ModuleBitcodeWriter::writeFunction( DILocation *LastDL = nullptr; // Finally, emit all the instructions, in order.
- for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - writeInstruction(*I, InstID, Vals); + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) { + writeInstruction(I, InstID, Vals); - if (!I->getType()->isVoidTy()) + if (!I.getType()->isVoidTy()) ++InstID; // If the instruction has metadata, write a metadata attachment later. - NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc(); + NeedsMetadataAttachment |= I.hasMetadataOtherThanDebugLoc(); // If the instruction has a debug location, emit it. - DILocation *DL = I->getDebugLoc(); + DILocation *DL = I.getDebugLoc(); if (!DL) continue; @@ -4429,9 +4427,9 @@ void ModuleBitcodeWriter::write() { // Emit function bodies. DenseMap<const Function *, uint64_t> FunctionToBitcodeIndex; - for (Module::const_iterator F = M.begin(), E = M.end(); F != E; ++F) - if (!F->isDeclaration()) - writeFunction(*F, FunctionToBitcodeIndex); + for (const Function &F : M) + if (!F.isDeclaration()) + writeFunction(F, FunctionToBitcodeIndex); // Need to write after the above call to WriteFunction which populates // the summary information in the index. diff --git a/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 9465a3b11c8f..07e0708e68c3 100644 --- a/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/contrib/llvm-project/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -1148,8 +1148,8 @@ void ValueEnumerator::purgeFunction() { ValueMap.erase(Values[i].first); for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i) MetadataMap.erase(MDs[i]); - for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i) - ValueMap.erase(BasicBlocks[i]); + for (const BasicBlock *BB : BasicBlocks) + ValueMap.erase(BB); Values.resize(NumModuleValues); MDs.resize(NumModuleMDs); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 87a3cede601b..5984063627b0 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -354,8 +354,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // dead, or because only a subregister is live at the def. If we // don't do this the dead def will be incorrectly merged into the // previous def. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -407,8 +406,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Scan the register defs for this instruction and update // live-ranges. 
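This hunk and many of the later CodeGen hunks in this change are the same mechanical cleanup: index-based operand loops are replaced with the MachineInstr::operands() range. A tiny hypothetical helper, not taken from the patch, showing the idiom these hunks converge on:

  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineOperand.h"

  // Count the register defs of MI using range-based operand iteration.
  static unsigned countRegDefs(const llvm::MachineInstr &MI) {
    unsigned NumDefs = 0;
    for (const llvm::MachineOperand &MO : MI.operands())
      if (MO.isReg() && MO.isDef())
        ++NumDefs;
    return NumDefs;
  }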
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -495,8 +493,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, LLVM_DEBUG(dbgs() << "\tKill Group:"); unsigned FirstReg = 0; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -762,11 +759,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // ...need a map from MI to SUnit. std::map<MachineInstr *, const SUnit *> MISUnitMap; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; - MISUnitMap.insert(std::pair<MachineInstr *, const SUnit *>(SU->getInstr(), - SU)); - } + for (const SUnit &SU : SUnits) + MISUnitMap.insert(std::make_pair(SU.getInstr(), &SU)); // Track progress along the critical path through the SUnit graph as // we walk the instructions. This is needed for regclasses that only @@ -774,12 +768,11 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( const SUnit *CriticalPathSU = nullptr; MachineInstr *CriticalPathMI = nullptr; if (CriticalPathSet.any()) { - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; + for (const SUnit &SU : SUnits) { if (!CriticalPathSU || - ((SU->getDepth() + SU->Latency) > + ((SU.getDepth() + SU.Latency) > (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) { - CriticalPathSU = SU; + CriticalPathSU = &SU; } } assert(CriticalPathSU && "Failed to find SUnit critical path"); @@ -839,8 +832,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // but don't cause any anti-dependence breaking themselves) if (!MI.isKill()) { // Attempt to break each anti-dependency... - for (unsigned i = 0, e = Edges.size(); i != e; ++i) { - const SDep *Edge = Edges[i]; + for (const SDep *Edge : Edges) { SUnit *NextSU = Edge->getSUnit(); if ((Edge->getKind() != SDep::Anti) && diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index cc848d28a9a7..828cb760b82e 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -809,9 +809,9 @@ void AsmPrinter::emitFunctionHeader() { // so that we don't get references to undefined symbols. std::vector<MCSymbol*> DeadBlockSyms; MMI->takeDeletedSymbolsForFunction(&F, DeadBlockSyms); - for (unsigned i = 0, e = DeadBlockSyms.size(); i != e; ++i) { + for (MCSymbol *DeadBlockSym : DeadBlockSyms) { OutStreamer->AddComment("Address taken block that was later removed"); - OutStreamer->emitLabel(DeadBlockSyms[i]); + OutStreamer->emitLabel(DeadBlockSym); } if (CurrentFnBegin) { @@ -910,8 +910,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { std::string Str; raw_string_ostream OS(Str); OS << "kill:"; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &Op = MI->getOperand(i); + for (const MachineOperand &Op : MI->operands()) { assert(Op.isReg() && "KILL instruction must have only register operands"); OS << ' ' << (Op.isDef() ? 
"def " : "killed ") << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo()); @@ -2150,8 +2149,7 @@ void AsmPrinter::emitJumpTableInfo() { SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets; const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext); - for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) { - const MachineBasicBlock *MBB = JTBBs[ii]; + for (const MachineBasicBlock *MBB : JTBBs) { if (!EmittedSets.insert(MBB).second) continue; @@ -2177,8 +2175,8 @@ void AsmPrinter::emitJumpTableInfo() { MCSymbol* JTISymbol = GetJTISymbol(JTI); OutStreamer->emitLabel(JTISymbol); - for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) - emitJumpTableEntry(MJTI, JTBBs[ii], JTI); + for (const MachineBasicBlock *MBB : JTBBs) + emitJumpTableEntry(MJTI, MBB, JTI); } if (!JTInDiffSection) OutStreamer->emitDataRegion(MCDR_DataRegionEnd); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index ef1abc47701a..5d0cadefdbf7 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -128,191 +128,29 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, emitInlineAsmEnd(STI, &TAP->getSTI()); } -static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI, - MachineModuleInfo *MMI, const MCAsmInfo *MAI, - AsmPrinter *AP, uint64_t LocCookie, - raw_ostream &OS) { - // Switch to the inline assembly variant. - OS << "\t.intel_syntax\n\t"; - - int CurVariant = -1; // The number of the {.|.|.} region we are in. - const char *LastEmitted = AsmStr; // One past the last character emitted. - unsigned NumOperands = MI->getNumOperands(); - int AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel. - - while (*LastEmitted) { - switch (*LastEmitted) { - default: { - // Not a special case, emit the string section literally. - const char *LiteralEnd = LastEmitted+1; - while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' && - *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n') - ++LiteralEnd; - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - OS.write(LastEmitted, LiteralEnd - LastEmitted); - LastEmitted = LiteralEnd; - break; - } - case '\n': - ++LastEmitted; // Consume newline character. - OS << '\n'; // Indent code with newline. - break; - case '$': { - ++LastEmitted; // Consume '$' character. - bool Done = true; - - // Handle escapes. - switch (*LastEmitted) { - default: Done = false; break; - case '$': - ++LastEmitted; // Consume second '$' character. - break; - case '(': // $( -> same as GCC's { character. - ++LastEmitted; // Consume '(' character. - if (CurVariant != -1) - report_fatal_error("Nested variants found in inline asm string: '" + - Twine(AsmStr) + "'"); - CurVariant = 0; // We're in the first variant now. - break; - case '|': - ++LastEmitted; // Consume '|' character. - if (CurVariant == -1) - OS << '|'; // This is gcc's behavior for | outside a variant. - else - ++CurVariant; // We're in the next variant. - break; - case ')': // $) -> same as GCC's } char. - ++LastEmitted; // Consume ')' character. - if (CurVariant == -1) - OS << '}'; // This is gcc's behavior for } outside a variant. 
- else - CurVariant = -1; - break; - } - if (Done) break; - - bool HasCurlyBraces = false; - if (*LastEmitted == '{') { // ${variable} - ++LastEmitted; // Consume '{' character. - HasCurlyBraces = true; - } - - // If we have ${:foo}, then this is not a real operand reference, it is a - // "magic" string reference, just like in .td files. Arrange to call - // PrintSpecial. - if (HasCurlyBraces && *LastEmitted == ':') { - ++LastEmitted; - const char *StrStart = LastEmitted; - const char *StrEnd = strchr(StrStart, '}'); - if (!StrEnd) - report_fatal_error("Unterminated ${:foo} operand in inline asm" - " string: '" + Twine(AsmStr) + "'"); - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - AP->PrintSpecial(MI, OS, StringRef(StrStart, StrEnd - StrStart)); - LastEmitted = StrEnd+1; - break; - } - - const char *IDStart = LastEmitted; - const char *IDEnd = IDStart; - while (isDigit(*IDEnd)) - ++IDEnd; - - unsigned Val; - if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val)) - report_fatal_error("Bad $ operand number in inline asm string: '" + - Twine(AsmStr) + "'"); - LastEmitted = IDEnd; - - if (Val >= NumOperands - 1) - report_fatal_error("Invalid $ operand number in inline asm string: '" + - Twine(AsmStr) + "'"); - - char Modifier[2] = { 0, 0 }; - - if (HasCurlyBraces) { - // If we have curly braces, check for a modifier character. This - // supports syntax like ${0:u}, which correspond to "%u0" in GCC asm. - if (*LastEmitted == ':') { - ++LastEmitted; // Consume ':' character. - if (*LastEmitted == 0) - report_fatal_error("Bad ${:} expression in inline asm string: '" + - Twine(AsmStr) + "'"); - - Modifier[0] = *LastEmitted; - ++LastEmitted; // Consume modifier character. - } - - if (*LastEmitted != '}') - report_fatal_error("Bad ${} expression in inline asm string: '" + - Twine(AsmStr) + "'"); - ++LastEmitted; // Consume '}' character. - } - - // Okay, we finally have a value number. Ask the target to print this - // operand! - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) { - unsigned OpNo = InlineAsm::MIOp_FirstOperand; - - bool Error = false; - - // Scan to find the machine operand number for the operand. - for (; Val; --Val) { - if (OpNo >= MI->getNumOperands()) - break; - unsigned OpFlags = MI->getOperand(OpNo).getImm(); - OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1; - } - - // We may have a location metadata attached to the end of the - // instruction, and at no point should see metadata at any - // other point while processing. It's an error if so. - if (OpNo >= MI->getNumOperands() || MI->getOperand(OpNo).isMetadata()) { - Error = true; - } else { - unsigned OpFlags = MI->getOperand(OpNo).getImm(); - ++OpNo; // Skip over the ID number. - - // FIXME: Shouldn't arch-independent output template handling go into - // PrintAsmOperand? - // Labels are target independent. - if (MI->getOperand(OpNo).isBlockAddress()) { - const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress(); - MCSymbol *Sym = AP->GetBlockAddressSymbol(BA); - Sym->print(OS, AP->MAI); - MMI->getContext().registerInlineAsmLabel(Sym); - } else if (InlineAsm::isMemKind(OpFlags)) { - Error = AP->PrintAsmMemoryOperand( - MI, OpNo, Modifier[0] ? Modifier : nullptr, OS); - } else { - Error = AP->PrintAsmOperand(MI, OpNo, - Modifier[0] ? 
Modifier : nullptr, OS); - } - } - if (Error) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "invalid operand in inline asm: '" << AsmStr << "'"; - MMI->getModule()->getContext().emitError(LocCookie, Msg.str()); - } - } - break; - } - } +static void EmitInlineAsmStr(const char *AsmStr, const MachineInstr *MI, + MachineModuleInfo *MMI, const MCAsmInfo *MAI, + AsmPrinter *AP, uint64_t LocCookie, + raw_ostream &OS) { + bool InputIsIntelDialect = MI->getInlineAsmDialect() == InlineAsm::AD_Intel; + + if (InputIsIntelDialect) { + // Switch to the inline assembly variant. + OS << "\t.intel_syntax\n\t"; } - OS << "\n\t.att_syntax\n" << (char)0; // null terminate string. -} -static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, - MachineModuleInfo *MMI, const MCAsmInfo *MAI, - AsmPrinter *AP, uint64_t LocCookie, - raw_ostream &OS) { int CurVariant = -1; // The number of the {.|.|.} region we are in. const char *LastEmitted = AsmStr; // One past the last character emitted. unsigned NumOperands = MI->getNumOperands(); - int AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant(); - if (MAI->getEmitGNUAsmStartIndentationMarker()) + int AsmPrinterVariant; + if (InputIsIntelDialect) + AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel. + else + AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant(); + + // FIXME: Should this happen for `asm inteldialect` as well? + if (!InputIsIntelDialect && MAI->getEmitGNUAsmStartIndentationMarker()) OS << '\t'; while (*LastEmitted) { @@ -340,8 +178,9 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, switch (*LastEmitted) { default: Done = false; break; case '$': // $$ -> $ - if (CurVariant == -1 || CurVariant == AsmPrinterVariant) - OS << '$'; + if (!InputIsIntelDialect) + if (CurVariant == -1 || CurVariant == AsmPrinterVariant) + OS << '$'; ++LastEmitted; // Consume second '$' character. break; case '(': // $( -> same as GCC's { character. @@ -480,6 +319,8 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, } } } + if (InputIsIntelDialect) + OS << "\n\t.att_syntax"; OS << '\n' << (char)0; // null terminate string. } @@ -515,9 +356,8 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const { // it. uint64_t LocCookie = 0; const MDNode *LocMD = nullptr; - for (unsigned i = MI->getNumOperands(); i != 0; --i) { - if (MI->getOperand(i-1).isMetadata() && - (LocMD = MI->getOperand(i-1).getMetadata()) && + for (const MachineOperand &MO : llvm::reverse(MI->operands())) { + if (MO.isMetadata() && (LocMD = MO.getMetadata()) && LocMD->getNumOperands() != 0) { if (const ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) { @@ -533,10 +373,7 @@ void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const { raw_svector_ostream OS(StringData); AsmPrinter *AP = const_cast<AsmPrinter*>(this); - if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) - EmitGCCInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); - else - EmitMSInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); + EmitInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS); // Emit warnings if we use reserved registers on the clobber list, as // that might lead to undefined behaviour. 
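The net effect of folding EmitMSInlineAsmStr and EmitGCCInlineAsmStr into the single EmitInlineAsmStr above is that dialect-specific behaviour is now selected per instruction: Intel-dialect blobs are bracketed by .intel_syntax/.att_syntax and their '$$' escapes emit nothing, while AT&T blobs get the optional GNU indentation marker and unescape '$$' to '$'. A minimal sketch of just the bracketing step, as a hypothetical free function rather than the AsmPrinter interface, with template and operand expansion elided:

  #include <string>

  std::string wrapInlineAsm(const std::string &ExpandedBody, bool IsIntelDialect,
                            bool EmitGNUIndentMarker) {
    std::string Out;
    if (IsIntelDialect)
      Out += "\t.intel_syntax\n\t";  // switch to the inline assembly variant
    else if (EmitGNUIndentMarker)
      Out += "\t";                   // GNU-style start-of-asm indentation
    Out += ExpandedBody;             // operand/variant expansion elided here
    if (IsIntelDialect)
      Out += "\n\t.att_syntax";      // switch back after the Intel block
    Out += "\n";
    return Out;
  }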
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index a36d2966d44a..9b73f0ab2f05 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -521,8 +521,8 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { } // Construct a DIE for this scope. -void DwarfCompileUnit::constructScopeDIE( - LexicalScope *Scope, SmallVectorImpl<DIE *> &FinalChildren) { +void DwarfCompileUnit::constructScopeDIE(LexicalScope *Scope, + DIE &ParentScopeDIE) { if (!Scope || !Scope->getScopeNode()) return; @@ -533,46 +533,27 @@ void DwarfCompileUnit::constructScopeDIE( "constructSubprogramScopeDIE for non-inlined " "subprograms"); - SmallVector<DIE *, 8> Children; - - // We try to create the scope DIE first, then the children DIEs. This will - // avoid creating un-used children then removing them later when we find out - // the scope DIE is null. - DIE *ScopeDIE; + // Emit inlined subprograms. if (Scope->getParent() && isa<DISubprogram>(DS)) { - ScopeDIE = constructInlinedScopeDIE(Scope); + DIE *ScopeDIE = constructInlinedScopeDIE(Scope); if (!ScopeDIE) return; - // We create children when the scope DIE is not null. - createScopeChildrenDIE(Scope, Children); - } else { - // Early exit when we know the scope DIE is going to be null. - if (DD->isLexicalScopeDIENull(Scope)) - return; - - bool HasNonScopeChildren = false; - // We create children here when we know the scope DIE is not going to be - // null and the children will be added to the scope DIE. - createScopeChildrenDIE(Scope, Children, &HasNonScopeChildren); - - // If there are only other scopes as children, put them directly in the - // parent instead, as this scope would serve no purpose. - if (!HasNonScopeChildren) { - FinalChildren.insert(FinalChildren.end(), - std::make_move_iterator(Children.begin()), - std::make_move_iterator(Children.end())); - return; - } - ScopeDIE = constructLexicalScopeDIE(Scope); - assert(ScopeDIE && "Scope DIE should not be null."); + ParentScopeDIE.addChild(ScopeDIE); + createAndAddScopeChildren(Scope, *ScopeDIE); + return; } - // Add children - for (auto &I : Children) - ScopeDIE->addChild(std::move(I)); + // Early exit when we know the scope DIE is going to be null. + if (DD->isLexicalScopeDIENull(Scope)) + return; + + // Emit lexical blocks. + DIE *ScopeDIE = constructLexicalScopeDIE(Scope); + assert(ScopeDIE && "Scope DIE should not be null."); - FinalChildren.push_back(std::move(ScopeDIE)); + ParentScopeDIE.addChild(ScopeDIE); + createAndAddScopeChildren(Scope, *ScopeDIE); } void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, @@ -1022,42 +1003,6 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) { return Result; } -DIE *DwarfCompileUnit::createScopeChildrenDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &Children, - bool *HasNonScopeChildren) { - assert(Children.empty()); - DIE *ObjectPointer = nullptr; - - // Emit function arguments (order is significant). - auto Vars = DU->getScopeVariables().lookup(Scope); - for (auto &DV : Vars.Args) - Children.push_back(constructVariableDIE(*DV.second, *Scope, ObjectPointer)); - - // Emit local variables. - auto Locals = sortLocalVars(Vars.Locals); - for (DbgVariable *DV : Locals) - Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer)); - - // Skip imported directives in gmlt-like data. 
- if (!includeMinimalInlineScopes()) { - // There is no need to emit empty lexical block DIE. - for (const auto *IE : ImportedEntities[Scope->getScopeNode()]) - Children.push_back( - constructImportedEntityDIE(cast<DIImportedEntity>(IE))); - } - - if (HasNonScopeChildren) - *HasNonScopeChildren = !Children.empty(); - - for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope)) - Children.push_back(constructLabelDIE(*DL, *Scope)); - - for (LexicalScope *LS : Scope->getChildren()) - constructScopeDIE(LS, Children); - - return ObjectPointer; -} - DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope) { DIE &ScopeDIE = updateSubprogramScopeDIE(Sub); @@ -1088,13 +1033,48 @@ DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub, DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE) { - // We create children when the scope DIE is not null. - SmallVector<DIE *, 8> Children; - DIE *ObjectPointer = createScopeChildrenDIE(Scope, Children); + DIE *ObjectPointer = nullptr; + + // Emit function arguments (order is significant). + auto Vars = DU->getScopeVariables().lookup(Scope); + for (auto &DV : Vars.Args) + ScopeDIE.addChild(constructVariableDIE(*DV.second, *Scope, ObjectPointer)); + + // Emit local variables. + auto Locals = sortLocalVars(Vars.Locals); + for (DbgVariable *DV : Locals) + ScopeDIE.addChild(constructVariableDIE(*DV, *Scope, ObjectPointer)); + + // Emit imported entities (skipped in gmlt-like data). + if (!includeMinimalInlineScopes()) { + for (const auto *IE : ImportedEntities[Scope->getScopeNode()]) + ScopeDIE.addChild(constructImportedEntityDIE(cast<DIImportedEntity>(IE))); + } + + // Emit labels. + for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope)) + ScopeDIE.addChild(constructLabelDIE(*DL, *Scope)); - // Add children - for (auto &I : Children) - ScopeDIE.addChild(std::move(I)); + // Emit inner lexical scopes. + auto needToEmitLexicalScope = [this](LexicalScope *LS) { + if (isa<DISubprogram>(LS->getScopeNode())) + return true; + auto Vars = DU->getScopeVariables().lookup(LS); + if (!Vars.Args.empty() || !Vars.Locals.empty()) + return true; + if (!includeMinimalInlineScopes() && + !ImportedEntities[LS->getScopeNode()].empty()) + return true; + return false; + }; + for (LexicalScope *LS : Scope->getChildren()) { + // If the lexical block doesn't have non-scope children, skip + // its emission and put its children directly to the parent scope. + if (needToEmitLexicalScope(LS)) + constructScopeDIE(LS, ScopeDIE); + else + createAndAddScopeChildren(LS, ScopeDIE); + } return ObjectPointer; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 6e9261087686..fb03982b5e4a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -191,8 +191,7 @@ public: /// variables. DIE &updateSubprogramScopeDIE(const DISubprogram *SP); - void constructScopeDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &FinalChildren); + void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE); /// A helper function to construct a RangeSpanList for a given /// lexical scope. @@ -220,11 +219,6 @@ public: /// Construct a DIE for the given DbgLabel. DIE *constructLabelDIE(DbgLabel &DL, const LexicalScope &Scope); - /// A helper function to create children of a Scope DIE. 
- DIE *createScopeChildrenDIE(LexicalScope *Scope, - SmallVectorImpl<DIE *> &Children, - bool *HasNonScopeChildren = nullptr); - void createBaseTypeDIEs(); /// Construct a DIE for this subprogram scope. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 150f19324834..39f40b172c1b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -162,9 +162,7 @@ bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) { bool MarkedNoUnwind = false; bool SawFunc = false; - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); - + for (const MachineOperand &MO : MI->operands()) { if (!MO.isGlobal()) continue; const Function *F = dyn_cast<Function>(MO.getGlobal()); @@ -386,8 +384,8 @@ MCSymbol *EHStreamer::emitExceptionTable() { SmallVector<const LandingPadInfo *, 64> LandingPads; LandingPads.reserve(PadInfos.size()); - for (unsigned i = 0, N = PadInfos.size(); i != N; ++i) - LandingPads.push_back(&PadInfos[i]); + for (const LandingPadInfo &LPI : PadInfos) + LandingPads.push_back(&LPI); // Order landing pads lexicographically by type id. llvm::sort(LandingPads, [](const LandingPadInfo *L, const LandingPadInfo *R) { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp index 5ac8f49a9522..64dadc82b48b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp @@ -1013,8 +1013,8 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // If this is a large problem, avoid visiting the same basic blocks // multiple times. if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); + for (const MergePotentialsElt &Elt : MergePotentials) + TriedMerging.insert(Elt.getBlock()); // See if we can do any tail merging on those. if (MergePotentials.size() >= 2) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp index 50825ccf9bac..eda0f37fdeb7 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -513,9 +513,7 @@ bool BranchRelaxation::relaxBranchInstructions() { // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. - for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { - MachineBasicBlock &MBB = *I; - + for (MachineBasicBlock &MBB : *MF) { // Empty block? 
MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr(); if (Last == MBB.end()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp index e0e2db9f4725..bbdd8aab502e 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp @@ -58,8 +58,10 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); initializeLowerIntrinsicsPass(Registry); + initializeMIRAddFSDiscriminatorsPass(Registry); initializeMIRCanonicalizerPass(Registry); initializeMIRNamerPass(Registry); + initializeMIRProfileLoaderPassPass(Registry); initializeMachineBlockFrequencyInfoPass(Registry); initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp index a1ff02178ffa..3bed81d5841d 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp @@ -90,7 +90,7 @@ CGOPT(bool, EnableAddrsig) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableDebugEntryValues) -CGOPT(bool, ValueTrackingVariableLocations) +CGOPT_EXP(bool, ValueTrackingVariableLocations) CGOPT(bool, ForceDwarfFrameSection) CGOPT(bool, XRayOmitFunctionIndex) CGOPT(bool, DebugStrictDwarf) @@ -534,12 +534,17 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EmitAddrsig = getEnableAddrsig(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); - Options.ValueTrackingVariableLocations = getValueTrackingVariableLocations(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex(); Options.DebugStrictDwarf = getDebugStrictDwarf(); Options.LoopAlignment = getAlignLoops(); + if (auto Opt = getExplicitValueTrackingVariableLocations()) + Options.ValueTrackingVariableLocations = *Opt; + else + Options.ValueTrackingVariableLocations = + getDefaultValueTrackingVariableLocations(TheTriple); + Options.MCOptions = mc::InitMCTargetOptionsFromFlags(); Options.ThreadModel = getThreadModel(); @@ -692,3 +697,9 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, for (Function &F : M) setFunctionAttributes(CPU, Features, F); } + +bool codegen::getDefaultValueTrackingVariableLocations(const llvm::Triple &T) { + if (T.getArch() == llvm::Triple::x86_64) + return true; + return false; +} diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 981f5973fee8..4e98d49206b5 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -370,9 +370,7 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, // Handle cases in which this instruction defines NewReg. 
MachineInstr *MI = RefOper->getParent(); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &CheckOper = MI->getOperand(i); - + for (const MachineOperand &CheckOper : MI->operands()) { if (CheckOper.isRegMask() && CheckOper.clobbersPhysReg(NewReg)) return true; @@ -462,11 +460,10 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits, // Find the node at the bottom of the critical path. const SUnit *Max = nullptr; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - const SUnit *SU = &SUnits[i]; - MISUnitMap[SU->getInstr()] = SU; - if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency) - Max = SU; + for (const SUnit &SU : SUnits) { + MISUnitMap[SU.getInstr()] = &SU; + if (!Max || SU.getDepth() + SU.Latency > Max->getDepth() + Max->Latency) + Max = &SU; } assert(Max && "Failed to find bottom of the critical path"); @@ -621,8 +618,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits, // is invalid. If the instruction defines other registers, // save a list of them so that we don't pick a new register // that overlaps any of them. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index c6c0b79cd7e7..0bb186a02416 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -76,8 +76,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { return false; // Examine each operand. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.isDef()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { @@ -87,7 +86,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { } else { if (MO.isDead()) { #ifndef NDEBUG - // Sanity check on uses of this dead register. All of them should be + // Basic check on uses of this dead register. All of them should be // 'undef'. for (auto &U : MRI->use_nodbg_operands(Reg)) assert(U.isUndef() && "'Undef' use on a 'dead' register is found!"); @@ -152,8 +151,7 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { } // Record the physreg defs. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { @@ -171,8 +169,7 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { } // Record the physreg uses, after the defs, in case a physreg is // both defined and used in the same instruction.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) { Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg)) { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3a52959d54bf..755b3b844570 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/Casting.h" @@ -3732,8 +3733,7 @@ void CombinerHelper::applyExtendThroughPhis(MachineInstr &MI, Builder.setInstrAndDebugLoc(MI); auto NewPhi = Builder.buildInstrNoInsert(TargetOpcode::G_PHI); NewPhi.addDef(DstReg); - for (unsigned SrcIdx = 1; SrcIdx < MI.getNumOperands(); ++SrcIdx) { - auto &MO = MI.getOperand(SrcIdx); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { if (!MO.isReg()) { NewPhi.addMBB(MO.getMBB()); continue; @@ -3825,8 +3825,7 @@ bool CombinerHelper::matchExtractAllEltsFromBuildVector( unsigned NumElts = DstTy.getNumElements(); SmallBitVector ExtractedElts(NumElts); - for (auto &II : make_range(MRI.use_instr_nodbg_begin(DstReg), - MRI.use_instr_nodbg_end())) { + for (MachineInstr &II : MRI.use_nodbg_instructions(DstReg)) { if (II.getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT) return false; auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI); @@ -3868,6 +3867,51 @@ void CombinerHelper::applyBuildFnNoErase( MatchInfo(Builder); } +bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_OR); + + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned BitWidth = Ty.getScalarSizeInBits(); + + Register ShlSrc, ShlAmt, LShrSrc, LShrAmt; + unsigned FshOpc = 0; + + // Match (or (shl x, amt), (lshr y, sub(bw, amt))). + if (mi_match( + Dst, MRI, + // m_GOr() handles the commuted version as well. + m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)), + m_GLShr(m_Reg(LShrSrc), m_GSub(m_SpecificICstOrSplat(BitWidth), + m_Reg(LShrAmt)))))) { + FshOpc = TargetOpcode::G_FSHL; + + // Match (or (shl x, sub(bw, amt)), (lshr y, amt)). + } else if (mi_match(Dst, MRI, + m_GOr(m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)), + m_GShl(m_Reg(ShlSrc), + m_GSub(m_SpecificICstOrSplat(BitWidth), + m_Reg(ShlAmt)))))) { + FshOpc = TargetOpcode::G_FSHR; + + } else { + return false; + } + + if (ShlAmt != LShrAmt) + return false; + + LLT AmtTy = MRI.getType(ShlAmt); + if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, ShlAmt}); + }; + return true; +} + /// Match an FSHL or FSHR that can be combined to a ROTR or ROTL rotate. bool CombinerHelper::matchFunnelShiftToRotate(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); @@ -4499,20 +4543,9 @@ bool CombinerHelper::matchNarrowBinopFeedingAnd( bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) { unsigned Opc = MI.getOpcode(); assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO); - // Check for a constant 2 or a splat of 2 on the RHS. 
- auto RHS = MI.getOperand(3).getReg(); - bool IsVector = MRI.getType(RHS).isVector(); - if (!IsVector && !mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(2))) - return false; - if (IsVector) { - // FIXME: There's no mi_match pattern for this yet. - auto *RHSDef = getDefIgnoringCopies(RHS, MRI); - if (!RHSDef) - return false; - auto Splat = getBuildVectorConstantSplat(*RHSDef, MRI); - if (!Splat || *Splat != 2) - return false; - } + + if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(2))) + return false; MatchInfo = [=, &MI](MachineIRBuilder &B) { Observer.changingInstr(MI); @@ -4760,6 +4793,556 @@ bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI, return true; } +/// Checks if \p MI is TargetOpcode::G_FMUL and contractable either +/// due to global flags or MachineInstr flags. +static bool isContractableFMul(MachineInstr &MI, bool AllowFusionGlobally) { + if (MI.getOpcode() != TargetOpcode::G_FMUL) + return false; + return AllowFusionGlobally || MI.getFlag(MachineInstr::MIFlag::FmContract); +} + +static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1, + const MachineRegisterInfo &MRI) { + return std::distance(MRI.use_instr_nodbg_begin(MI0.getOperand(0).getReg()), + MRI.use_instr_nodbg_end()) > + std::distance(MRI.use_instr_nodbg_begin(MI1.getOperand(0).getReg()), + MRI.use_instr_nodbg_end()); +} + +bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, + bool &AllowFusionGlobally, + bool &HasFMAD, bool &Aggressive, + bool CanReassociate) { + + auto *MF = MI.getMF(); + const auto &TLI = *MF->getSubtarget().getTargetLowering(); + const TargetOptions &Options = MF->getTarget().Options; + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + + if (CanReassociate && + !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + return false; + + // Floating-point multiply-add with intermediate rounding. + HasFMAD = (LI && TLI.isFMADLegal(MI, DstType)); + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(*MF, DstType) && + isLegalOrBeforeLegalizer({TargetOpcode::G_FMA, {DstType}}); + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return false; + + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath || HasFMAD; + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) + return false; + + Aggressive = TLI.enableAggressiveFMAFusion(DstType); + return true; +} + +bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. 
+ if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (isContractableFMul(*LHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), + RHS->getOperand(0).getReg()}); + }; + return true; + } + + // fold (fadd x, (fmul y, z)) -> (fma y, z, x) + if (isContractableFMul(*RHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(), + LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + MachineInstr *FpExtSrc; + if (mi_match(LHS->getOperand(0).getReg(), MRI, + m_GFPExt(m_MInstr(FpExtSrc))) && + isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FpExtSrc->getOperand(1).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); + auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()}); + }; + return true; + } + + // fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z) + // Note: Commutes FADD operands. 
+ if (mi_match(RHS->getOperand(0).getReg(), MRI, + m_GFPExt(m_MInstr(FpExtSrc))) && + isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FpExtSrc->getOperand(1).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg()); + auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg()); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + MachineInstr *FMA = nullptr; + Register Z; + // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) + if (LHS->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() == + TargetOpcode::G_FMUL) && + MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) { + FMA = LHS; + Z = RHS->getOperand(0).getReg(); + } + // fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z)) + else if (RHS->getOpcode() == PreferredFusedOpcode && + (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() == + TargetOpcode::G_FMUL) && + MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) && + MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) { + Z = LHS->getOperand(0).getReg(); + FMA = RHS; + } + + if (FMA) { + MachineInstr *FMulMI = MRI.getVRegDef(FMA->getOperand(3).getReg()); + Register X = FMA->getOperand(1).getReg(); + Register Y = FMA->getOperand(2).getReg(); + Register U = FMulMI->getOperand(1).getReg(); + Register V = FMulMI->getOperand(2).getReg(); + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register InnerFMA = MRI.createGenericVirtualRegister(DstTy); + B.buildInstr(PreferredFusedOpcode, {InnerFMA}, {U, V, Z}); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {X, Y, InnerFMA}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + if (!Aggressive) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + + 
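Source-level shape of the fadd/fmul contraction family added in these hunks, as a sketch (assuming the contract/unsafe-fp-math conditions checked by canCombineFMadOrFMA are met; the function names are illustrative):

    #include <cmath>

    // a*b + c is the (fadd (fmul a, b), c) shape folded to (fma a, b, c).
    double mad(double a, double b, double c) {
      return a * b + c;
    }

    // The fpext variants: a float multiply feeding a double add becomes an
    // fma on double once both factors are extended.
    double mad_ext(float a, float b, double c) {
      return static_cast<double>(a) * static_cast<double>(b) + c;
    }

    // std::fma is the explicitly fused form with a single rounding, which is
    // what G_FMA models; G_FMAD is the variant with intermediate rounding.
    double mad_fused(double a, double b, double c) {
      return std::fma(a, b, c);
    }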
unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally)) { + if (hasMoreUses(*LHS, *RHS, MRI)) + std::swap(LHS, RHS); + } + + // Builds: (fma x, y, (fma (fpext u), (fpext v), z)) + auto buildMatchInfo = [=, &MI](Register U, Register V, Register Z, Register X, + Register Y, MachineIRBuilder &B) { + Register FpExtU = B.buildFPExt(DstType, U).getReg(0); + Register FpExtV = B.buildFPExt(DstType, V).getReg(0); + Register InnerFMA = + B.buildInstr(PreferredFusedOpcode, {DstType}, {FpExtU, FpExtV, Z}) + .getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {X, Y, InnerFMA}); + }; + + MachineInstr *FMulMI, *FMAMI; + // fold (fadd (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + if (LHS->getOpcode() == PreferredFusedOpcode && + mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(), + LHS->getOperand(2).getReg(), B); + }; + return true; + } + + // fold (fadd (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + FMAMI->getOpcode() == PreferredFusedOpcode) { + MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); + if (isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMAMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + Register X = FMAMI->getOperand(1).getReg(); + Register Y = FMAMI->getOperand(2).getReg(); + X = B.buildFPExt(DstType, X).getReg(0); + Y = B.buildFPExt(DstType, Y).getReg(0); + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + RHS->getOperand(0).getReg(), X, Y, B); + }; + + return true; + } + } + + // fold (fadd z, (fma x, y, (fpext (fmul u, v))) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + if (RHS->getOpcode() == PreferredFusedOpcode && + mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(), + RHS->getOperand(2).getReg(), B); + }; + return true; + } + + // fold (fadd z, (fpext (fma x, y, (fmul u, v))) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. 
+ if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) && + FMAMI->getOpcode() == PreferredFusedOpcode) { + MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg()); + if (isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, + MRI.getType(FMAMI->getOperand(0).getReg()))) { + MatchInfo = [=](MachineIRBuilder &B) { + Register X = FMAMI->getOperand(1).getReg(); + Register Y = FMAMI->getOperand(2).getReg(); + X = B.buildFPExt(DstType, X).getReg(0); + Y = B.buildFPExt(DstType, Y).getReg(0); + buildMatchInfo(FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), + LHS->getOperand(0).getReg(), X, Y, B); + }; + return true; + } + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg()); + MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + int FirstMulHasFewerUses = true; + if (isContractableFMul(*LHS, AllowFusionGlobally) && + isContractableFMul(*RHS, AllowFusionGlobally) && + hasMoreUses(*LHS, *RHS, MRI)) + FirstMulHasFewerUses = false; + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + // fold (fsub (fmul x, y), z) -> (fma x, y, -z) + if (FirstMulHasFewerUses && + (isContractableFMul(*LHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ}); + }; + return true; + } + // fold (fsub x, (fmul y, z)) -> (fma -y, z, x) + else if ((isContractableFMul(*RHS, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0); + B.buildInstr( + PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? 
TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + MachineInstr *FMulMI; + // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) + if (mi_match(LHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && + (Aggressive || (MRI.hasOneNonDBGUse(LHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + isContractableFMul(*FMulMI, AllowFusionGlobally)) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register NegX = + B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegX, FMulMI->getOperand(2).getReg(), NegZ}); + }; + return true; + } + + // fold (fsub x, (fneg (fmul, y, z))) -> (fma y, z, x) + if (mi_match(RHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) && + (Aggressive || (MRI.hasOneNonDBGUse(RHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) && + isContractableFMul(*FMulMI, AllowFusionGlobally)) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), LHSReg}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + MachineInstr *FMulMI; + // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z)) + if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FpExtX = + B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register FpExtY = + B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0); + Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {FpExtX, FpExtY, NegZ}); + }; + return true; + } + + // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x) + if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FpExtY = + B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); + Register NegY = B.buildFNeg(DstTy, FpExtY).getReg(0); + Register FpExtZ = + B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0); + B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, + {NegY, FpExtZ, LHSReg}); + }; + return true; + } + + return false; +} + +bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FSUB); + + bool AllowFusionGlobally, HasFMAD, Aggressive; + if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive)) + return false; + + const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + Register LHSReg = 
MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + + auto buildMatchInfo = [=](Register Dst, Register X, Register Y, Register Z, + MachineIRBuilder &B) { + Register FpExtX = B.buildFPExt(DstTy, X).getReg(0); + Register FpExtY = B.buildFPExt(DstTy, Y).getReg(0); + B.buildInstr(PreferredFusedOpcode, {Dst}, {FpExtX, FpExtY, Z}); + }; + + MachineInstr *FMulMI; + // fold (fsub (fpext (fneg (fmul x, y))), z) -> + // (fneg (fma (fpext x), (fpext y), z)) + // fold (fsub (fneg (fpext (fmul x, y))), z) -> + // (fneg (fma (fpext x), (fpext y), z)) + if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || + mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Register FMAReg = MRI.createGenericVirtualRegister(DstTy); + buildMatchInfo(FMAReg, FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), RHSReg, B); + B.buildFNeg(MI.getOperand(0).getReg(), FMAReg); + }; + return true; + } + + // fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x) + // fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x) + if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || + mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && + isContractableFMul(*FMulMI, AllowFusionGlobally) && + TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, + MRI.getType(FMulMI->getOperand(0).getReg()))) { + MatchInfo = [=, &MI](MachineIRBuilder &B) { + buildMatchInfo(MI.getOperand(0).getReg(), FMulMI->getOperand(1).getReg(), + FMulMI->getOperand(2).getReg(), LHSReg, B); + }; + return true; + } + + return false; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c74bec7dfc0d..e09cd26eb0c1 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -585,8 +585,8 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, // FIXME: What does the original arg index mean here? SmallVector<CallLowering::ArgInfo, 3> Args; - for (unsigned i = 1; i < MI.getNumOperands(); i++) - Args.push_back({MI.getOperand(i).getReg(), OpType, 0}); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + Args.push_back({MO.getReg(), OpType, 0}); return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType, 0}, Args); } @@ -1500,8 +1500,8 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideDstTy = LLT::scalar(NumMerge * WideSize); // Decompose the original operands if they don't evenly divide. 
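Several hunks here and below replace manual index-from-1 operand loops with llvm::drop_begin; a small sketch of the equivalence (the `use` callback is a hypothetical stand-in for whatever the loop body does):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Visit every operand except operand 0 (typically the def).
    void forEachSource(const llvm::MachineInstr &MI,
                       llvm::function_ref<void(const llvm::MachineOperand &)> use) {
      // Old form:
      //   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      //     use(MI.getOperand(I));
      // Form used throughout this patch; drop_begin skips one element by default.
      for (const llvm::MachineOperand &MO : llvm::drop_begin(MI.operands()))
        use(MO);
    }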
- for (int I = 1, E = MI.getNumOperands(); I != E; ++I) { - Register SrcReg = MI.getOperand(I).getReg(); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { + Register SrcReg = MO.getReg(); if (GCD == SrcSize) { Unmerges.push_back(SrcReg); } else { @@ -4037,8 +4037,8 @@ LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, // Break into a common type SmallVector<Register, 16> Parts; - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg()); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + extractGCDType(Parts, GCDTy, MO.getReg()); // Build the requested new merge, padding with undef. LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, @@ -7782,7 +7782,6 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, // of that value loaded. This can result in a sequence of loads and stores // mixed types, depending on what the target specifies as good types to use. unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); unsigned Size = KnownLen; for (auto CopyTy : MemOps) { // Issuing an unaligned load / store pair that overlaps with the previous @@ -7800,15 +7799,19 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, Register LoadPtr = Src; Register Offset; if (CurrOffset != 0) { - Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + LLT SrcTy = MRI.getType(Src); + Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) .getReg(0); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); } auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); // Create the store. - Register StorePtr = - CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + Register StorePtr = Dst; + if (CurrOffset != 0) { + LLT DstTy = MRI.getType(Dst); + StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + } MIB.buildStore(LdVal, StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); Size -= CopyTy.getSizeInBytes(); @@ -7885,7 +7888,6 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, // Apart from that, this loop is pretty much doing the same thing as the // memcpy codegen function. unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); SmallVector<Register, 16> LoadVals; for (auto CopyTy : MemOps) { // Construct MMO for the load. @@ -7895,9 +7897,10 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, // Create the load. 
Register LoadPtr = Src; if (CurrOffset != 0) { + LLT SrcTy = MRI.getType(Src); auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); + MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); + LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); } LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); CurrOffset += CopyTy.getSizeInBytes(); @@ -7912,9 +7915,10 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, Register StorePtr = Dst; if (CurrOffset != 0) { + LLT DstTy = MRI.getType(Dst); auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); + StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 1a2102e3ef21..650500c7eb31 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -123,7 +123,7 @@ const RegisterBank *RegisterBankInfo::getRegBankFromConstraints( Register Reg = MI.getOperand(OpIdx).getReg(); const RegisterBank &RegBank = getRegBankFromRegClass(*RC, MRI.getType(Reg)); - // Sanity check that the target properly implemented getRegBankFromRegClass. + // Check that the target properly implemented getRegBankFromRegClass. assert(RegBank.covers(*RC) && "The mapping of the register bank does not make sense"); return &RegBank; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 1a440c064a59..b0b84763e922 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -834,10 +834,9 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, case TargetOpcode::G_BUILD_VECTOR: { // TODO: Probably should have a recursion depth guard since you could have // bitcasted vector elements. - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - if (!isKnownToBeAPowerOfTwo(MI.getOperand(I).getReg(), MRI, KB)) + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + if (!isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB)) return false; - } return true; } @@ -845,8 +844,8 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, // Only handle constants since we would need to know if number of leading // zeros is greater than the truncation amount. 
const unsigned BitWidth = Ty.getScalarSizeInBits(); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - auto Const = getIConstantVRegVal(MI.getOperand(I).getReg(), MRI); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { + auto Const = getIConstantVRegVal(MO.getReg(), MRI); if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2()) return false; } @@ -1031,16 +1030,22 @@ Optional<ValueAndVReg> getAnyConstantSplat(Register VReg, return SplatValAndReg; } -bool isBuildVectorConstantSplat(const MachineInstr &MI, - const MachineRegisterInfo &MRI, - int64_t SplatValue, bool AllowUndef) { - if (auto SplatValAndReg = - getAnyConstantSplat(MI.getOperand(0).getReg(), MRI, AllowUndef)) +} // end anonymous namespace + +bool llvm::isBuildVectorConstantSplat(const Register Reg, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef) { + if (auto SplatValAndReg = getAnyConstantSplat(Reg, MRI, AllowUndef)) return mi_match(SplatValAndReg->VReg, MRI, m_SpecificICst(SplatValue)); return false; } -} // end anonymous namespace +bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef) { + return isBuildVectorConstantSplat(MI.getOperand(0).getReg(), MRI, SplatValue, + AllowUndef); +} Optional<int64_t> llvm::getBuildVectorConstantSplat(const MachineInstr &MI, diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp index 6c1ce4c1efb0..bbd9006a5d8c 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp @@ -399,8 +399,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, // having a single global, but is aggressive enough for any other case. if (GlobalMergeIgnoreSingleUse) { BitVector AllGlobals(Globals.size()); - for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) { - const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1]; + for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { if (UGS.UsageCount == 0) continue; if (UGS.Globals.count() > 1) @@ -418,8 +417,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, BitVector PickedGlobals(Globals.size()); bool Changed = false; - for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) { - const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1]; + for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { if (UGS.UsageCount == 0) continue; if (PickedGlobals.anyCommon(UGS.Globals)) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp index e4606daba352..2d38a44d5a33 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp @@ -260,10 +260,12 @@ bool IndirectBrExpandPass::runOnFunction(Function &F) { if (DTU) { // If there were multiple indirectbr's, they may have common successors, // but in the dominator tree, we only track unique edges. 
- SmallPtrSet<BasicBlock *, 8> UniqueSuccessors(BBs.begin(), BBs.end()); - Updates.reserve(Updates.size() + UniqueSuccessors.size()); - for (BasicBlock *BB : UniqueSuccessors) - Updates.push_back({DominatorTree::Insert, SwitchBB, BB}); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessors; + Updates.reserve(Updates.size() + BBs.size()); + for (BasicBlock *BB : BBs) { + if (UniqueSuccessors.insert(BB).second) + Updates.push_back({DominatorTree::Insert, SwitchBB, BB}); + } DTU->applyUpdates(Updates); } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp index 64e1f4351456..fc5ac45752ca 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp @@ -274,11 +274,9 @@ static Register isFullCopyOf(const MachineInstr &MI, Register Reg) { } static void getVDefInterval(const MachineInstr &MI, LiveIntervals &LIS) { - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isDef() && Register::isVirtualRegister(MO.getReg())) LIS.getInterval(MO.getReg()); - } } /// isSnippet - Identify if a live interval is a snippet that should be spilled. @@ -583,11 +581,9 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { if (!ParentVNI) { LLVM_DEBUG(dbgs() << "\tadding <undef> flags: "); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg()) MO.setIsUndef(); - } LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI); return true; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp index c3e0553418a5..fab6b8d10a33 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp @@ -73,11 +73,9 @@ void LatencyPriorityQueue::push(SUnit *SU) { // Look at all of the successors of this node. Count the number of nodes that // this node is the sole unscheduled node for. unsigned NumNodesBlocking = 0; - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (getSingleUnscheduledPred(I->getSUnit()) == SU) + for (const SDep &Succ : SU->Succs) + if (getSingleUnscheduledPred(Succ.getSUnit()) == SU) ++NumNodesBlocking; - } NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking; Queue.push_back(SU); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a4eb3094612b..cf62b0e5d7e8 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -544,8 +544,7 @@ public: // Re-state the variable location: if there's no replacement then NewLoc // is None and a $noreg DBG_VALUE will be created. Otherwise, a DBG_VALUE // identifying the alternative location will be emitted. 
- const DIExpression *Expr = ActiveVLocIt->second.Properties.DIExpr; - DbgValueProperties Properties(Expr, false); + const DbgValueProperties &Properties = ActiveVLocIt->second.Properties; PendingDbgValues.push_back(MTracker->emitLoc(NewLoc, Var, Properties)); // Update machine locations <=> variable locations maps. Defer updating @@ -836,6 +835,15 @@ MachineInstrBuilder MLocTracker::emitLoc(Optional<LocIdx> MLoc, unsigned Base = Spill.SpillBase; MIB.addReg(Base); MIB.addImm(0); + + // Being on the stack makes this location indirect; if it was _already_ + // indirect though, we need to add extra indirection. See this test for + // a scenario where this happens: + // llvm/test/DebugInfo/X86/spill-nontrivial-param.ll + if (Properties.Indirect) { + std::vector<uint64_t> Elts = {dwarf::DW_OP_deref}; + Expr = DIExpression::append(Expr, Elts); + } } else { // This is a stack location with a weird subregister offset: emit an undef // DBG_VALUE instead. @@ -1288,6 +1296,24 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { } else if (MI.isMetaInstruction()) return; + // We always ignore SP defines on call instructions, they don't actually + // change the value of the stack pointer... except for win32's _chkstk. This + // is rare: filter quickly for the common case (no stack adjustments, not a + // call, etc). If it is a call that modifies SP, recognise the SP register + // defs. + bool CallChangesSP = false; + if (AdjustsStackInCalls && MI.isCall() && MI.getOperand(0).isSymbol() && + !strcmp(MI.getOperand(0).getSymbolName(), StackProbeSymbolName.data())) + CallChangesSP = true; + + // Test whether we should ignore a def of this register due to it being part + // of the stack pointer. + auto IgnoreSPAlias = [this, &MI, CallChangesSP](Register R) -> bool { + if (CallChangesSP) + return false; + return MI.isCall() && MTracker->SPAliases.count(R); + }; + // Find the regs killed by MI, and find regmasks of preserved regs. // Max out the number of statically allocated elements in `DeadRegs`, as this // prevents fallback to std::set::count() operations. @@ -1298,7 +1324,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // Determine whether the operand is a register def. if (MO.isReg() && MO.isDef() && MO.getReg() && Register::isPhysicalRegister(MO.getReg()) && - !(MI.isCall() && MTracker->SPAliases.count(MO.getReg()))) { + !IgnoreSPAlias(MO.getReg())) { // Remove ranges of all aliased registers. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) // FIXME: Can we break out of this loop early if no insertion occurs? @@ -1347,6 +1373,9 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { continue; Register Reg = MTracker->LocIdxToLocID[L.Idx]; + if (IgnoreSPAlias(Reg)) + continue; + for (auto *MO : RegMaskPtrs) if (MO->clobbersPhysReg(Reg)) TTracker->clobberMloc(L.Idx, MI.getIterator(), false); @@ -1628,9 +1657,10 @@ bool InstrRefBasedLDV::transferRegisterCopy(MachineInstr &MI) { /// fragments of that DILocalVariable which overlap. This reduces work during /// the data-flow stage from "Find any overlapping fragments" to "Check if the /// known-to-overlap fragments are present". -/// \param MI A previously unprocessed DEBUG_VALUE instruction to analyze for +/// \param MI A previously unprocessed debug instruction to analyze for /// fragment usage. 
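The spill path above makes the location indirect by appending DW_OP_deref to the variable's expression; a minimal sketch of just that append, with nothing about the surrounding tracker state:

    #include "llvm/BinaryFormat/Dwarf.h"
    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>
    #include <vector>

    // Add one extra level of dereference to an existing debug expression, as
    // done when an already-indirect value is moved into a spill slot.
    const llvm::DIExpression *addSpillDeref(const llvm::DIExpression *Expr) {
      std::vector<uint64_t> Ops = {llvm::dwarf::DW_OP_deref};
      return llvm::DIExpression::append(Expr, Ops);
    }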
void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) { + assert(MI.isDebugValue() || MI.isDebugRef()); DebugVariable MIVar(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); FragmentInfo ThisFragment = MIVar.getFragmentOrDefault(); @@ -1732,7 +1762,7 @@ void InstrRefBasedLDV::produceMLocTransferFunction( for (auto &MI : MBB) { process(MI); // Also accumulate fragment map. - if (MI.isDebugValue()) + if (MI.isDebugValue() || MI.isDebugRef()) accumulateFragmentMap(MI); // Create a map from the instruction number (if present) to the @@ -2322,15 +2352,8 @@ Optional<ValueIDNum> InstrRefBasedLDV::pickVPHILoc( bool InstrRefBasedLDV::vlocJoin( MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, - SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks, SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore, DbgValue &LiveIn) { - // To emulate VarLocBasedImpl, process this block if it's not in scope but - // _does_ assign a variable value. No live-ins for this scope are transferred - // in though, so we can return immediately. - if (InScopeBlocks.count(&MBB) == 0 && !ArtificialBlocks.count(&MBB)) - return false; - LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n"); bool Changed = false; @@ -2466,11 +2489,10 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, // "blocks that are potentially in scope. See comment at start of vlocJoin. SmallPtrSet<const MachineBasicBlock *, 8> InScopeBlocks = BlocksToExplore; - // Old LiveDebugValues tracks variable locations that come out of blocks - // not in scope, where DBG_VALUEs occur. This is something we could - // legitimately ignore, but lets allow it for now. - if (EmulateOldLDV) - BlocksToExplore.insert(AssignBlocks.begin(), AssignBlocks.end()); + // VarLoc LiveDebugValues tracks variable locations that are defined in + // blocks not in scope. This is something we could legitimately ignore, but + // lets allow it for now for the sake of coverage. + BlocksToExplore.insert(AssignBlocks.begin(), AssignBlocks.end()); // We also need to propagate variable values through any artificial blocks // that immediately follow blocks in scope. @@ -2635,7 +2657,7 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, // Join values from predecessors. Updates LiveInIdx, and writes output // into JoinedInLocs. bool InLocsChanged = - vlocJoin(*MBB, LiveOutIdx, InScopeBlocks, BlocksToExplore, *LiveIn); + vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn); SmallVector<const MachineBasicBlock *, 8> Preds; for (const auto *Pred : MBB->predecessors()) @@ -2730,6 +2752,8 @@ void InstrRefBasedLDV::buildVLocValueMap(const DILocation *DILoc, continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; + assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == + Var.getFragment() && "Fragment info missing during value prop"); Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn)); } } // Per-variable loop. 
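The fragment bookkeeping above (accumulateFragmentMap, the fragment-info assert in the per-variable loop) hinges on knowing when two fragments of the same variable overlap; a standalone illustration of that condition over DIExpression::FragmentInfo, not the helper the pass itself uses:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>

    // Two fragments of one variable overlap when their bit ranges intersect.
    bool fragmentsIntersect(const llvm::DIExpression::FragmentInfo &A,
                            const llvm::DIExpression::FragmentInfo &B) {
      uint64_t AEnd = A.OffsetInBits + A.SizeInBits;
      uint64_t BEnd = B.OffsetInBits + B.SizeInBits;
      return A.OffsetInBits < BEnd && B.OffsetInBits < AEnd;
    }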
@@ -2879,6 +2903,12 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MFI = &MF.getFrameInfo(); LS.initialize(MF); + const auto &STI = MF.getSubtarget(); + AdjustsStackInCalls = MFI->adjustsStack() && + STI.getFrameLowering()->stackProbeFunctionModifiesSP(); + if (AdjustsStackInCalls) + StackProbeSymbolName = STI.getTargetLowering()->getStackProbeSymbolName(MF); + MTracker = new MLocTracker(MF, *TII, *TRI, *MF.getSubtarget().getTargetLowering()); VTracker = nullptr; @@ -2895,7 +2925,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, ++MaxNumBlocks; MLocTransfer.resize(MaxNumBlocks); - vlocs.resize(MaxNumBlocks); + vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); initialSetup(MF); @@ -3040,6 +3070,8 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, BBNumToRPO.clear(); DebugInstrNumToInstr.clear(); DebugPHINumToValue.clear(); + OverlapFragments.clear(); + SeenFragments.clear(); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index d96ef6d4f6e5..789205e61cdb 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -655,6 +655,14 @@ public: const DbgValueProperties &Properties); }; +/// Types for recording sets of variable fragments that overlap. For a given +/// local variable, we record all other fragments of that variable that could +/// overlap it, to reduce search time. +using FragmentOfVar = + std::pair<const DILocalVariable *, DIExpression::FragmentInfo>; +using OverlapMap = + DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>; + /// Collection of DBG_VALUEs observed when traversing a block. Records each /// variable and the value the DBG_VALUE refers to. Requires the machine value /// location dataflow algorithm to have run already, so that values can be @@ -672,9 +680,12 @@ public: MapVector<DebugVariable, DbgValue> Vars; DenseMap<DebugVariable, const DILocation *> Scopes; MachineBasicBlock *MBB = nullptr; + const OverlapMap &OverlappingFragments; + DbgValueProperties EmptyProperties; public: - VLocTracker() {} + VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr) + : OverlappingFragments(O), EmptyProperties(EmptyExpr, false) {} void defVar(const MachineInstr &MI, const DbgValueProperties &Properties, Optional<ValueIDNum> ID) { @@ -689,6 +700,8 @@ public: if (!Result.second) Result.first->second = Rec; Scopes[Var] = MI.getDebugLoc().get(); + + considerOverlaps(Var, MI.getDebugLoc().get()); } void defVar(const MachineInstr &MI, const MachineOperand &MO) { @@ -704,16 +717,37 @@ public: if (!Result.second) Result.first->second = Rec; Scopes[Var] = MI.getDebugLoc().get(); + + considerOverlaps(Var, MI.getDebugLoc().get()); } -}; -/// Types for recording sets of variable fragments that overlap. For a given -/// local variable, we record all other fragments of that variable that could -/// overlap it, to reduce search time. 
-using FragmentOfVar = - std::pair<const DILocalVariable *, DIExpression::FragmentInfo>; -using OverlapMap = - DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>; + void considerOverlaps(const DebugVariable &Var, const DILocation *Loc) { + auto Overlaps = OverlappingFragments.find( + {Var.getVariable(), Var.getFragmentOrDefault()}); + if (Overlaps == OverlappingFragments.end()) + return; + + // Otherwise: terminate any overlapped variable locations. + for (auto FragmentInfo : Overlaps->second) { + // The "empty" fragment is stored as DebugVariable::DefaultFragment, so + // that it overlaps with everything, however its cannonical representation + // in a DebugVariable is as "None". + Optional<DIExpression::FragmentInfo> OptFragmentInfo = FragmentInfo; + if (DebugVariable::isDefaultFragment(FragmentInfo)) + OptFragmentInfo = None; + + DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo, + Var.getInlinedAt()); + DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); + + // Attempt insertion; overwrite if it's already mapped. + auto Result = Vars.insert(std::make_pair(Overlapped, Rec)); + if (!Result.second) + Result.first->second = Rec; + Scopes[Overlapped] = Loc; + } + } +}; // XXX XXX docs class InstrRefBasedLDV : public LDVImpl { @@ -817,6 +851,16 @@ private: OverlapMap OverlapFragments; VarToFragments SeenFragments; + /// True if we need to examine call instructions for stack clobbers. We + /// normally assume that they don't clobber SP, but stack probes on Windows + /// do. + bool AdjustsStackInCalls = false; + + /// If AdjustsStackInCalls is true, this holds the name of the target's stack + /// probe function, which is the function we expect will alter the stack + /// pointer. + StringRef StackProbeSymbolName; + /// Tests whether this instruction is a spill to a stack slot. bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF); @@ -962,7 +1006,6 @@ private: /// \returns true if any live-ins change value, either from value propagation /// or PHI elimination. bool vlocJoin(MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs, - SmallPtrSet<const MachineBasicBlock *, 8> &InScopeBlocks, SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore, DbgValue &LiveIn); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp index dcd546f9c6db..5f976bf43c5b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -1875,34 +1875,57 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { LLVM_DEBUG(dbgs() << "********** EMITTING INSTR REFERENCES **********\n"); - // Re-insert any debug instrs back in the position they were. Ordering - // is preserved by vector. We must re-insert in the same order to ensure that - // debug instructions don't swap, which could re-order assignments. - for (auto &P : StashedDebugInstrs) { - SlotIndex Idx = P.Idx; + // Re-insert any debug instrs back in the position they were. We must + // re-insert in the same order to ensure that debug instructions don't swap, + // which could re-order assignments. Do so in a batch -- once we find the + // insert position, insert all instructions at the same SlotIdx. They are + // guaranteed to appear in-sequence in StashedDebugInstrs because we insert + // them in order. 
+ for (auto StashIt = StashedDebugInstrs.begin(); + StashIt != StashedDebugInstrs.end(); ++StashIt) { + SlotIndex Idx = StashIt->Idx; + MachineBasicBlock *MBB = StashIt->MBB; + MachineInstr *MI = StashIt->MI; + + auto EmitInstsHere = [this, &StashIt, MBB, Idx, + MI](MachineBasicBlock::iterator InsertPos) { + // Insert this debug instruction. + MBB->insert(InsertPos, MI); + + // Look at subsequent stashed debug instructions: if they're at the same + // index, insert those too. + auto NextItem = std::next(StashIt); + while (NextItem != StashedDebugInstrs.end() && NextItem->Idx == Idx) { + assert(NextItem->MBB == MBB && "Instrs with same slot index should be" + "in the same block"); + MBB->insert(InsertPos, NextItem->MI); + StashIt = NextItem; + NextItem = std::next(StashIt); + }; + }; // Start block index: find the first non-debug instr in the block, and // insert before it. - if (Idx == Slots->getMBBStartIdx(P.MBB)) { + if (Idx == Slots->getMBBStartIdx(MBB)) { MachineBasicBlock::iterator InsertPos = - findInsertLocation(P.MBB, Idx, *LIS, BBSkipInstsMap); - P.MBB->insert(InsertPos, P.MI); + findInsertLocation(MBB, Idx, *LIS, BBSkipInstsMap); + EmitInstsHere(InsertPos); continue; } if (MachineInstr *Pos = Slots->getInstructionFromIndex(Idx)) { // Insert at the end of any debug instructions. auto PostDebug = std::next(Pos->getIterator()); - PostDebug = skipDebugInstructionsForward(PostDebug, P.MBB->instr_end()); - P.MBB->insert(PostDebug, P.MI); + PostDebug = skipDebugInstructionsForward(PostDebug, MBB->instr_end()); + EmitInstsHere(PostDebug); } else { // Insert position disappeared; walk forwards through slots until we // find a new one. - SlotIndex End = Slots->getMBBEndIdx(P.MBB); + SlotIndex End = Slots->getMBBEndIdx(MBB); for (; Idx < End; Idx = Slots->getNextNonNullIndex(Idx)) { Pos = Slots->getInstructionFromIndex(Idx); if (Pos) { - P.MBB->insert(Pos->getIterator(), P.MI); + EmitInstsHere(Pos->getIterator()); break; } } @@ -1911,8 +1934,8 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { // insert! It's not safe to discard any debug instructions; place them // in front of the first terminator, or in front of end(). if (Idx >= End) { - auto TermIt = P.MBB->getFirstTerminator(); - P.MBB->insert(TermIt, P.MI); + auto TermIt = MBB->getFirstTerminator(); + EmitInstsHere(TermIt); } } } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp index d91ff734ad8f..6380c4bfd6e6 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -108,8 +108,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex UseIdx) const { OrigIdx = OrigIdx.getRegSlot(true); UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); - for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OrigMI->getOperand(i); + for (const MachineOperand &MO : OrigMI->operands()) { if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) continue; @@ -425,15 +424,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead, // The new intervals would have to be spilled anyway so its not worth it. // Also they currently aren't spilled so creating them and not spilling // them results in incorrect code. 
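The surrounding hunks lean on two more ADT helpers, llvm::is_contained (LiveRangeEdit, next hunk) and llvm::reverse (GlobalMerge, earlier); a small sketch of both over a plain std::vector:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    // llvm::reverse walks a range back-to-front without index arithmetic.
    int lastNonZero(const std::vector<int> &V) {
      for (int X : llvm::reverse(V))
        if (X != 0)
          return X;
      return 0;
    }

    // llvm::is_contained is the linear membership test the removed loop in
    // eliminateDeadDefs spelled out by hand.
    bool hasValue(const std::vector<int> &V, int X) {
      return llvm::is_contained(V, X);
    }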
- bool BeingSpilled = false; - for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) { - if (VReg == RegsBeingSpilled[i]) { - BeingSpilled = true; - break; - } - } - - if (BeingSpilled) continue; + if (llvm::is_contained(RegsBeingSpilled, VReg)) + continue; // LI may have been separated, create new intervals. LI->RenumberValues(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h index dace05f1ad95..ada5c5be484a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h @@ -18,7 +18,7 @@ namespace llvm { /// Helper function that distributes live range value numbers and the -/// corresponding segments of a master live range \p LR to a list of newly +/// corresponding segments of a primary live range \p LR to a list of newly /// created live ranges \p SplitLRs. \p VNIClasses maps each value number in \p /// LR to 0 meaning it should stay or to 1..N meaning it should go to a specific /// live range in the \p SplitLRs array. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp index 51ba4b7e53eb..e8744797707b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp @@ -58,9 +58,9 @@ void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const { MachineInstr * LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { - for (unsigned i = 0, e = Kills.size(); i != e; ++i) - if (Kills[i]->getParent() == MBB) - return Kills[i]; + for (MachineInstr *MI : Kills) + if (MI->getParent() == MBB) + return MI; return nullptr; } @@ -811,8 +811,8 @@ bool LiveVariables::isLiveOut(Register Reg, const MachineBasicBlock &MBB) { LiveVariables::VarInfo &VI = getVarInfo(Reg); SmallPtrSet<const MachineBasicBlock *, 8> Kills; - for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i) - Kills.insert(VI.Kills[i]->getParent()); + for (MachineInstr *MI : VI.Kills) + Kills.insert(MI->getParent()); // Loop over all of the successors of the basic block, checking to see if // the value is either live in the block, or if it is killed in the block. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 2e99c8595cbd..ee2387d1e8e6 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -316,14 +316,14 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // than that, but the increased register pressure makes that a // tricky thing to balance. Investigate if re-materializing these // becomes an issue. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI.operands()) { // Consider replacing all frame index operands that reference // an object allocated in the local block. - if (MI.getOperand(i).isFI()) { + if (MO.isFI()) { // Don't try this with values not in the local block. 
- if (!MFI.isObjectPreAllocated(MI.getOperand(i).getIndex())) + if (!MFI.isObjectPreAllocated(MO.getIndex())) break; - int Idx = MI.getOperand(i).getIndex(); + int Idx = MO.getIndex(); int64_t LocalOffset = LocalOffsets[Idx]; if (!TRI->needsFrameBaseReg(&MI, LocalOffset)) break; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp index 90ecc6fc68fc..b742ad9823c9 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -314,6 +314,8 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) { } bool Changed = MIRSampleLoader->runOnFunction(MF); + if (Changed) + MBFI->calculate(MF, *MBFI->getMBPI(), *&getAnalysis<MachineLoopInfo>()); if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None && (ViewBlockFreqFuncName.empty() || diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp index 366d06871245..310c2721c3bd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp @@ -1170,9 +1170,10 @@ auto MachineFunction::salvageCopySSA(MachineInstr &MI) void MachineFunction::finalizeDebugInstrRefs() { auto *TII = getSubtarget().getInstrInfo(); - auto MakeDbgValue = [&](MachineInstr &MI) { + auto MakeUndefDbgValue = [&](MachineInstr &MI) { const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_VALUE); MI.setDesc(RefII); + MI.getOperand(0).setReg(0); MI.getOperand(1).ChangeToRegister(0, false); }; @@ -1187,15 +1188,15 @@ void MachineFunction::finalizeDebugInstrRefs() { Register Reg = MI.getOperand(0).getReg(); // Some vregs can be deleted as redundant in the meantime. Mark those - // as DBG_VALUE $noreg. - if (Reg == 0) { - MakeDbgValue(MI); + // as DBG_VALUE $noreg. Additionally, some normal instructions are + // quickly deleted, leaving dangling references to vregs with no def. 
+ if (Reg == 0 || !RegInfo->hasOneDef(Reg)) { + MakeUndefDbgValue(MI); continue; } assert(Reg.isVirtual()); MachineInstr &DefMI = *RegInfo->def_instr_begin(Reg); - assert(RegInfo->hasOneDef(Reg)); // If we've found a copy-like instruction, follow it back to the // instruction that defines the source value, see salvageCopySSA docs @@ -1327,9 +1328,9 @@ bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx, assert(Old != New && "Not making a change?"); bool MadeChange = false; MachineJumpTableEntry &JTE = JumpTables[Idx]; - for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j) - if (JTE.MBBs[j] == Old) { - JTE.MBBs[j] = New; + for (MachineBasicBlock *&MBB : JTE.MBBs) + if (MBB == Old) { + MBB = New; MadeChange = true; } return MadeChange; @@ -1342,8 +1343,8 @@ void MachineJumpTableInfo::print(raw_ostream &OS) const { for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) { OS << printJumpTableEntryReference(i) << ':'; - for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j) - OS << ' ' << printMBBReference(*JumpTables[i].MBBs[j]); + for (const MachineBasicBlock *MBB : JumpTables[i].MBBs) + OS << ' ' << printMBBReference(*MBB); if (i != e) OS << '\n'; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp index 5c4f75e9ceb9..aaa80432d2f2 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp @@ -1490,12 +1490,10 @@ bool MachineInstr::allDefsAreDead() const { /// instruction to this instruction. void MachineInstr::copyImplicitOps(MachineFunction &MF, const MachineInstr &MI) { - for (unsigned i = MI.getDesc().getNumOperands(), e = MI.getNumOperands(); - i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), MI.getDesc().getNumOperands())) if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) addOperand(MF, MO); - } } bool MachineInstr::hasComplexRegisterTies() const { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp index 4d080e1a4f82..680dbe54ffaf 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp @@ -1071,7 +1071,9 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) { // The Value and Offset may differ due to CSE. But the flags and size // should be the same. assert(MMO->getFlags() == getFlags() && "Flags mismatch!"); - assert(MMO->getSize() == getSize() && "Size mismatch!"); + assert((MMO->getSize() == ~UINT64_C(0) || getSize() == ~UINT64_C(0) || + MMO->getSize() == getSize()) && + "Size mismatch!"); if (MMO->getBaseAlign() >= getBaseAlign()) { // Update the alignment value. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp index cfbccebaff3e..7783b5e0d3cc 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp @@ -617,20 +617,11 @@ MachineFunction *MachineOutliner::createOutlinedFunction( F->addFnAttr(Attribute::OptimizeForSize); F->addFnAttr(Attribute::MinSize); - // Include target features from an arbitrary candidate for the outlined - // function. This makes sure the outlined function knows what kinds of - // instructions are going into it. 
This is fine, since all parent functions - // must necessarily support the instructions that are in the outlined region. Candidate &FirstCand = OF.Candidates.front(); - const Function &ParentFn = FirstCand.getMF()->getFunction(); - if (ParentFn.hasFnAttribute("target-features")) - F->addFnAttr(ParentFn.getFnAttribute("target-features")); + const TargetInstrInfo &TII = + *FirstCand.getMF()->getSubtarget().getInstrInfo(); - // Set nounwind, so we don't generate eh_frame. - if (llvm::all_of(OF.Candidates, [](const outliner::Candidate &C) { - return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind); - })) - F->addFnAttr(Attribute::NoUnwind); + TII.mergeOutliningCandidateAttributes(*F, OF.Candidates); BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); IRBuilder<> Builder(EntryBB); @@ -639,8 +630,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction( MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetInstrInfo &TII = *STI.getInstrInfo(); // Insert the new function into the module. MF.insert(MF.begin(), &MBB); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp index e18318386def..8d6459a627fa 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1455,17 +1455,15 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { int asap = 0; int zeroLatencyDepth = 0; SUnit *SU = &SUnits[I]; - for (SUnit::const_pred_iterator IP = SU->Preds.begin(), - EP = SU->Preds.end(); - IP != EP; ++IP) { - SUnit *pred = IP->getSUnit(); - if (IP->getLatency() == 0) + for (const SDep &P : SU->Preds) { + SUnit *pred = P.getSUnit(); + if (P.getLatency() == 0) zeroLatencyDepth = std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1); - if (ignoreDependence(*IP, true)) + if (ignoreDependence(P, true)) continue; - asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() - - getDistance(pred, SU, *IP) * MII)); + asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() - + getDistance(pred, SU, P) * MII)); } maxASAP = std::max(maxASAP, asap); ScheduleInfo[I].ASAP = asap; @@ -1521,9 +1519,8 @@ static bool pred_L(SetVector<SUnit *> &NodeOrder, SmallSetVector<SUnit *, 8> &Preds, const NodeSet *S = nullptr) { Preds.clear(); - for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end(); - I != E; ++I) { - for (const SDep &Pred : (*I)->Preds) { + for (const SUnit *SU : NodeOrder) { + for (const SDep &Pred : SU->Preds) { if (S && S->count(Pred.getSUnit()) == 0) continue; if (ignoreDependence(Pred, true)) @@ -1532,7 +1529,7 @@ static bool pred_L(SetVector<SUnit *> &NodeOrder, Preds.insert(Pred.getSUnit()); } // Back-edges are predecessors with an anti-dependence. 
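For the ASAP recurrence in computeNodeFunctions above, asap(SU) is the maximum over non-ignored predecessor edges of ASAP(pred) + latency - distance * MII. With illustrative numbers and MII = 2: a predecessor at ASAP 4 with latency 3 and distance 0 contributes 4 + 3 - 0 = 7, while a loop-carried predecessor at ASAP 6 with latency 1 and distance 1 contributes 6 + 1 - 2 = 5, so the node's ASAP is 7.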
- for (const SDep &Succ : (*I)->Succs) { + for (const SDep &Succ : SU->Succs) { if (Succ.getKind() != SDep::Anti) continue; if (S && S->count(Succ.getSUnit()) == 0) @@ -2546,8 +2543,7 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, unsigned Pos = 0; for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E; ++I, ++Pos) { - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp index 30745c7a5583..54c478645dcf 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp @@ -596,8 +596,7 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI, // MI is cheap, we probably don't want to break the critical edge for it. // However, if this would allow some definitions of its source operands // to be sunk then it's probably worth it. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); @@ -789,8 +788,7 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, // If this instruction is inside a loop and sinking this instruction can make // more registers live range shorten, it is still prifitable. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { // Ignore non-register operands. if (!MO.isReg()) continue; @@ -889,8 +887,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. MachineBasicBlock *SuccToSinkTo = nullptr; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; // Ignore non-register operands. Register Reg = MO.getReg(); @@ -1322,8 +1319,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, // If the instruction to move defines a dead physical register which is live // when leaving the basic block, don't move it because it could turn into a // "zombie" define of that preg. E.g., EFLAGS. 
(<rdar://problem/8030636>) - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.isUse()) continue; Register Reg = MO.getReg(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp index d6bb3e7c9e58..32078db76cf3 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1276,11 +1276,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (DstTy.getNumElements() != MI->getNumOperands() - 1) report("G_BUILD_VECTOR must have an operand for each elemement", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_BUILD_VECTOR source operand types are not homogeneous", MI); - } break; } @@ -1292,12 +1290,10 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (!DstTy.isVector() || SrcEltTy.isVector()) report("G_BUILD_VECTOR_TRUNC must produce a vector from scalar operands", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_BUILD_VECTOR_TRUNC source operand types are not homogeneous", MI); - } if (SrcEltTy.getSizeInBits() <= DstTy.getElementType().getSizeInBits()) report("G_BUILD_VECTOR_TRUNC source operand types are not larger than " "dest elt type", @@ -1316,11 +1312,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { if (MI->getNumOperands() < 3) report("G_CONCAT_VECTOR requires at least 2 source operands", MI); - for (unsigned i = 2; i < MI->getNumOperands(); ++i) { - if (MRI->getType(MI->getOperand(1).getReg()) != - MRI->getType(MI->getOperand(i).getReg())) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) report("G_CONCAT_VECTOR source operand types are not homogeneous", MI); - } if (DstTy.getNumElements() != SrcTy.getNumElements() * (MI->getNumOperands() - 1)) report("G_CONCAT_VECTOR num dest and source elements should match", MI); @@ -3063,9 +3057,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, SlotIndex PEnd = LiveInts->getMBBEndIdx(Pred); // Predecessor of landing pad live-out on last call. 
if (MFI->isEHPad()) { - for (auto I = Pred->rbegin(), E = Pred->rend(); I != E; ++I) { - if (I->isCall()) { - PEnd = Indexes->getInstructionIndex(*I).getBoundaryIndex(); + for (const MachineInstr &MI : llvm::reverse(*Pred)) { + if (MI.isCall()) { + PEnd = Indexes->getInstructionIndex(MI).getBoundaryIndex(); break; } } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp index 8b3cdfab4d42..aaa6403cc978 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -73,8 +73,7 @@ void ModuloScheduleExpander::expand() { // stage difference for each use. Keep the maximum value. for (MachineInstr *MI : Schedule.getInstructions()) { int DefStage = Schedule.getStage(MI); - for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { - MachineOperand &Op = MI->getOperand(i); + for (const MachineOperand &Op : MI->operands()) { if (!Op.isReg() || !Op.isDef()) continue; @@ -1006,8 +1005,7 @@ void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, unsigned CurStageNum, unsigned InstrStageNum, ValueMapTy *VRMap) { - for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = NewMI->getOperand(i); + for (MachineOperand &MO : NewMI->operands()) { if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; Register reg = MO.getReg(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 9a4f70a6070f..29a88480fd9f 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -527,9 +527,9 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { for (MachineBasicBlock *MBB : Visited) { - MCPhysReg Reg = CSI[i].getReg(); + MCPhysReg Reg = I.getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) @@ -540,17 +540,16 @@ static void updateLiveness(MachineFunction &MF) { // each MBB between the prologue and epilogue so that it is not clobbered // before it is reloaded in the epilogue. The Visited set contains all // blocks outside of the region delimited by prologue/epilogue. - if (CSI[i].isSpilledToReg()) { + if (I.isSpilledToReg()) { for (MachineBasicBlock &MBB : MF) { if (Visited.count(&MBB)) continue; - MCPhysReg DstReg = CSI[i].getDstReg(); + MCPhysReg DstReg = I.getDstReg(); if (!MBB.isLiveIn(DstReg)) MBB.addLiveIn(DstReg); } } } - } /// Insert restore code for the callee-saved registers used in the function. @@ -902,9 +901,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. 
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - bool EarlyScavengingSlots = (TFI.hasFP(MF) && TFI.isFPCloseToIncomingSP() && - RegInfo->useFPForScavengingIndex(MF) && - !RegInfo->hasStackRealignment(MF)); + bool EarlyScavengingSlots = TFI.allocateScavengingFrameIndexesNearIncomingSP(MF); if (RS && EarlyScavengingSlots) { SmallVector<int, 2> SFIs; RS->getScavengingFrameIndices(SFIs); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp index 68920e2e50df..6653145d3d2a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp @@ -1258,8 +1258,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free registers occupied by defs. // Iterate operands in reverse order, so we see the implicit super register // defs first (we added them earlier in case of <def,read-undef>). - for (unsigned I = MI.getNumOperands(); I-- > 0;) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : llvm::reverse(MI.operands())) { if (!MO.isReg() || !MO.isDef()) continue; @@ -1362,8 +1361,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free early clobbers. if (HasEarlyClobber) { - for (unsigned I = MI.getNumOperands(); I-- > 0; ) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : llvm::reverse(MI.operands())) { if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber()) continue; // subreg defs don't free the full register. We left the subreg number @@ -1440,8 +1438,7 @@ void RegAllocFast::handleBundle(MachineInstr &MI) { MachineBasicBlock::instr_iterator BundledMI = MI.getIterator(); ++BundledMI; while (BundledMI->isBundledWithPred()) { - for (unsigned I = 0; I < BundledMI->getNumOperands(); ++I) { - MachineOperand &MO = BundledMI->getOperand(I); + for (MachineOperand &MO : BundledMI->operands()) { if (!MO.isReg()) continue; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp index 5a93b58e0baf..50411c177007 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -199,7 +199,8 @@ class RAGreedy : public MachineFunctionPass, struct RegInfo { LiveRangeStage Stage = RS_New; - // Cascade - Eviction loop prevention. See canEvictInterference(). + // Cascade - Eviction loop prevention. See + // canEvictInterferenceBasedOnCost(). unsigned Cascade = 0; RegInfo() = default; @@ -207,13 +208,51 @@ class RAGreedy : public MachineFunctionPass, IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo; + LiveRangeStage getStage(Register Reg) const { + return ExtraRegInfo[Reg].Stage; + } + LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return ExtraRegInfo[VirtReg.reg()].Stage; + return getStage(VirtReg.reg()); + } + + void setStage(Register Reg, LiveRangeStage Stage) { + ExtraRegInfo.resize(MRI->getNumVirtRegs()); + ExtraRegInfo[Reg].Stage = Stage; } void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { + setStage(VirtReg.reg(), Stage); + } + + /// Return the current stage of the register, if present, otherwise initialize + /// it and return that. 
+ LiveRangeStage getOrInitStage(Register Reg) { + ExtraRegInfo.grow(Reg); + return getStage(Reg); + } + + unsigned getCascade(Register Reg) const { return ExtraRegInfo[Reg].Cascade; } + + void setCascade(Register Reg, unsigned Cascade) { ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[VirtReg.reg()].Stage = Stage; + ExtraRegInfo[Reg].Cascade = Cascade; + } + + unsigned getOrAssignNewCascade(Register Reg) { + unsigned Cascade = getCascade(Reg); + if (!Cascade) { + Cascade = NextCascade++; + setCascade(Reg, Cascade); + } + return Cascade; + } + + unsigned getCascadeOrCurrentNext(Register Reg) const { + unsigned Cascade = getCascade(Reg); + if (!Cascade) + Cascade = NextCascade; + return Cascade; } template<typename Iterator> @@ -410,8 +449,11 @@ private: void calcGapWeights(MCRegister, SmallVectorImpl<float> &); Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; - bool canEvictInterference(LiveInterval &, MCRegister, bool, EvictionCost &, - const SmallVirtRegSet &) const; + bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, + EvictionCost &, + const SmallVirtRegSet &) const; + bool canEvictHintInterference(LiveInterval &, MCRegister, + const SmallVirtRegSet &) const; bool canEvictInterferenceInRange(const LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost) const; @@ -683,15 +725,16 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { assert(Reg.isVirtual() && "Can only enqueue virtual registers"); unsigned Prio; - ExtraRegInfo.grow(Reg); - if (ExtraRegInfo[Reg].Stage == RS_New) - ExtraRegInfo[Reg].Stage = RS_Assign; - - if (ExtraRegInfo[Reg].Stage == RS_Split) { + auto Stage = getOrInitStage(Reg); + if (Stage == RS_New) { + Stage = RS_Assign; + setStage(Reg, Stage); + } + if (Stage == RS_Split) { // Unsplit ranges that couldn't be allocated immediately are deferred until // everything else has been allocated. Prio = Size; - } else if (ExtraRegInfo[Reg].Stage == RS_Memory) { + } else if (Stage == RS_Memory) { // Memory operand should be considered last. // Change the priority such that Memory operand are assigned in // the reverse order that they came in. @@ -706,7 +749,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC)); - if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() && + if (Stage == RS_Assign && !ForceGlobal && !LI->empty() && LIS->intervalIsInOneMBB(*LI)) { // Allocate original local ranges in linear instruction order. Since they // are singly defined, this produces optimal coloring in the absence of @@ -780,10 +823,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, if (Order.isHint(Hint)) { MCRegister PhysHint = Hint.asMCReg(); LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n'); - EvictionCost MaxCost; - MaxCost.setBrokenHints(1); - if (canEvictInterference(VirtReg, PhysHint, true, MaxCost, - FixedRegisters)) { + + if (canEvictHintInterference(VirtReg, PhysHint, FixedRegisters)) { evictInterference(VirtReg, PhysHint, NewVRegs); return PhysHint; } @@ -864,8 +905,19 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, return false; } -/// canEvictInterference - Return true if all interferences between VirtReg and -/// PhysReg can be evicted. 
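// Editor's illustrative aside (not part of the patch): a minimal standalone
// rendering of the cascade bookkeeping the new accessors wrap. Roughly, a
// register keeps its cascade number once assigned, and an interference is
// only evicted by a strictly newer (larger) cascade, which is what breaks
// eviction cycles (the real check also weighs spillability). CascadeMap is a
// hypothetical name.
#include <unordered_map>

struct CascadeMap {
  std::unordered_map<unsigned, unsigned> Cascade; // virtual register -> cascade id (0 = none)
  unsigned NextCascade = 1;

  unsigned getOrAssignNewCascade(unsigned Reg) {
    unsigned &C = Cascade[Reg]; // default-constructs to 0 on first use
    if (!C)
      C = NextCascade++;
    return C;
  }

  // The evictor must carry a newer cascade than the live range it displaces.
  bool mayEvict(unsigned EvictorCascade, unsigned IntfReg) {
    return Cascade[IntfReg] < EvictorCascade;
  }
};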
+/// canEvictHintInterference - return true if the interference for VirtReg +/// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg. +bool RAGreedy::canEvictHintInterference( + LiveInterval &VirtReg, MCRegister PhysReg, + const SmallVirtRegSet &FixedRegisters) const { + EvictionCost MaxCost; + MaxCost.setBrokenHints(1); + return canEvictInterferenceBasedOnCost(VirtReg, PhysReg, true, MaxCost, + FixedRegisters); +} + +/// canEvictInterferenceBasedOnCost - Return true if all interferences between +/// VirtReg and PhysReg can be evicted. /// /// @param VirtReg Live range that is about to be assigned. /// @param PhysReg Desired register for assignment. @@ -873,7 +925,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// @param MaxCost Only look for cheaper candidates and update with new cost /// when returning true. /// @returns True when interference can be evicted cheaper than MaxCost. -bool RAGreedy::canEvictInterference( +bool RAGreedy::canEvictInterferenceBasedOnCost( LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const { // It is only possible to evict virtual register interference. @@ -1054,9 +1106,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be // evicted by a newer cascade, preventing infinite loops. - unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; - if (!Cascade) - Cascade = ExtraRegInfo[VirtReg.reg()].Cascade = NextCascade++; + unsigned Cascade = getOrAssignNewCascade(VirtReg.reg()); LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); @@ -1082,10 +1132,10 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); Matrix->unassign(*Intf); - assert((ExtraRegInfo[Intf->reg()].Cascade < Cascade || + assert((getCascade(Intf->reg()) < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && "Cannot decrease cascade number, illegal eviction"); - ExtraRegInfo[Intf->reg()].Cascade = Cascade; + setCascade(Intf->reg(), Cascade); ++NumEvicted; NewVRegs.push_back(Intf->reg()); } @@ -1150,8 +1200,8 @@ MCRegister RAGreedy::tryFindEvictionCandidate( continue; } - if (!canEvictInterference(VirtReg, PhysReg, false, BestCost, - FixedRegisters)) + if (!canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost, + FixedRegisters)) continue; // Best so far. @@ -1756,7 +1806,6 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, SE->finish(&IntvMap); DebugVars->splitRegister(Reg, LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); unsigned OrigBlocks = SA->getNumLiveBlocks(); // Sort out the new intervals created by splitting. We get four kinds: @@ -1765,10 +1814,10 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // - Block-local splits are candidates for local splitting. // - DCE leftovers should go back on the queue. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { - LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); + const LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); // Ignore old intervals from DCE. - if (getStage(Reg) != RS_New) + if (getOrInitStage(Reg.reg()) != RS_New) continue; // Remainder interval. 
Don't try splitting again, spill if it doesn't @@ -2012,13 +2061,11 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, // Tell LiveDebugVariables about the new ranges. DebugVars->splitRegister(Reg, LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - // Sort out the new intervals created by splitting. The remainder interval // goes straight to spilling, the new local ranges get to stay RS_New. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { - LiveInterval &LI = LIS->getInterval(LREdit.get(I)); - if (getStage(LI) == RS_New && IntvMap[I] == 0) + const LiveInterval &LI = LIS->getInterval(LREdit.get(I)); + if (getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) setStage(LI, RS_Spill); } @@ -2104,8 +2151,6 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVector<unsigned, 8> IntvMap; SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - // Assign all new registers to RS_Spill. This was the last chance. setStage(LREdit.begin(), LREdit.end(), RS_Spill); return 0; @@ -2400,7 +2445,6 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVector<unsigned, 8> IntvMap; SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); - // If the new range has the same number of instructions as before, mark it as // RS_Split2 so the next split will be forced to make progress. Otherwise, // leave the new intervals as RS_New so they can compete. @@ -3021,7 +3065,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, LiveRangeStage Stage = getStage(VirtReg); LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade " - << ExtraRegInfo[VirtReg.reg()].Cascade << '\n'); + << getCascade(VirtReg.reg()) << '\n'); // Try to evict a less worthy live range, but only for ranges from the primary // queue. The RS_Split ranges already failed to do this, and they should not @@ -3311,7 +3355,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI)); ExtraRegInfo.clear(); - ExtraRegInfo.resize(MRI->getNumVirtRegs()); NextCascade = 1; IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp index c847068bca90..4c8534cf2d01 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -3908,20 +3908,20 @@ void RegisterCoalescer::lateLiveIntervalUpdate() { bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) { bool Progress = false; - for (unsigned i = 0, e = CurrList.size(); i != e; ++i) { - if (!CurrList[i]) + for (MachineInstr *&MI : CurrList) { + if (!MI) continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. 
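// Editor's illustrative aside (not part of the patch): the by-reference
// element iteration this hunk switches to (MachineInstr *&MI) lets the loop
// clear work-list slots in place, exactly like the old CurrList[i] = nullptr
// in the lines that follow. A plain standalone sketch; clearProcessed is a
// hypothetical name.
#include <vector>

void clearProcessed(std::vector<int *> &WorkList) {
  for (int *&P : WorkList) {
    if (!P)
      continue;    // slot already cleared
    // ... process *P here ...
    P = nullptr;   // writes through to the container element
  }
}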
- if (ErasedInstrs.count(CurrList[i])) { - CurrList[i] = nullptr; + if (ErasedInstrs.count(MI)) { + MI = nullptr; continue; } bool Again = false; - bool Success = joinCopy(CurrList[i], Again); + bool Success = joinCopy(MI, Again); Progress |= Success; if (Success || !Again) - CurrList[i] = nullptr; + MI = nullptr; } return Progress; } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 3f013eb6024e..0e8e8338b46d 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -406,11 +406,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { // register in later operands. The lanes of other defs will now be live // after this instruction, so these should not be treated as killed by the // instruction even though they appear to be killed in this one operand. - for (int I = OperIdx + 1, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &OtherMO = MI->getOperand(I); + for (const MachineOperand &OtherMO : + llvm::drop_begin(MI->operands(), OperIdx + 1)) if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg) KillLaneMask &= ~getLaneMaskForMO(OtherMO); - } } // Clear undef flag, we'll re-add it later once we know which subregister diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ce400ea43f29..df5a041b87cd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4436,7 +4436,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { SDValue OptimizedDiv = isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); - if (OptimizedDiv.getNode()) { + if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) { // If the equivalent Div node also exists, update its users. unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), @@ -4464,6 +4464,9 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhs x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || @@ -4521,6 +4524,9 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhu x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || @@ -4779,6 +4785,106 @@ SDValue DAGCombiner::visitMULO(SDNode *N) { return SDValue(); } +// Function to calculate whether the Min/Max pair of SDNodes (potentially +// swapped around) make a signed saturate pattern, clamping to between -2^(BW-1) +// and 2^(BW-1)-1. Returns the node being clamped and the bitwidth of the clamp +// in BW. Should work with both SMIN/SMAX nodes and setcc/select combo. The +// operands are the same as SimplifySelectCC. N0<N1 ? 
N2 : N3 +static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, + SDValue N3, ISD::CondCode CC, unsigned &BW) { + auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3, + ISD::CondCode CC) { + // The compare and select operand should be the same or the select operands + // should be truncated versions of the comparison. + if (N0 != N2 && (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) + return 0; + // The constants need to be the same or a truncated version of each other. + ConstantSDNode *N1C = isConstOrConstSplat(N1); + ConstantSDNode *N3C = isConstOrConstSplat(N3); + if (!N1C || !N3C) + return 0; + const APInt &C1 = N1C->getAPIntValue(); + const APInt &C2 = N3C->getAPIntValue(); + if (C1.getBitWidth() < C2.getBitWidth() || + C1 != C2.sextOrSelf(C1.getBitWidth())) + return 0; + return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0); + }; + + // Check the initial value is a SMIN/SMAX equivalent. + unsigned Opcode0 = isSignedMinMax(N0, N1, N2, N3, CC); + if (!Opcode0) + return SDValue(); + + SDValue N00, N01, N02, N03; + ISD::CondCode N0CC; + switch (N0.getOpcode()) { + case ISD::SMIN: + case ISD::SMAX: + N00 = N02 = N0.getOperand(0); + N01 = N03 = N0.getOperand(1); + N0CC = N0.getOpcode() == ISD::SMIN ? ISD::SETLT : ISD::SETGT; + break; + case ISD::SELECT_CC: + N00 = N0.getOperand(0); + N01 = N0.getOperand(1); + N02 = N0.getOperand(2); + N03 = N0.getOperand(3); + N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get(); + break; + case ISD::SELECT: + case ISD::VSELECT: + if (N0.getOperand(0).getOpcode() != ISD::SETCC) + return SDValue(); + N00 = N0.getOperand(0).getOperand(0); + N01 = N0.getOperand(0).getOperand(1); + N02 = N0.getOperand(1); + N03 = N0.getOperand(2); + N0CC = cast<CondCodeSDNode>(N0.getOperand(0).getOperand(2))->get(); + break; + default: + return SDValue(); + } + + unsigned Opcode1 = isSignedMinMax(N00, N01, N02, N03, N0CC); + if (!Opcode1 || Opcode0 == Opcode1) + return SDValue(); + + ConstantSDNode *MinCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N1 : N01); + ConstantSDNode *MaxCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? 
N01 : N1); + if (!MinCOp || !MaxCOp || MinCOp->getValueType(0) != MaxCOp->getValueType(0)) + return SDValue(); + + const APInt &MinC = MinCOp->getAPIntValue(); + const APInt &MaxC = MaxCOp->getAPIntValue(); + APInt MinCPlus1 = MinC + 1; + if (-MaxC != MinCPlus1 || !MinCPlus1.isPowerOf2()) + return SDValue(); + BW = MinCPlus1.exactLogBase2() + 1; + return N02; +} + +static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, + SDValue N3, ISD::CondCode CC, + SelectionDAG &DAG) { + unsigned BW; + SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW); + if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT) + return SDValue(); + EVT FPVT = Fp.getOperand(0).getValueType(); + EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW); + if (FPVT.isVector()) + NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT, + FPVT.getVectorElementCount()); + if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat( + ISD::FP_TO_SINT_SAT, Fp.getOperand(0).getValueType(), NewVT)) + return SDValue(); + SDLoc DL(Fp); + SDValue Sat = DAG.getNode(ISD::FP_TO_SINT_SAT, DL, NewVT, Fp.getOperand(0), + DAG.getValueType(NewVT.getScalarType())); + return DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); +} + SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4817,6 +4923,11 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { return DAG.getNode(AltOpcode, DL, VT, N0, N1); } + if (Opcode == ISD::SMIN || Opcode == ISD::SMAX) + if (SDValue S = PerformMinMaxFpToSatCombine( + N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG)) + return S; + // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -9940,9 +10051,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { // If this is a masked load with an all ones mask, we can use a unmasked load. // FIXME: Can we do this for indexed, compressing, or truncating stores? - if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && - MST->isUnindexed() && !MST->isCompressingStore() && - !MST->isTruncatingStore()) + if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() && + !MST->isCompressingStore() && !MST->isTruncatingStore()) return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), MST->getBasePtr(), MST->getMemOperand()); @@ -9997,9 +10107,8 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { // If this is a masked load with an all ones mask, we can use a unmasked load. // FIXME: Can we do this for indexed, expanding, or extending loads? - if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && - MLD->isUnindexed() && !MLD->isExpandingLoad() && - MLD->getExtensionType() == ISD::NON_EXTLOAD) { + if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() && + !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) { SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(), MLD->getMemOperand()); return CombineTo(N, NewLd, NewLd.getValue(1)); @@ -10138,6 +10247,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return FMinMax; } + if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG)) + return S; + // If this select has a condition (setcc) with narrower operands than the // select, try to widen the compare to match the select width. // TODO: This should be extended to handle any constant. 
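// Editor's illustrative aside (not part of the patch): the scalar source
// shape that isSaturatingMinMax/PerformMinMaxFpToSatCombine recognize above --
// an fp_to_sint whose result is clamped to [-2^(BW-1), 2^(BW-1)-1] and then
// truncated -- which the combine rewrites to FP_TO_SINT_SAT. The function
// name is hypothetical, and the usual out-of-range caveats of a raw
// float-to-int cast are ignored since only the pattern shape matters.
#include <algorithm>
#include <cstdint>

int16_t to_i16_saturating(float F) {
  int32_t V = static_cast<int32_t>(F);               // fp_to_sint
  V = std::max(V, static_cast<int32_t>(INT16_MIN));  // smax with -2^(BW-1)
  V = std::min(V, static_cast<int32_t>(INT16_MAX));  // smin with  2^(BW-1)-1
  return static_cast<int16_t>(V);                    // truncate to BW = 16 bits
}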
@@ -15007,7 +15119,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -23034,6 +23146,9 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT)); } + if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG)) + return S; + return SDValue(); } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index c1bb65409282..331e0325aea3 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -765,7 +765,7 @@ InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD, assert(!SD->isVariadic()); SDDbgOperand DbgOperand = SD->getLocationOps()[0]; MDNode *Var = SD->getVariable(); - MDNode *Expr = SD->getExpression(); + DIExpression *Expr = (DIExpression*)SD->getExpression(); DebugLoc DL = SD->getDebugLoc(); const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_INSTR_REF); @@ -775,6 +775,13 @@ InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD, DbgOperand.getKind() == SDDbgOperand::CONST) return EmitDbgValueFromSingleOp(SD, VRBaseMap); + // Immediately fold any indirectness from the LLVM-IR intrinsic into the + // expression: + if (SD->isIndirect()) { + std::vector<uint64_t> Elts = {dwarf::DW_OP_deref}; + Expr = DIExpression::append(Expr, Elts); + } + // It may not be immediately possible to identify the MachineInstr that // defines a VReg, it can depend for example on the order blocks are // emitted in. When this happens, or when further analysis is needed later, diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index eb9d2286aeb4..08598eeded7a 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3553,9 +3553,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // Node. Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); - if (Tmp2.getOpcode() == ISD::SETCC) { - Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, - Tmp1, Tmp2.getOperand(2), + if (Tmp2.getOpcode() == ISD::SETCC && + TLI.isOperationLegalOrCustom(ISD::BR_CC, + Tmp2.getOperand(0).getValueType())) { + Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, Tmp2.getOperand(2), Tmp2.getOperand(0), Tmp2.getOperand(1), Node->getOperand(2)); } else { diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 1f73c9eea104..98312f91d8c0 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -28,7 +28,7 @@ using namespace llvm; static cl::opt<bool> EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden); -/// Do extensive, expensive, sanity checking. +/// Do extensive, expensive, basic correctness checking. 
void DAGTypeLegalizer::PerformExpensiveChecks() { // If a node is not processed, then none of its values should be mapped by any // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues. @@ -534,7 +534,8 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) { // The node morphed into a different node. Normally for this to happen // the original node would have to be marked NewNode. However this can // in theory momentarily not be the case while ReplaceValueWith is doing - // its stuff. Mark the original node NewNode to help sanity checking. + // its stuff. Mark the original node NewNode to help basic correctness + // checking. N->setNodeId(NewNode); if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed) // It morphed into a previously analyzed node - nothing more to do. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 539c9cb9c256..7ec2638b1e71 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1820,10 +1820,10 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, else std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl); - unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize()); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoSize, Alignment, - MLD->getAAInfo(), MLD->getRanges()); + MLD->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, MLD->getAAInfo(), + MLD->getRanges()); Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT, MMO, MLD->getAddressingMode(), ExtType, @@ -1837,7 +1837,6 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, // Generate hi masked load. 
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, MLD->isExpandingLoad()); - unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize()); MachinePointerInfo MPI; if (LoMemVT.isScalableVector()) @@ -1847,8 +1846,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, LoMemVT.getStoreSize().getFixedSize()); MMO = DAG.getMachineFunction().getMachineMemOperand( - MPI, MachineMemOperand::MOLoad, HiSize, Alignment, MLD->getAAInfo(), - MLD->getRanges()); + MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment, + MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT, MMO, MLD->getAddressingMode(), ExtType, @@ -2662,10 +2661,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty); SDValue Lo, Hi, Res; - unsigned LoSize = MemoryLocation::getSizeOrUnknown(LoMemVT.getStoreSize()); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - N->getPointerInfo(), MachineMemOperand::MOStore, LoSize, Alignment, - N->getAAInfo(), N->getRanges()); + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(), @@ -2689,10 +2687,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, MPI = N->getPointerInfo().getWithOffset( LoMemVT.getStoreSize().getFixedSize()); - unsigned HiSize = MemoryLocation::getSizeOrUnknown(HiMemVT.getStoreSize()); MMO = DAG.getMachineFunction().getMachineMemOperand( - MPI, MachineMemOperand::MOStore, HiSize, Alignment, N->getAAInfo(), - N->getRanges()); + MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment, + N->getAAInfo(), N->getRanges()); Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(), diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 55fe26eb64cd..2695ed36991c 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -268,8 +268,8 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { // Now see if there are no other dependencies // to instructions already in the packet. - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - for (const SDep &Succ : Packet[i]->Succs) { + for (const SUnit *S : Packet) + for (const SDep &Succ : S->Succs) { // Since we do not add pseudos to packets, might as well // ignore order deps. 
if (Succ.isCtrl()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 95f7e43b151d..84e6d2a16422 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -706,8 +706,8 @@ void ScheduleDAGSDNodes::dump() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void ScheduleDAGSDNodes::dumpSchedule() const { - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) + for (const SUnit *SU : Sequence) { + if (SU) dumpNode(*SU); else dbgs() << "**** NOOP ****\n"; diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 008665d50233..c282e03387dd 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -406,8 +406,8 @@ bool ISD::isVPOpcode(unsigned Opcode) { switch (Opcode) { default: return false; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) \ + case ISD::VPSD: \ return true; #include "llvm/IR/VPIntrinsics.def" } @@ -416,23 +416,25 @@ bool ISD::isVPOpcode(unsigned Opcode) { bool ISD::isVPBinaryOp(unsigned Opcode) { switch (Opcode) { default: - return false; -#define PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ - case ISD::SDOPC: \ - return true; + break; +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: +#define VP_PROPERTY_BINARYOP return true; +#define END_REGISTER_VP_SDNODE(VPSD) break; #include "llvm/IR/VPIntrinsics.def" } + return false; } bool ISD::isVPReduction(unsigned Opcode) { switch (Opcode) { default: - return false; -#define PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ - case ISD::SDOPC: \ - return true; + break; +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD: +#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true; +#define END_REGISTER_VP_SDNODE(VPSD) break; #include "llvm/IR/VPIntrinsics.def" } + return false; } /// The operand position of the vector mask. @@ -440,8 +442,8 @@ Optional<unsigned> ISD::getVPMaskIdx(unsigned Opcode) { switch (Opcode) { default: return None; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, TDNAME, MASKPOS, ...) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, ...) \ + case ISD::VPSD: \ return MASKPOS; #include "llvm/IR/VPIntrinsics.def" } @@ -452,8 +454,8 @@ Optional<unsigned> ISD::getVPExplicitVectorLengthIdx(unsigned Opcode) { switch (Opcode) { default: return None; -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, TDNAME, MASKPOS, EVLPOS) \ - case ISD::SDOPC: \ +#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, EVLPOS) \ + case ISD::VPSD: \ return EVLPOS; #include "llvm/IR/VPIntrinsics.def" } @@ -974,7 +976,7 @@ void SelectionDAG::DeallocateNode(SDNode *N) { } #ifndef NDEBUG -/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. +/// VerifySDNode - Check the given SDNode. Aborts if it is invalid. static void VerifySDNode(SDNode *N) { switch (N->getOpcode()) { default: @@ -4540,10 +4542,25 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { } // FIXME: unify with llvm::haveNoCommonBitsSet. 
-// FIXME: could also handle masked merge pattern (X & ~M) op (Y & M) bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { assert(A.getValueType() == B.getValueType() && "Values must have the same type"); + // Match masked merge pattern (X & ~M) op (Y & M) + if (A->getOpcode() == ISD::AND && B->getOpcode() == ISD::AND) { + auto MatchNoCommonBitsPattern = [&](SDValue NotM, SDValue And) { + if (isBitwiseNot(NotM, true)) { + SDValue NotOperand = NotM->getOperand(0); + return NotOperand == And->getOperand(0) || + NotOperand == And->getOperand(1); + } + return false; + }; + if (MatchNoCommonBitsPattern(A->getOperand(0), B) || + MatchNoCommonBitsPattern(A->getOperand(1), B) || + MatchNoCommonBitsPattern(B->getOperand(0), A) || + MatchNoCommonBitsPattern(B->getOperand(1), A)) + return true; + } return KnownBits::haveNoCommonBitsSet(computeKnownBits(A), computeKnownBits(B)); } @@ -5070,7 +5087,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getUNDEF(VT); break; case ISD::BITCAST: - // Basic sanity checking. assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST between types of different sizes!"); if (VT == Operand.getValueType()) return Operand; // noop conversion. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 5d911c165293..7726a0007e44 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4336,9 +4336,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, - // TODO: Make MachineMemOperands aware of scalable - // vectors. - VT.getStoreSize().getKnownMinSize(), *Alignment, I.getAAMetadata()); + MemoryLocation::UnknownSize, *Alignment, I.getAAMetadata()); SDValue StoreNode = DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, ISD::UNINDEXED, false /* Truncating */, IsCompressing); @@ -4496,22 +4494,14 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - MemoryLocation ML; - if (VT.isScalableVector()) - ML = MemoryLocation::getAfter(PtrOperand); - else - ML = MemoryLocation(PtrOperand, LocationSize::precise( - DAG.getDataLayout().getTypeStoreSize(I.getType())), - AAInfo); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, - // TODO: Make MachineMemOperands aware of scalable - // vectors. 
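// Editor's illustrative aside (not part of the patch): why the masked-merge
// operands matched in haveNoCommonBitsSet above never share a set bit.
// (X & ~M) and (Y & M) select complementary bit positions, so their OR, ADD
// and XOR all coincide, which is what callers of haveNoCommonBitsSet rely on.
// maskedMergeDemo is a hypothetical name.
#include <cassert>
#include <cstdint>

void maskedMergeDemo(uint32_t X, uint32_t Y, uint32_t M) {
  uint32_t A = X & ~M;
  uint32_t B = Y & M;
  assert((A & B) == 0);      // no common bits set
  assert((A | B) == A + B);  // hence add, or and xor are interchangeable
  assert((A | B) == (A ^ B));
}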
- VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges); + MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO, @@ -5807,8 +5797,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vscale: { match(&I, m_VScale(DAG.getDataLayout())); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - setValue(&I, - DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1))); + setValue(&I, DAG.getVScale(sdl, VT, APInt(VT.getSizeInBits(), 1))); return; } case Intrinsic::vastart: visitVAStart(I); return; @@ -6942,10 +6931,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); - SDValue N = - DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT); + SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), sdl, VReg, PtrVT); if (Intrinsic == Intrinsic::eh_exceptioncode) - N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); + N = DAG.getZExtOrTrunc(N, sdl, MVT::i32); setValue(&I, N); return; } @@ -6957,7 +6945,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (Triple.getArch() != Triple::x86_64) return; - SDLoc DL = getCurSDLoc(); SmallVector<SDValue, 8> Ops; // We want to say that we always want the arguments in registers. @@ -6974,7 +6961,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL, - DL, NodeTys, Ops); + sdl, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); @@ -6988,7 +6975,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (Triple.getArch() != Triple::x86_64) return; - SDLoc DL = getCurSDLoc(); SmallVector<SDValue, 8> Ops; // We want to say that we always want the arguments in registers. @@ -7009,7 +6995,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // see that some registers may be assumed clobbered and have to preserve // them across calls to the intrinsic. 
MachineSDNode *MN = DAG.getMachineNode( - TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops); + TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, sdl, NodeTys, Ops); SDValue patchableNode = SDValue(MN, 0); DAG.setRoot(patchableNode); setValue(&I, patchableNode); @@ -7047,7 +7033,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (!Base) report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); - Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0)); + Ops.push_back(DAG.getTargetGlobalAddress(Base, sdl, MVT::i64, 0)); struct BranchFunnelTarget { int64_t Offset; @@ -7068,8 +7054,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, report_fatal_error( "llvm.icall.branch.funnel operand must be a GlobalValue"); Targets.push_back({Offset, DAG.getTargetGlobalAddress( - GA->getGlobal(), getCurSDLoc(), - Val.getValueType(), GA->getOffset())}); + GA->getGlobal(), sdl, Val.getValueType(), + GA->getOffset())}); } llvm::sort(Targets, [](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) { @@ -7077,13 +7063,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, }); for (auto &T : Targets) { - Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32)); + Ops.push_back(DAG.getTargetConstant(T.Offset, sdl, MVT::i32)); Ops.push_back(T.Target); } Ops.push_back(DAG.getRoot()); // Chain - SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, - getCurSDLoc(), MVT::Other, Ops), + SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, sdl, + MVT::Other, Ops), 0); DAG.setRoot(N); setValue(&I, N); @@ -7102,7 +7088,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo(); bool ZeroMemory = Intrinsic == Intrinsic::aarch64_settag_zero; SDValue Val = TSI.EmitTargetCodeForSetTag( - DAG, getCurSDLoc(), getRoot(), getValue(I.getArgOperand(0)), + DAG, sdl, getRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), MachinePointerInfo(I.getArgOperand(0)), ZeroMemory); DAG.setRoot(Val); @@ -7114,46 +7100,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Const = getValue(I.getOperand(1)); EVT PtrVT = Ptr.getValueType(); - setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), PtrVT, Ptr, - DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT))); + setValue(&I, DAG.getNode(ISD::AND, sdl, PtrVT, Ptr, + DAG.getZExtOrTrunc(Const, sdl, PtrVT))); return; } case Intrinsic::get_active_lane_mask: { - auto DL = getCurSDLoc(); + EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Index = getValue(I.getOperand(0)); - SDValue TripCount = getValue(I.getOperand(1)); - Type *ElementTy = I.getOperand(0)->getType(); - EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - unsigned VecWidth = VT.getVectorNumElements(); + EVT ElementVT = Index.getValueType(); - SmallVector<SDValue, 16> OpsTripCount; - SmallVector<SDValue, 16> OpsIndex; - SmallVector<SDValue, 16> OpsStepConstants; - for (unsigned i = 0; i < VecWidth; i++) { - OpsTripCount.push_back(TripCount); - OpsIndex.push_back(Index); - OpsStepConstants.push_back( - DAG.getConstant(i, DL, EVT::getEVT(ElementTy))); + if (!TLI.shouldExpandGetActiveLaneMask(CCVT, ElementVT)) { + visitTargetIntrinsic(I, Intrinsic); + return; } - EVT CCVT = EVT::getVectorVT(I.getContext(), MVT::i1, VecWidth); + SDValue TripCount = getValue(I.getOperand(1)); + auto VecTy = CCVT.changeVectorElementType(ElementVT); - auto VecTy = 
EVT::getEVT(FixedVectorType::get(ElementTy, VecWidth)); - SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex); - SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants); + SDValue VectorIndex, VectorTripCount; + if (VecTy.isScalableVector()) { + VectorIndex = DAG.getSplatVector(VecTy, sdl, Index); + VectorTripCount = DAG.getSplatVector(VecTy, sdl, TripCount); + } else { + VectorIndex = DAG.getSplatBuildVector(VecTy, sdl, Index); + VectorTripCount = DAG.getSplatBuildVector(VecTy, sdl, TripCount); + } + SDValue VectorStep = DAG.getStepVector(sdl, VecTy); SDValue VectorInduction = DAG.getNode( - ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue VectorTripCount = DAG.getBuildVector(VecTy, DL, OpsTripCount); - SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0), + ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); + SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0), VectorTripCount, ISD::CondCode::SETULT); - setValue(&I, DAG.getNode(ISD::AND, DL, CCVT, - DAG.getNOT(DL, VectorInduction.getValue(1), CCVT), + setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT, + DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT), SetCC)); return; } case Intrinsic::experimental_vector_insert: { - auto DL = getCurSDLoc(); - SDValue Vec = getValue(I.getOperand(0)); SDValue SubVec = getValue(I.getOperand(1)); SDValue Index = getValue(I.getOperand(2)); @@ -7163,16 +7145,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); if (Index.getValueType() != VectorIdxTy) Index = DAG.getVectorIdxConstant( - cast<ConstantSDNode>(Index)->getZExtValue(), DL); + cast<ConstantSDNode>(Index)->getZExtValue(), sdl); EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResultVT, Vec, SubVec, + setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, ResultVT, Vec, SubVec, Index)); return; } case Intrinsic::experimental_vector_extract: { - auto DL = getCurSDLoc(); - SDValue Vec = getValue(I.getOperand(0)); SDValue Index = getValue(I.getOperand(1)); EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); @@ -7182,9 +7162,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); if (Index.getValueType() != VectorIdxTy) Index = DAG.getVectorIdxConstant( - cast<ConstantSDNode>(Index)->getZExtValue(), DL); + cast<ConstantSDNode>(Index)->getZExtValue(), sdl); - setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index)); + setValue(&I, + DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index)); return; } case Intrinsic::experimental_vector_reverse: @@ -7314,9 +7295,9 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { Optional<unsigned> ResOPC; switch (VPIntrin.getIntrinsicID()) { -#define BEGIN_REGISTER_VP_INTRINSIC(INTRIN, ...) case Intrinsic::INTRIN: -#define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) ResOPC = ISD::VPSDID; -#define END_REGISTER_VP_INTRINSIC(...) break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) 
ResOPC = ISD::VPSD; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e4a69adff05b..737695b5eabe 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -645,6 +645,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( if (DemandedBits == 0 || DemandedElts == 0) return DAG.getUNDEF(Op.getValueType()); + bool IsLE = DAG.getDataLayout().isLittleEndian(); unsigned NumElts = DemandedElts.getBitWidth(); unsigned BitWidth = DemandedBits.getBitWidth(); KnownBits LHSKnown, RHSKnown; @@ -663,16 +664,15 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( Src, DemandedBits, DemandedElts, DAG, Depth + 1)) return DAG.getBitcast(DstVT, V); - // TODO - bigendian once we have test coverage. - if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0 && - DAG.getDataLayout().isLittleEndian()) { + if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0) { unsigned Scale = NumDstEltBits / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { - unsigned Offset = i * NumSrcEltBits; - APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); + unsigned EltOffset = IsLE ? i : (Scale - 1 - i); + unsigned BitOffset = EltOffset * NumSrcEltBits; + APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); if (!Sub.isZero()) { DemandedSrcBits |= Sub; for (unsigned j = 0; j != NumElts; ++j) @@ -687,8 +687,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( } // TODO - bigendian once we have test coverage. - if ((NumSrcEltBits % NumDstEltBits) == 0 && - DAG.getDataLayout().isLittleEndian()) { + if (IsLE && (NumSrcEltBits % NumDstEltBits) == 0) { unsigned Scale = NumSrcEltBits / NumDstEltBits; unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); @@ -802,8 +801,8 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); EVT DstVT = Op.getValueType(); - if (DemandedElts == 1 && DstVT.getSizeInBits() == SrcVT.getSizeInBits() && - DAG.getDataLayout().isLittleEndian() && + if (IsLE && DemandedElts == 1 && + DstVT.getSizeInBits() == SrcVT.getSizeInBits() && DemandedBits.getActiveBits() <= SrcVT.getScalarSizeInBits()) { return DAG.getBitcast(DstVT, Src); } @@ -913,6 +912,7 @@ bool TargetLowering::SimplifyDemandedBits( if (Op.getValueType().isScalableVector()) return false; + bool IsLE = TLO.DAG.getDataLayout().isLittleEndian(); unsigned NumElts = OriginalDemandedElts.getBitWidth(); assert((!Op.getValueType().isVector() || NumElts == Op.getValueType().getVectorNumElements()) && @@ -1725,11 +1725,40 @@ bool TargetLowering::SimplifyDemandedBits( case ISD::ROTR: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); + bool IsROTL = (Op.getOpcode() == ISD::ROTL); // If we're rotating an 0/-1 value, then it stays an 0/-1 value. 
if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1)) return TLO.CombineTo(Op, Op0); + if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) { + unsigned Amt = SA->getAPIntValue().urem(BitWidth); + unsigned RevAmt = BitWidth - Amt; + + // rotl: (Op0 << Amt) | (Op0 >> (BW - Amt)) + // rotr: (Op0 << (BW - Amt)) | (Op0 >> Amt) + APInt Demanded0 = DemandedBits.rotr(IsROTL ? Amt : RevAmt); + if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO, + Depth + 1)) + return true; + + // rot*(x, 0) --> x + if (Amt == 0) + return TLO.CombineTo(Op, Op0); + + // See if we don't demand either half of the rotated bits. + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT)) && + DemandedBits.countTrailingZeros() >= (IsROTL ? Amt : RevAmt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? Amt : RevAmt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, Op1)); + } + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT)) && + DemandedBits.countLeadingZeros() >= (IsROTL ? RevAmt : Amt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? RevAmt : Amt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); + } + } + // For pow-2 bitwidths we only demand the bottom modulo amt bits. if (isPowerOf2_32(BitWidth)) { APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1); @@ -1887,9 +1916,8 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.getActiveBits() <= InBits) { // If we only need the non-extended bits of the bottom element // then we can just bitcast to the result. - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); unsigned Opc = @@ -1925,9 +1953,8 @@ bool TargetLowering::SimplifyDemandedBits( if (DemandedBits.getActiveBits() <= InBits) { // If we only need the non-extended bits of the bottom element // then we can just bitcast to the result. - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); unsigned Opc = @@ -1976,9 +2003,8 @@ bool TargetLowering::SimplifyDemandedBits( // If we only need the bottom element then we can just bitcast. // TODO: Handle ANY_EXTEND? - if (IsVecInReg && DemandedElts == 1 && - VT.getSizeInBits() == SrcVT.getSizeInBits() && - TLO.DAG.getDataLayout().isLittleEndian()) + if (IsLE && IsVecInReg && DemandedElts == 1 && + VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); APInt InDemandedBits = DemandedBits.trunc(InBits); @@ -2140,16 +2166,15 @@ bool TargetLowering::SimplifyDemandedBits( // Bitcast from a vector using SimplifyDemanded Bits/VectorElts. // Demand the elt/bit if any of the original elts/bits are demanded. - // TODO - bigendian once we have test coverage. 
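The new ISD::ROTL/ISD::ROTR handling in SimplifyDemandedBits above relies on a simple identity: the only input bits that can reach the demanded bits of rotl(X, Amt) are DemandedBits rotated right by Amt (and symmetrically for rotr), which is also why the rotate collapses to a single shl or srl once one half of the rotated value is known to be undemanded. The following is a minimal standalone sketch, not LLVM code, that checks that identity on 32-bit values:

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned S) {
  S &= 31;
  return (X << S) | (X >> ((32 - S) & 31));
}
static uint32_t rotr32(uint32_t X, unsigned S) {
  S &= 31;
  return (X >> S) | (X << ((32 - S) & 31));
}

int main() {
  // Only the rotr(Demanded, S) bits of X can reach the Demanded bits of
  // rotl(X, S); masking the other input bits away must not change the result.
  const uint32_t X = 0xDEADBEEF, Demanded = 0x00FF0003;
  for (unsigned S = 0; S != 32; ++S)
    assert((rotl32(X, S) & Demanded) ==
           (rotl32(X & rotr32(Demanded, S), S) & Demanded));
  return 0;
}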
- if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0 && - TLO.DAG.getDataLayout().isLittleEndian()) { + if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0) { unsigned Scale = BitWidth / NumSrcEltBits; unsigned NumSrcElts = SrcVT.getVectorNumElements(); APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); APInt DemandedSrcElts = APInt::getZero(NumSrcElts); for (unsigned i = 0; i != Scale; ++i) { - unsigned Offset = i * NumSrcEltBits; - APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset); + unsigned EltOffset = IsLE ? i : (Scale - 1 - i); + unsigned BitOffset = EltOffset * NumSrcEltBits; + APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset); if (!Sub.isZero()) { DemandedSrcBits |= Sub; for (unsigned j = 0; j != NumElts; ++j) @@ -2167,8 +2192,8 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, KnownSrcBits, TLO, Depth + 1)) return true; - } else if ((NumSrcEltBits % BitWidth) == 0 && - TLO.DAG.getDataLayout().isLittleEndian()) { + } else if (IsLE && (NumSrcEltBits % BitWidth) == 0) { + // TODO - bigendian once we have test coverage. unsigned Scale = NumSrcEltBits / BitWidth; unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits); @@ -2409,6 +2434,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDLoc DL(Op); unsigned EltSizeInBits = VT.getScalarSizeInBits(); + bool IsLE = TLO.DAG.getDataLayout().isLittleEndian(); // Helper for demanding the specified elements and all the bits of both binary // operands. @@ -2484,7 +2510,7 @@ bool TargetLowering::SimplifyDemandedVectorElts( // Try calling SimplifyDemandedBits, converting demanded elts to the bits // of the large element. // TODO - bigendian once we have test coverage. - if (TLO.DAG.getDataLayout().isLittleEndian()) { + if (IsLE) { unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits(); APInt SrcDemandedBits = APInt::getZero(SrcEltSizeInBits); for (unsigned i = 0; i != NumElts; ++i) @@ -2797,9 +2823,9 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownZero = SrcZero.zextOrTrunc(NumElts); KnownUndef = SrcUndef.zextOrTrunc(NumElts); - if (Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && + if (IsLE && Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && Op.getValueSizeInBits() == Src.getValueSizeInBits() && - DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian()) { + DemandedSrcElts == 1) { // aext - if we just need the bottom element then we can bitcast. return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); } @@ -2812,8 +2838,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( // zext - if we just need the bottom element then we can mask: // zext(and(x,c)) -> and(x,c') iff the zext is the only user of the and. - if (DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian() && - Src.getOpcode() == ISD::AND && Op->isOnlyUserOf(Src.getNode()) && + if (IsLE && DemandedSrcElts == 1 && Src.getOpcode() == ISD::AND && + Op->isOnlyUserOf(Src.getNode()) && Op.getValueSizeInBits() == Src.getValueSizeInBits()) { SDLoc DL(Op); EVT SrcVT = Src.getValueType(); @@ -2834,9 +2860,19 @@ bool TargetLowering::SimplifyDemandedVectorElts( // TODO: There are more binop opcodes that could be handled here - MIN, // MAX, saturated math, etc. 
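The bitcast handling above now indexes sub-elements as EltOffset = IsLE ? i : (Scale - 1 - i) instead of bailing out on big-endian targets, because the lane order inside the wider scalar flips with endianness. A small host-side illustration of that layout difference (illustrative only, independent of the LLVM code):

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Reinterpreting four i8 lanes as one i32 places lane 0 in the low byte on
  // little-endian hosts and in the high byte on big-endian hosts.
  uint8_t Lanes[4] = {0x11, 0x22, 0x33, 0x44}; // lanes 0..3
  uint32_t Word;
  std::memcpy(&Word, Lanes, sizeof(Word));
  // Little-endian: 0x44332211 (lane 0 is bits [7:0]).
  // Big-endian:    0x11223344 (lane 0 is bits [31:24]).
  std::printf("0x%08" PRIx32 "\n", Word);
  return 0;
}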
+ case ISD::ADD: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0 == Op1 && Op->isOnlyUserOf(Op0.getNode())) { + APInt UndefLHS, ZeroLHS; + if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO, + Depth + 1, /*AssumeSingleUse*/ true)) + return true; + } + LLVM_FALLTHROUGH; + } case ISD::OR: case ISD::XOR: - case ISD::ADD: case ISD::SUB: case ISD::FADD: case ISD::FSUB: @@ -5586,7 +5622,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) .trunc(W); assert(!P.isZero() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOne() && "Multiplicative inverse sanity check."); + assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // Q = floor((2^W - 1) u/ D) // R = ((2^W - 1) u% D) @@ -5832,7 +5868,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) .trunc(W); assert(!P.isZero() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOne() && "Multiplicative inverse sanity check."); + assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // A = floor((2^(W - 1) - 1) / D0) & -2^K APInt A = APInt::getSignedMaxValue(W).udiv(D0); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp index 9aea5a7a8853..f49ba5ccd447 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -159,8 +159,7 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) { // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands. for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isFI()) continue; int FI = MO.getIndex(); @@ -394,8 +393,7 @@ void StackSlotColoring::RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping, MachineFunction &MF) { // Update the operands. - for (unsigned i = 0, ee = MI.getNumOperands(); i != ee; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (MachineOperand &MO : MI.operands()) { if (!MO.isFI()) continue; int OldFI = MO.getIndex(); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp index 943bd18c6c8b..54fc6ee45d00 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp @@ -70,12 +70,6 @@ static cl::opt<unsigned> TailDupIndirectBranchSize( "end with indirect branches."), cl::init(20), cl::Hidden); -static cl::opt<unsigned> TailDupJmpTableLoopSize( - "tail-dup-jmptable-loop-size", - cl::desc("Maximum loop latches to consider tail duplication that are " - "successors of loop header."), - cl::init(128), cl::Hidden); - static cl::opt<bool> TailDupVerify("tail-dup-verify", cl::desc("Verify sanity of PHI instructions during taildup"), @@ -569,29 +563,6 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (TailBB.isSuccessor(&TailBB)) return false; - // When doing tail-duplication with jumptable loops like: - // 1 -> 2 <-> 3 | - // \ <-> 4 | - // \ <-> 5 | - // \ <-> ... | - // \---> rest | - // quadratic number of edges and much more loops are added to CFG. 
This - // may cause compile time regression when jumptable is quiet large. - // So set the limit on jumptable cases. - auto isLargeJumpTableLoop = [](const MachineBasicBlock &TailBB) { - const SmallPtrSet<const MachineBasicBlock *, 8> Preds(TailBB.pred_begin(), - TailBB.pred_end()); - // Check the basic block has large number of successors, all of them only - // have one successor which is the basic block itself. - return llvm::count_if( - TailBB.successors(), [&](const MachineBasicBlock *SuccBB) { - return Preds.count(SuccBB) && SuccBB->succ_size() == 1; - }) > TailDupJmpTableLoopSize; - }; - - if (isLargeJumpTableLoop(TailBB)) - return false; - // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index b0594ec086b2..fbf190a52585 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -136,6 +136,16 @@ unsigned TargetFrameLowering::getStackAlignmentSkew( return 0; } +bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + if (!hasFP(MF)) + return false; + + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + return RegInfo->useFPForScavengingIndex(MF) && + !RegInfo->hasStackRealignment(MF); +} + bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) { if (!F.hasLocalLinkage() || F.hasAddressTaken() || !F.hasFnAttribute(Attribute::NoRecurse)) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp index e74b3195a130..5119dac36713 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -957,8 +957,7 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( // If any of the registers accessed are non-constant, conservatively assume // the instruction is not rematerializable. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (Reg == 0) @@ -1401,3 +1400,21 @@ std::string TargetInstrInfo::createMIROperandComment( } TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {} + +void TargetInstrInfo::mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const { + // Include target features from an arbitrary candidate for the outlined + // function. This makes sure the outlined function knows what kinds of + // instructions are going into it. This is fine, since all parent functions + // must necessarily support the instructions that are in the outlined region. + outliner::Candidate &FirstCand = Candidates.front(); + const Function &ParentFn = FirstCand.getMF()->getFunction(); + if (ParentFn.hasFnAttribute("target-features")) + F.addFnAttr(ParentFn.getFnAttribute("target-features")); + + // Set nounwind, so we don't generate eh_frame. 
+ if (llvm::all_of(Candidates, [](const outliner::Candidate &C) { + return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind); + })) + F.addFnAttr(Attribute::NoUnwind); +} diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 1d3bb286c882..d1c2cdeb133b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1082,7 +1082,7 @@ const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference( if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) return nullptr; - // Basic sanity checks. + // Basic correctness checks. if (LHS->getType()->getPointerAddressSpace() != 0 || RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || RHS->isThreadLocal()) @@ -2135,7 +2135,7 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) return nullptr; - // Basic sanity checks. + // Basic correctness checks. if (LHS->getType()->getPointerAddressSpace() != 0 || RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || RHS->isThreadLocal()) diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 46cec5407565..dfd962be2882 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -373,19 +373,25 @@ static bool isTwoAddrUse(MachineInstr &MI, Register Reg, Register &DstReg) { return false; } -/// Given a register, if has a single in-basic block use, return the use -/// instruction if it's a copy or a two-address use. +/// Given a register, if all its uses are in the same basic block, return the +/// last use instruction if it's a copy or a two-address use. static MachineInstr * findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, const TargetInstrInfo *TII, - bool &IsCopy, Register &DstReg, bool &IsDstPhys) { - if (!MRI->hasOneNonDBGUse(Reg)) - // None or more than one use. 
- return nullptr; - MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg); - MachineInstr &UseMI = *UseOp.getParent(); - if (UseMI.getParent() != MBB) + bool &IsCopy, Register &DstReg, bool &IsDstPhys, + LiveIntervals *LIS) { + MachineOperand *UseOp = nullptr; + for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { + MachineInstr *MI = MO.getParent(); + if (MI->getParent() != MBB) + return nullptr; + if (isPlainlyKilled(MI, Reg, LIS)) + UseOp = &MO; + } + if (!UseOp) return nullptr; + MachineInstr &UseMI = *UseOp->getParent(); + Register SrcReg; bool IsSrcPhys; if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) { @@ -399,7 +405,7 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB, } if (UseMI.isCommutable()) { unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex; - unsigned Src2 = UseMI.getOperandNo(&UseOp); + unsigned Src2 = UseMI.getOperandNo(UseOp); if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) { MachineOperand &MO = UseMI.getOperand(Src1); if (MO.isReg() && MO.isUse() && @@ -492,8 +498,7 @@ void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) { return; } - for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) { removeMapRegEntry(MO, SrcRegMap, TRI); continue; @@ -685,7 +690,6 @@ bool TwoAddressInstructionPass::convertInstTo3Addr( // If the old instruction is debug value tracked, an update is required. if (auto OldInstrNum = mi->peekDebugInstrNum()) { - // Sanity check. assert(mi->getNumExplicitDefs() == 1); assert(NewMI->getNumExplicitDefs() == 1); @@ -724,7 +728,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) { Register NewReg; Register Reg = DstReg; while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy, - NewReg, IsDstPhys)) { + NewReg, IsDstPhys, LIS)) { if (IsCopy && !Processed.insert(UseMI).second) break; @@ -1336,8 +1340,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, // Success, or at least we made an improvement. Keep the unfolded // instructions and discard the original. 
if (LV) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.getReg().isVirtual()) { if (MO.isUse()) { if (MO.isKill()) { diff --git a/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index fb0798f204e1..7673a721c4ea 100644 --- a/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -15,6 +15,7 @@ #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFSection.h" +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Support/DJB.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/WithColor.h" @@ -317,12 +318,33 @@ bool DWARFVerifier::handleDebugAbbrev() { return NumErrors == 0; } -unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, - DWARFSectionKind SectionKind) { +unsigned DWARFVerifier::verifyUnits(const DWARFUnitVector &Units) { + unsigned NumDebugInfoErrors = 0; + ReferenceMap CrossUnitReferences; + + for (const auto &Unit : Units) { + ReferenceMap UnitLocalReferences; + NumDebugInfoErrors += + verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); + NumDebugInfoErrors += verifyDebugInfoReferences( + UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); }); + } + + NumDebugInfoErrors += verifyDebugInfoReferences( + CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * { + if (DWARFUnit *U = Units.getUnitForOffset(Offset)) + return U; + return nullptr; + }); + + return NumDebugInfoErrors; +} + +unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S) { const DWARFObject &DObj = DCtx.getDWARFObj(); DWARFDataExtractor DebugInfoData(DObj, S, DCtx.isLittleEndian(), 0); unsigned NumDebugInfoErrors = 0; - uint64_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint64_t Offset = 0, UnitIdx = 0; uint8_t UnitType = 0; bool isUnitDWARF64 = false; bool isHeaderChainValid = true; @@ -334,48 +356,11 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, /// lies between to valid DIEs. ReferenceMap CrossUnitReferences; while (hasDIE) { - OffsetStart = Offset; if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, isUnitDWARF64)) { isHeaderChainValid = false; if (isUnitDWARF64) break; - } else { - DWARFUnitHeader Header; - Header.extract(DCtx, DebugInfoData, &OffsetStart, SectionKind); - ReferenceMap UnitLocalReferences; - DWARFUnit *Unit; - switch (UnitType) { - case dwarf::DW_UT_type: - case dwarf::DW_UT_split_type: { - Unit = TypeUnitVector.addUnit(std::make_unique<DWARFTypeUnit>( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), - &DObj.getLocSection(), DObj.getStrSection(), - DObj.getStrOffsetsSection(), &DObj.getAddrSection(), - DObj.getLineSection(), DCtx.isLittleEndian(), false, - TypeUnitVector)); - break; - } - case dwarf::DW_UT_skeleton: - case dwarf::DW_UT_split_compile: - case dwarf::DW_UT_compile: - case dwarf::DW_UT_partial: - // UnitType = 0 means that we are verifying a compile unit in DWARF v4. 
- case 0: { - Unit = CompileUnitVector.addUnit(std::make_unique<DWARFCompileUnit>( - DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(), - &DObj.getLocSection(), DObj.getStrSection(), - DObj.getStrOffsetsSection(), &DObj.getAddrSection(), - DObj.getLineSection(), DCtx.isLittleEndian(), false, - CompileUnitVector)); - break; - } - default: { llvm_unreachable("Invalid UnitType."); } - } - NumDebugInfoErrors += - verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); - NumDebugInfoErrors += verifyDebugInfoReferences( - UnitLocalReferences, [&](uint64_t Offset) { return Unit; }); } hasDIE = DebugInfoData.isValidOffset(Offset); ++UnitIdx; @@ -386,14 +371,6 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, } if (!isHeaderChainValid) ++NumDebugInfoErrors; - NumDebugInfoErrors += verifyDebugInfoReferences( - CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * { - if (DWARFUnit *U = TypeUnitVector.getUnitForOffset(Offset)) - return U; - if (DWARFUnit *U = CompileUnitVector.getUnitForOffset(Offset)) - return U; - return nullptr; - }); return NumDebugInfoErrors; } @@ -403,13 +380,16 @@ bool DWARFVerifier::handleDebugInfo() { OS << "Verifying .debug_info Unit Header Chain...\n"; DObj.forEachInfoSections([&](const DWARFSection &S) { - NumErrors += verifyUnitSection(S, DW_SECT_INFO); + NumErrors += verifyUnitSection(S); }); OS << "Verifying .debug_types Unit Header Chain...\n"; DObj.forEachTypesSections([&](const DWARFSection &S) { - NumErrors += verifyUnitSection(S, DW_SECT_EXT_TYPES); + NumErrors += verifyUnitSection(S); }); + + OS << "Verifying non-dwo Units...\n"; + NumErrors += verifyUnits(DCtx.getNormalUnitsVector()); return NumErrors == 0; } diff --git a/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp b/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp index d2f1bf4323ee..f380aa90035e 100644 --- a/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp +++ b/contrib/llvm-project/llvm/lib/Demangle/DLangDemangle.cpp @@ -14,12 +14,250 @@ //===----------------------------------------------------------------------===// #include "llvm/Demangle/Demangle.h" +#include "llvm/Demangle/StringView.h" #include "llvm/Demangle/Utility.h" +#include <cctype> #include <cstring> +#include <limits> using namespace llvm; using llvm::itanium_demangle::OutputBuffer; +using llvm::itanium_demangle::StringView; + +namespace { + +/// Demangle information structure. +struct Demangler { + /// Initialize the information structure we use to pass around information. + /// + /// \param Mangled String to demangle. + Demangler(const char *Mangled); + + /// Extract and demangle the mangled symbol and append it to the output + /// string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#name_mangling . + /// \see https://dlang.org/spec/abi.html#MangledName . + const char *parseMangle(OutputBuffer *Demangled); + +private: + /// Extract and demangle a given mangled symbol and append it to the output + /// string. + /// + /// \param Demangled output buffer to write the demangled name. + /// \param Mangled mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#name_mangling . + /// \see https://dlang.org/spec/abi.html#MangledName . 
+ const char *parseMangle(OutputBuffer *Demangled, const char *Mangled); + + /// Extract the number from a given string. + /// + /// \param Mangled string to extract the number. + /// \param Ret assigned result value. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \note A result larger than UINT_MAX is considered a failure. + /// + /// \see https://dlang.org/spec/abi.html#Number . + const char *decodeNumber(const char *Mangled, unsigned long *Ret); + + /// Check whether it is the beginning of a symbol name. + /// + /// \param Mangled string to extract the symbol name. + /// + /// \return true on success, false otherwise. + /// + /// \see https://dlang.org/spec/abi.html#SymbolName . + bool isSymbolName(const char *Mangled); + + /// Extract and demangle an identifier from a given mangled symbol append it + /// to the output string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#SymbolName . + const char *parseIdentifier(OutputBuffer *Demangled, const char *Mangled); + + /// Extract and demangle the plain identifier from a given mangled symbol and + /// prepend/append it to the output string, with a special treatment for some + /// magic compiler generated symbols. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// \param Len Length of the mangled symbol name. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#LName . + const char *parseLName(OutputBuffer *Demangled, const char *Mangled, + unsigned long Len); + + /// Extract and demangle the qualified symbol from a given mangled symbol + /// append it to the output string. + /// + /// \param Demangled Output buffer to write the demangled name. + /// \param Mangled Mangled symbol to be demangled. + /// + /// \return The remaining string on success or nullptr on failure. + /// + /// \see https://dlang.org/spec/abi.html#QualifiedName . + const char *parseQualified(OutputBuffer *Demangled, const char *Mangled); + + /// The string we are demangling. + const char *Str; +}; + +} // namespace + +const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) { + // Return nullptr if trying to extract something that isn't a digit. + if (Mangled == nullptr || !std::isdigit(*Mangled)) + return nullptr; + + unsigned long Val = 0; + + do { + unsigned long Digit = Mangled[0] - '0'; + + // Check for overflow. + if (Val > (std::numeric_limits<unsigned int>::max() - Digit) / 10) + return nullptr; + + Val = Val * 10 + Digit; + ++Mangled; + } while (std::isdigit(*Mangled)); + + if (*Mangled == '\0') + return nullptr; + + *Ret = Val; + return Mangled; +} + +bool Demangler::isSymbolName(const char *Mangled) { + if (std::isdigit(*Mangled)) + return true; + + // TODO: Handle symbol back references and template instances. + return false; +} + +const char *Demangler::parseMangle(OutputBuffer *Demangled, + const char *Mangled) { + // A D mangled symbol is comprised of both scope and type information. + // MangleName: + // _D QualifiedName Type + // _D QualifiedName Z + // ^ + // The caller should have guaranteed that the start pointer is at the + // above location. 
+ // Note that type is never a function type, but only the return type of + // a function or the type of a variable. + Mangled += 2; + + Mangled = parseQualified(Demangled, Mangled); + + if (Mangled != nullptr) { + // Artificial symbols end with 'Z' and have no type. + if (*Mangled == 'Z') + ++Mangled; + else { + // TODO: Implement symbols with types. + return nullptr; + } + } + + return Mangled; +} + +const char *Demangler::parseQualified(OutputBuffer *Demangled, + const char *Mangled) { + // Qualified names are identifiers separated by their encoded length. + // Nested functions also encode their argument types without specifying + // what they return. + // QualifiedName: + // SymbolFunctionName + // SymbolFunctionName QualifiedName + // ^ + // SymbolFunctionName: + // SymbolName + // SymbolName TypeFunctionNoReturn + // SymbolName M TypeFunctionNoReturn + // SymbolName M TypeModifiers TypeFunctionNoReturn + // The start pointer should be at the above location. + + // Whether it has more than one symbol + size_t NotFirst = false; + do { + // Skip over anonymous symbols. + if (*Mangled == '0') { + do + ++Mangled; + while (*Mangled == '0'); + + continue; + } + + if (NotFirst) + *Demangled << '.'; + NotFirst = true; + + Mangled = parseIdentifier(Demangled, Mangled); + + } while (Mangled && isSymbolName(Mangled)); + + return Mangled; +} + +const char *Demangler::parseIdentifier(OutputBuffer *Demangled, + const char *Mangled) { + unsigned long Len; + + if (Mangled == nullptr || *Mangled == '\0') + return nullptr; + + // TODO: Parse back references and lengthless template instances. + + const char *Endptr = decodeNumber(Mangled, &Len); + + if (Endptr == nullptr || Len == 0) + return nullptr; + + if (strlen(Endptr) < Len) + return nullptr; + + Mangled = Endptr; + + // TODO: Parse template instances with a length prefix. + + return parseLName(Demangled, Mangled, Len); +} + +const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, + unsigned long Len) { + *Demangled << StringView(Mangled, Len); + Mangled += Len; + + return Mangled; +} + +Demangler::Demangler(const char *Mangled) : Str(Mangled) {} + +const char *Demangler::parseMangle(OutputBuffer *Demangled) { + return parseMangle(Demangled, this->Str); +} char *llvm::dlangDemangle(const char *MangledName) { if (MangledName == nullptr || strncmp(MangledName, "_D", 2) != 0) @@ -29,8 +267,19 @@ char *llvm::dlangDemangle(const char *MangledName) { if (!initializeOutputBuffer(nullptr, nullptr, Demangled, 1024)) return nullptr; - if (strcmp(MangledName, "_Dmain") == 0) + if (strcmp(MangledName, "_Dmain") == 0) { Demangled << "D main"; + } else { + + Demangler D = Demangler(MangledName); + MangledName = D.parseMangle(&Demangled); + + // Check that the entire symbol was successfully demangled. + if (MangledName == nullptr || *MangledName != '\0') { + std::free(Demangled.getBuffer()); + return nullptr; + } + } // OutputBuffer's internal buffer is not null terminated and therefore we need // to add it to comply with C null terminated strings. 
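The comments in the new DLangDemangle parser above describe the core of the D mangling grammar: a mangled name starts with _D, continues with a sequence of length-prefixed identifiers (the QualifiedName), and an artificial symbol simply ends in Z. As a worked example, _D3foo3barZ demangles to foo.bar. Below is a simplified, hypothetical helper, not the LLVM implementation and assuming well-formed input, that decodes just that length-prefixed part:

#include <cctype>
#include <iostream>
#include <string>

// Decodes the QualifiedName of a D mangled name such as "_D3foo3barZ" by
// repeatedly reading a decimal length prefix followed by that many
// identifier characters, joining the pieces with '.'.
static std::string demangleQualified(const std::string &Mangled) {
  std::string Out;
  std::size_t I = 2; // skip the leading "_D"
  while (I < Mangled.size() &&
         std::isdigit(static_cast<unsigned char>(Mangled[I]))) {
    std::size_t Len = 0;
    while (I < Mangled.size() &&
           std::isdigit(static_cast<unsigned char>(Mangled[I])))
      Len = Len * 10 + (Mangled[I++] - '0');
    if (!Out.empty())
      Out += '.';
    Out += Mangled.substr(I, Len);
    I += Len;
  }
  return Out;
}

int main() {
  std::cout << demangleQualified("_D3foo3barZ") << "\n"; // prints "foo.bar"
  return 0;
}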
@@ -40,6 +289,6 @@ char *llvm::dlangDemangle(const char *MangledName) { return Demangled.getBuffer(); } - free(Demangled.getBuffer()); + std::free(Demangled.getBuffer()); return nullptr; } diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index fe3c433bd2c5..a14bd4d2c3fd 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -1256,8 +1256,7 @@ void ExecutionEngine::emitGlobals() { // If there are multiple modules, map the non-canonical globals to their // canonical location. if (!NonCanonicalGlobals.empty()) { - for (unsigned i = 0, e = NonCanonicalGlobals.size(); i != e; ++i) { - const GlobalValue *GV = NonCanonicalGlobals[i]; + for (const GlobalValue *GV : NonCanonicalGlobals) { const GlobalValue *CGV = LinkedGlobalsMap[std::make_pair( std::string(GV->getName()), GV->getType())]; void *Ptr = getPointerToGlobalIfAvailable(CGV); diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index fdc987751286..f9101d71dfa8 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -143,6 +143,9 @@ protected: // Only SHF_ALLOC sections will have graph sections. DenseMap<ELFSectionIndex, Section *> GraphSections; DenseMap<ELFSymbolIndex, Symbol *> GraphSymbols; + DenseMap<const typename ELFFile::Elf_Shdr *, + ArrayRef<typename ELFFile::Elf_Word>> + ShndxTables; }; template <typename ELFT> @@ -241,7 +244,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::prepare() { return SectionStringTabOrErr.takeError(); // Get the SHT_SYMTAB section. - for (auto &Sec : Sections) + for (auto &Sec : Sections) { if (Sec.sh_type == ELF::SHT_SYMTAB) { if (!SymTabSec) SymTabSec = &Sec; @@ -250,6 +253,20 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::prepare() { G->getName()); } + // Extended table. + if (Sec.sh_type == ELF::SHT_SYMTAB_SHNDX) { + uint32_t SymtabNdx = Sec.sh_link; + if (SymtabNdx >= Sections.size()) + return make_error<JITLinkError>("sh_link is out of bound"); + + auto ShndxTable = Obj.getSHNDXTable(Sec); + if (!ShndxTable) + return ShndxTable.takeError(); + + ShndxTables.insert({&Sections[SymtabNdx], *ShndxTable}); + } + } + return Error::success(); } @@ -299,11 +316,6 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() { else Prot = MemProt::Read | MemProt::Write; - // For now we just use this to skip the "undefined" section, probably need - // to revist. - if (Sec.sh_size == 0) - continue; - auto &GraphSec = G->createSection(*Name, Prot); if (Sec.sh_type != ELF::SHT_NOBITS) { auto Data = Obj.template getSectionContentsAsArray<char>(Sec); @@ -401,9 +413,19 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() { (Sym.getType() == ELF::STT_NOTYPE || Sym.getType() == ELF::STT_FUNC || Sym.getType() == ELF::STT_OBJECT || Sym.getType() == ELF::STT_SECTION || Sym.getType() == ELF::STT_TLS)) { - - // FIXME: Handle extended tables. - if (auto *GraphSec = getGraphSection(Sym.st_shndx)) { + // Handle extended tables. 
+ unsigned Shndx = Sym.st_shndx; + if (Shndx == ELF::SHN_XINDEX) { + auto ShndxTable = ShndxTables.find(SymTabSec); + if (ShndxTable == ShndxTables.end()) + continue; + auto NdxOrErr = object::getExtendedSymbolTableIndex<ELFT>( + Sym, SymIndex, ShndxTable->second); + if (!NdxOrErr) + return NdxOrErr.takeError(); + Shndx = *NdxOrErr; + } + if (auto *GraphSec = getGraphSection(Shndx)) { Block *B = nullptr; { auto Blocks = GraphSec->blocks(); diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp index 6b24d6461b63..56a97f83d915 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -612,9 +612,14 @@ void LookupState::continueLookup(Error Err) { DefinitionGenerator::~DefinitionGenerator() {} +JITDylib::~JITDylib() { + LLVM_DEBUG(dbgs() << "Destroying JITDylib " << getName() << "\n"); +} + Error JITDylib::clear() { std::vector<ResourceTrackerSP> TrackersToRemove; ES.runSessionLocked([&]() { + assert(State != Closed && "JD is defunct"); for (auto &KV : TrackerSymbols) TrackersToRemove.push_back(KV.first); TrackersToRemove.push_back(getDefaultResourceTracker()); @@ -628,6 +633,7 @@ Error JITDylib::clear() { ResourceTrackerSP JITDylib::getDefaultResourceTracker() { return ES.runSessionLocked([this] { + assert(State != Closed && "JD is defunct"); if (!DefaultTracker) DefaultTracker = new ResourceTracker(this); return DefaultTracker; @@ -636,19 +642,22 @@ ResourceTrackerSP JITDylib::getDefaultResourceTracker() { ResourceTrackerSP JITDylib::createResourceTracker() { return ES.runSessionLocked([this] { + assert(State == Open && "JD is defunct"); ResourceTrackerSP RT = new ResourceTracker(this); return RT; }); } void JITDylib::removeGenerator(DefinitionGenerator &G) { - std::lock_guard<std::mutex> Lock(GeneratorsMutex); - auto I = llvm::find_if(DefGenerators, - [&](const std::shared_ptr<DefinitionGenerator> &H) { - return H.get() == &G; - }); - assert(I != DefGenerators.end() && "Generator not found"); - DefGenerators.erase(I); + ES.runSessionLocked([&] { + assert(State == Open && "JD is defunct"); + auto I = llvm::find_if(DefGenerators, + [&](const std::shared_ptr<DefinitionGenerator> &H) { + return H.get() == &G; + }); + assert(I != DefGenerators.end() && "Generator not found"); + DefGenerators.erase(I); + }); } Expected<SymbolFlagsMap> @@ -708,10 +717,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RT = getTracker(FromMR); - - if (RT->isDefunct()) - return make_error<ResourceTrackerDefunct>(std::move(RT)); + if (FromMR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(std::move(FromMR.RT)); #ifndef NDEBUG for (auto &KV : MU->getSymbols()) { @@ -735,7 +742,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, if (MII != MaterializingInfos.end()) { if (MII->second.hasQueriesPending()) { MustRunMR = ES.createMaterializationResponsibility( - *RT, std::move(MU->SymbolFlags), std::move(MU->InitSymbol)); + *FromMR.RT, std::move(MU->SymbolFlags), + std::move(MU->InitSymbol)); MustRunMU = std::move(MU); return Error::success(); } @@ -743,10 +751,8 @@ Error JITDylib::replace(MaterializationResponsibility &FromMR, } // Otherwise, make MU responsible for all the symbols. 
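The graphifySymbols change above stops skipping symbols whose st_shndx is SHN_XINDEX and instead consults the SHT_SYMTAB_SHNDX table collected in prepare(). The ELF rule it implements: st_shndx is only 16 bits, so when a symbol's section index does not fit, the field holds the sentinel SHN_XINDEX (0xffff) and the real 32-bit index lives in a parallel array whose section is linked to the symbol table. A conceptual sketch of that lookup (not the LLVM API):

#include <cstdint>
#include <vector>

constexpr uint16_t SHN_XINDEX = 0xffff;

// The SHT_SYMTAB_SHNDX section is an array of 32-bit words with one entry per
// symbol-table entry; entry N holds the real section index for symbol N when
// that symbol's st_shndx is SHN_XINDEX, and 0 otherwise.
static uint32_t resolveSectionIndex(uint16_t StShndx, uint32_t SymIndex,
                                    const std::vector<uint32_t> &ShndxTable) {
  if (StShndx != SHN_XINDEX)
    return StShndx;
  return ShndxTable[SymIndex];
}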
- auto RTI = MRTrackers.find(&FromMR); - assert(RTI != MRTrackers.end() && "No tracker for FromMR"); - auto UMI = - std::make_shared<UnmaterializedInfo>(std::move(MU), RTI->second); + auto UMI = std::make_shared<UnmaterializedInfo>(std::move(MU), + FromMR.RT.get()); for (auto &KV : UMI->MU->getSymbols()) { auto SymI = Symbols.find(KV.first); assert(SymI->second.getState() == SymbolState::Materializing && @@ -787,13 +793,11 @@ JITDylib::delegate(MaterializationResponsibility &FromMR, return ES.runSessionLocked( [&]() -> Expected<std::unique_ptr<MaterializationResponsibility>> { - auto RT = getTracker(FromMR); - - if (RT->isDefunct()) - return make_error<ResourceTrackerDefunct>(std::move(RT)); + if (FromMR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(std::move(FromMR.RT)); return ES.createMaterializationResponsibility( - *RT, std::move(SymbolFlags), std::move(InitSymbol)); + *FromMR.RT, std::move(SymbolFlags), std::move(InitSymbol)); }); } @@ -903,10 +907,13 @@ Error JITDylib::resolve(MaterializationResponsibility &MR, AsynchronousSymbolQuerySet CompletedQueries; if (auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RTI = MRTrackers.find(&MR); - assert(RTI != MRTrackers.end() && "No resource tracker for MR?"); - if (RTI->second->isDefunct()) - return make_error<ResourceTrackerDefunct>(RTI->second); + if (MR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(MR.RT); + + if (State != Open) + return make_error<StringError>("JITDylib " + getName() + + " is defunct", + inconvertibleErrorCode()); struct WorklistEntry { SymbolTable::iterator SymI; @@ -1001,10 +1008,13 @@ Error JITDylib::emit(MaterializationResponsibility &MR, DenseMap<JITDylib *, SymbolNameVector> ReadySymbols; if (auto Err = ES.runSessionLocked([&, this]() -> Error { - auto RTI = MRTrackers.find(&MR); - assert(RTI != MRTrackers.end() && "No resource tracker for MR?"); - if (RTI->second->isDefunct()) - return make_error<ResourceTrackerDefunct>(RTI->second); + if (MR.RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(MR.RT); + + if (State != Open) + return make_error<StringError>("JITDylib " + getName() + + " is defunct", + inconvertibleErrorCode()); SymbolNameSet SymbolsInErrorState; std::vector<SymbolTable::iterator> Worklist; @@ -1149,9 +1159,12 @@ Error JITDylib::emit(MaterializationResponsibility &MR, void JITDylib::unlinkMaterializationResponsibility( MaterializationResponsibility &MR) { ES.runSessionLocked([&]() { - auto I = MRTrackers.find(&MR); - assert(I != MRTrackers.end() && "MaterializationResponsibility not linked"); - MRTrackers.erase(I); + auto I = TrackerMRs.find(MR.RT.get()); + assert(I != TrackerMRs.end() && "No MRs in TrackerMRs list for RT"); + assert(I->second.count(&MR) && "MR not in TrackerMRs list for RT"); + I->second.erase(&MR); + if (I->second.empty()) + TrackerMRs.erase(MR.RT.get()); }); } @@ -1169,8 +1182,16 @@ JITDylib::failSymbols(FailedSymbolsWorklist Worklist) { (*FailedSymbolsMap)[&JD].insert(Name); - assert(JD.Symbols.count(Name) && "No symbol table entry for Name"); - auto &Sym = JD.Symbols[Name]; + // Look up the symbol to fail. + auto SymI = JD.Symbols.find(Name); + + // It's possible that this symbol has already been removed, e.g. if a + // materialization failure happens concurrently with a ResourceTracker or + // JITDylib removal. In that case we can safely skip this symbol and + // continue. + if (SymI == JD.Symbols.end()) + continue; + auto &Sym = SymI->second; // Move the symbol into the error state. 
// Note that this may be redundant: The symbol might already have been @@ -1267,6 +1288,7 @@ JITDylib::failSymbols(FailedSymbolsWorklist Worklist) { void JITDylib::setLinkOrder(JITDylibSearchOrder NewLinkOrder, bool LinkAgainstThisJITDylibFirst) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); if (LinkAgainstThisJITDylibFirst) { LinkOrder.clear(); if (NewLinkOrder.empty() || NewLinkOrder.front().first != this) @@ -1285,6 +1307,7 @@ void JITDylib::addToLinkOrder(JITDylib &JD, JITDylibLookupFlags JDLookupFlags) { void JITDylib::replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD, JITDylibLookupFlags JDLookupFlags) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); for (auto &KV : LinkOrder) if (KV.first == &OldJD) { KV = {&NewJD, JDLookupFlags}; @@ -1295,6 +1318,7 @@ void JITDylib::replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD, void JITDylib::removeFromLinkOrder(JITDylib &JD) { ES.runSessionLocked([&]() { + assert(State == Open && "JD is defunct"); auto I = llvm::find_if(LinkOrder, [&](const JITDylibSearchOrder::value_type &KV) { return KV.first == &JD; @@ -1306,6 +1330,7 @@ void JITDylib::removeFromLinkOrder(JITDylib &JD) { Error JITDylib::remove(const SymbolNameSet &Names) { return ES.runSessionLocked([&]() -> Error { + assert(State == Open && "JD is defunct"); using SymbolMaterializerItrPair = std::pair<SymbolTable::iterator, UnmaterializedInfosMap::iterator>; std::vector<SymbolMaterializerItrPair> SymbolsToRemove; @@ -1365,8 +1390,23 @@ Error JITDylib::remove(const SymbolNameSet &Names) { void JITDylib::dump(raw_ostream &OS) { ES.runSessionLocked([&, this]() { OS << "JITDylib \"" << getName() << "\" (ES: " - << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) << "):\n" - << "Link order: " << LinkOrder << "\n" + << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) + << ", State = "; + switch (State) { + case Open: + OS << "Open"; + break; + case Closing: + OS << "Closing"; + break; + case Closed: + OS << "Closed"; + break; + } + OS << ")\n"; + if (State == Closed) + return; + OS << "Link order: " << LinkOrder << "\n" << "Symbol table:\n"; for (auto &KV : Symbols) { @@ -1454,17 +1494,11 @@ JITDylib::JITDylib(ExecutionSession &ES, std::string Name) LinkOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols}); } -ResourceTrackerSP JITDylib::getTracker(MaterializationResponsibility &MR) { - auto I = MRTrackers.find(&MR); - assert(I != MRTrackers.end() && "MR is not linked"); - assert(I->second && "Linked tracker is null"); - return I->second; -} - std::pair<JITDylib::AsynchronousSymbolQuerySet, std::shared_ptr<SymbolDependenceMap>> JITDylib::removeTracker(ResourceTracker &RT) { // Note: Should be called under the session lock. + assert(State != Closed && "JD is defunct"); SymbolNameVector SymbolsToRemove; std::vector<std::pair<JITDylib *, SymbolStringPtr>> SymbolsToFail; @@ -1525,6 +1559,7 @@ JITDylib::removeTracker(ResourceTracker &RT) { } void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) { + assert(State != Closed && "JD is defunct"); assert(&DstRT != &SrcRT && "No-op transfers shouldn't call transferTracker"); assert(&DstRT.getJITDylib() == this && "DstRT is not for this JITDylib"); assert(&SrcRT.getJITDylib() == this && "SrcRT is not for this JITDylib"); @@ -1536,9 +1571,22 @@ void JITDylib::transferTracker(ResourceTracker &DstRT, ResourceTracker &SrcRT) { } // Update trackers for any active materialization responsibilities. 
- for (auto &KV : MRTrackers) { - if (KV.second == &SrcRT) - KV.second = &DstRT; + { + auto I = TrackerMRs.find(&SrcRT); + if (I != TrackerMRs.end()) { + auto &SrcMRs = I->second; + auto &DstMRs = TrackerMRs[&DstRT]; + for (auto *MR : SrcMRs) + MR->RT = &DstRT; + if (DstMRs.empty()) + DstMRs = std::move(SrcMRs); + else + for (auto *MR : SrcMRs) + DstMRs.insert(MR); + // Erase SrcRT entry in TrackerMRs. Use &SrcRT key rather than iterator I + // for this, since I may have been invalidated by 'TrackerMRs[&DstRT]'. + TrackerMRs.erase(&SrcRT); + } } // If we're transfering to the default tracker we just need to delete the @@ -1872,6 +1920,40 @@ Expected<JITDylib &> ExecutionSession::createJITDylib(std::string Name) { return JD; } +Error ExecutionSession::removeJITDylib(JITDylib &JD) { + // Keep JD alive throughout this routine, even if all other references + // have been dropped. + JITDylibSP JDKeepAlive = &JD; + + // Set JD to 'Closing' state and remove JD from the ExecutionSession. + runSessionLocked([&] { + assert(JD.State == JITDylib::Open && "JD already closed"); + JD.State = JITDylib::Closing; + auto I = llvm::find(JDs, &JD); + assert(I != JDs.end() && "JD does not appear in session JDs"); + JDs.erase(I); + }); + + // Clear the JITDylib. + auto Err = JD.clear(); + + // Set JD to closed state. Clear remaining data structures. + runSessionLocked([&] { + assert(JD.State == JITDylib::Closing && "JD should be closing"); + JD.State = JITDylib::Closed; + assert(JD.Symbols.empty() && "JD.Symbols is not empty after clear"); + assert(JD.UnmaterializedInfos.empty() && + "JD.UnmaterializedInfos is not empty after clear"); + assert(JD.MaterializingInfos.empty() && + "JD.MaterializingInfos is not empty after clear"); + assert(JD.TrackerSymbols.empty() && + "TrackerSymbols is not empty after clear"); + JD.DefGenerators.clear(); + JD.LinkOrder.clear(); + }); + return Err; +} + std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { if (JDs.empty()) return {}; @@ -1883,6 +1965,8 @@ std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) { for (auto &JD : JDs) { + assert(JD->State == Open && "JD is defunct"); + if (Visited.count(JD.get())) continue; @@ -2311,8 +2395,11 @@ void ExecutionSession::OL_applyQueryPhase1( }); // Build the definition generator stack for this JITDylib. - for (auto &DG : reverse(JD.DefGenerators)) - IPLS->CurDefGeneratorStack.push_back(DG); + runSessionLocked([&] { + IPLS->CurDefGeneratorStack.reserve(JD.DefGenerators.size()); + for (auto &DG : reverse(JD.DefGenerators)) + IPLS->CurDefGeneratorStack.push_back(DG); + }); // Flag that we've done our initialization. 
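The new ExecutionSession::removeJITDylib above formalizes JITDylib teardown: the dylib is switched to Closing, cleared, detached from the session, and then marked Closed, after which the asserts added throughout this file treat it as defunct. A hedged usage sketch, assuming an already-constructed ExecutionSession ES and JITDylib JD:

#include "llvm/ExecutionEngine/Orc/Core.h"

// Illustrative only: tears down one JITDylib; any error from clearing its
// resource trackers is propagated to the caller.
static llvm::Error closeDylib(llvm::orc::ExecutionSession &ES,
                              llvm::orc::JITDylib &JD) {
  return ES.removeJITDylib(JD);
}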
IPLS->NewJITDylib = false; @@ -2629,17 +2716,15 @@ void ExecutionSession::OL_completeLookup( LLVM_DEBUG(dbgs() << "Adding MUs to dispatch:\n"); for (auto &KV : CollectedUMIs) { - auto &JD = *KV.first; LLVM_DEBUG({ + auto &JD = *KV.first; dbgs() << " For " << JD.getName() << ": Adding " << KV.second.size() << " MUs.\n"; }); for (auto &UMI : KV.second) { - std::unique_ptr<MaterializationResponsibility> MR( - new MaterializationResponsibility( - &JD, std::move(UMI->MU->SymbolFlags), - std::move(UMI->MU->InitSymbol))); - JD.MRTrackers[MR.get()] = UMI->RT; + auto MR = createMaterializationResponsibility( + *UMI->RT, std::move(UMI->MU->SymbolFlags), + std::move(UMI->MU->InitSymbol)); OutstandingMUs.push_back( std::make_pair(std::move(UMI->MU), std::move(MR))); } @@ -2757,18 +2842,18 @@ void ExecutionSession::OL_destroyMaterializationResponsibility( assert(MR.SymbolFlags.empty() && "All symbols should have been explicitly materialized or failed"); - MR.JD->unlinkMaterializationResponsibility(MR); + MR.JD.unlinkMaterializationResponsibility(MR); } SymbolNameSet ExecutionSession::OL_getRequestedSymbols( const MaterializationResponsibility &MR) { - return MR.JD->getRequestedSymbols(MR.SymbolFlags); + return MR.JD.getRequestedSymbols(MR.SymbolFlags); } Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, const SymbolMap &Symbols) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " resolving " << Symbols << "\n"; + dbgs() << "In " << MR.JD.getName() << " resolving " << Symbols << "\n"; }); #ifndef NDEBUG for (auto &KV : Symbols) { @@ -2783,15 +2868,16 @@ Error ExecutionSession::OL_notifyResolved(MaterializationResponsibility &MR, } #endif - return MR.JD->resolve(MR, Symbols); + return MR.JD.resolve(MR, Symbols); } Error ExecutionSession::OL_notifyEmitted(MaterializationResponsibility &MR) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " emitting " << MR.SymbolFlags << "\n"; + dbgs() << "In " << MR.JD.getName() << " emitting " << MR.SymbolFlags + << "\n"; }); - if (auto Err = MR.JD->emit(MR, MR.SymbolFlags)) + if (auto Err = MR.JD.emit(MR, MR.SymbolFlags)) return Err; MR.SymbolFlags.clear(); @@ -2802,10 +2888,11 @@ Error ExecutionSession::OL_defineMaterializing( MaterializationResponsibility &MR, SymbolFlagsMap NewSymbolFlags) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " defining materializing symbols " + dbgs() << "In " << MR.JD.getName() << " defining materializing symbols " << NewSymbolFlags << "\n"; }); - if (auto AcceptedDefs = MR.JD->defineMaterializing(std::move(NewSymbolFlags))) { + if (auto AcceptedDefs = + MR.JD.defineMaterializing(std::move(NewSymbolFlags))) { // Add all newly accepted symbols to this responsibility object. 
for (auto &KV : *AcceptedDefs) MR.SymbolFlags.insert(KV); @@ -2817,14 +2904,14 @@ Error ExecutionSession::OL_defineMaterializing( void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { LLVM_DEBUG({ - dbgs() << "In " << MR.JD->getName() << " failing materialization for " + dbgs() << "In " << MR.JD.getName() << " failing materialization for " << MR.SymbolFlags << "\n"; }); JITDylib::FailedSymbolsWorklist Worklist; for (auto &KV : MR.SymbolFlags) - Worklist.push_back(std::make_pair(MR.JD.get(), KV.first)); + Worklist.push_back(std::make_pair(&MR.JD, KV.first)); MR.SymbolFlags.clear(); if (Worklist.empty()) @@ -2834,9 +2921,8 @@ void ExecutionSession::OL_notifyFailed(MaterializationResponsibility &MR) { std::shared_ptr<SymbolDependenceMap> FailedSymbols; runSessionLocked([&]() { - auto RTI = MR.JD->MRTrackers.find(&MR); - assert(RTI != MR.JD->MRTrackers.end() && "No tracker for this"); - if (RTI->second->isDefunct()) + // If the tracker is defunct then there's nothing to do here. + if (MR.RT->isDefunct()) return; std::tie(FailedQueries, FailedSymbols) = @@ -2858,12 +2944,12 @@ Error ExecutionSession::OL_replace(MaterializationResponsibility &MR, if (MU->getInitializerSymbol() == MR.InitSymbol) MR.InitSymbol = nullptr; - LLVM_DEBUG(MR.JD->getExecutionSession().runSessionLocked([&]() { - dbgs() << "In " << MR.JD->getName() << " replacing symbols with " << *MU + LLVM_DEBUG(MR.JD.getExecutionSession().runSessionLocked([&]() { + dbgs() << "In " << MR.JD.getName() << " replacing symbols with " << *MU << "\n"; });); - return MR.JD->replace(MR, std::move(MU)); + return MR.JD.replace(MR, std::move(MU)); } Expected<std::unique_ptr<MaterializationResponsibility>> @@ -2886,8 +2972,8 @@ ExecutionSession::OL_delegate(MaterializationResponsibility &MR, MR.SymbolFlags.erase(I); } - return MR.JD->delegate(MR, std::move(DelegatedFlags), - std::move(DelegatedInitSymbol)); + return MR.JD.delegate(MR, std::move(DelegatedFlags), + std::move(DelegatedInitSymbol)); } void ExecutionSession::OL_addDependencies( @@ -2899,7 +2985,7 @@ void ExecutionSession::OL_addDependencies( }); assert(MR.SymbolFlags.count(Name) && "Symbol not covered by this MaterializationResponsibility instance"); - MR.JD->addDependencies(Name, Dependencies); + MR.JD.addDependencies(Name, Dependencies); } void ExecutionSession::OL_addDependenciesForAll( @@ -2910,7 +2996,7 @@ void ExecutionSession::OL_addDependenciesForAll( << Dependencies << "\n"; }); for (auto &KV : MR.SymbolFlags) - MR.JD->addDependencies(KV.first, Dependencies); + MR.JD.addDependencies(KV.first, Dependencies); } #ifndef NDEBUG diff --git a/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 1b7fdb588275..0de76ab78e0f 100644 --- a/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/contrib/llvm-project/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1301,7 +1301,7 @@ RuntimeDyldELF::processRelocationRef( MemMgr.allowStubAllocation()) { resolveAArch64Branch(SectionID, Value, RelI, Stubs); } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) { - // Craete new GOT entry or find existing one. If GOT entry is + // Create new GOT entry or find existing one. If GOT entry is // to be created, then we also emit ABS64 relocation for it. 
uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64); resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend, diff --git a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ce998df757ec..18f1a2314853 100644 --- a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -993,6 +993,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( Value *ST = ConstantInt::get(I32Ty, 1); llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop( Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop"); + Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator()); + AllocaIP = Builder.saveIP(); InsertPointTy AfterIP = applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, true); BasicBlock *LoopAfterBB = AfterIP.getBlock(); diff --git a/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp index 7734c0a8de58..c9748e1387eb 100644 --- a/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp +++ b/contrib/llvm-project/llvm/lib/IR/AsmWriter.cpp @@ -353,12 +353,11 @@ void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) { // Scan the name to see if it needs quotes first. bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0])); if (!NeedsQuotes) { - for (unsigned i = 0, e = Name.size(); i != e; ++i) { + for (unsigned char C : Name) { // By making this unsigned, the value passed in to isalnum will always be // in the range 0-255. This is important when building with MSVC because // its implementation will assert. This situation can arise when dealing // with UTF-8 multibyte characters. - unsigned char C = Name[i]; if (!isalnum(static_cast<unsigned char>(C)) && C != '-' && C != '.' && C != '_') { NeedsQuotes = true; @@ -1309,27 +1308,8 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, bool FromValue = false); static void WriteOptimizationInfo(raw_ostream &Out, const User *U) { - if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) { - // 'Fast' is an abbreviation for all fast-math-flags. 
- if (FPO->isFast()) - Out << " fast"; - else { - if (FPO->hasAllowReassoc()) - Out << " reassoc"; - if (FPO->hasNoNaNs()) - Out << " nnan"; - if (FPO->hasNoInfs()) - Out << " ninf"; - if (FPO->hasNoSignedZeros()) - Out << " nsz"; - if (FPO->hasAllowReciprocal()) - Out << " arcp"; - if (FPO->hasAllowContract()) - Out << " contract"; - if (FPO->hasApproxFunc()) - Out << " afn"; - } - } + if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) + Out << FPO->getFastMathFlags(); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(U)) { diff --git a/contrib/llvm-project/llvm/lib/IR/Core.cpp b/contrib/llvm-project/llvm/lib/IR/Core.cpp index 905372982dc2..2c396ae97499 100644 --- a/contrib/llvm-project/llvm/lib/IR/Core.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Core.cpp @@ -2266,6 +2266,14 @@ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, unwrap<Constant>(Aliasee), unwrap(M))); } +LLVMValueRef LLVMAddAlias2(LLVMModuleRef M, LLVMTypeRef ValueTy, + unsigned AddrSpace, LLVMValueRef Aliasee, + const char *Name) { + return wrap(GlobalAlias::create(unwrap(ValueTy), AddrSpace, + GlobalValue::ExternalLinkage, Name, + unwrap<Constant>(Aliasee), unwrap(M))); +} + LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getNamedAlias(Name)); diff --git a/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp index ca7dafc814ce..548962bd6a98 100644 --- a/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/IR/DIBuilder.cpp @@ -34,7 +34,20 @@ static cl::opt<bool> DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU) : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr), ValueFn(nullptr), LabelFn(nullptr), - AllowUnresolvedNodes(AllowUnresolvedNodes) {} + AllowUnresolvedNodes(AllowUnresolvedNodes) { + if (CUNode) { + if (const auto &ETs = CUNode->getEnumTypes()) + AllEnumTypes.assign(ETs.begin(), ETs.end()); + if (const auto &RTs = CUNode->getRetainedTypes()) + AllRetainTypes.assign(RTs.begin(), RTs.end()); + if (const auto &GVs = CUNode->getGlobalVariables()) + AllGVs.assign(GVs.begin(), GVs.end()); + if (const auto &IMs = CUNode->getImportedEntities()) + AllImportedModules.assign(IMs.begin(), IMs.end()); + if (const auto &MNs = CUNode->getMacros()) + AllMacrosPerParent.insert({nullptr, {MNs.begin(), MNs.end()}}); + } +} void DIBuilder::trackIfUnresolved(MDNode *N) { if (!N) diff --git a/contrib/llvm-project/llvm/lib/IR/Instructions.cpp b/contrib/llvm-project/llvm/lib/IR/Instructions.cpp index c42df49d97ea..ad27a6d8c08e 100644 --- a/contrib/llvm-project/llvm/lib/IR/Instructions.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Instructions.cpp @@ -2474,7 +2474,7 @@ bool ShuffleVectorInst::isReplicationMask(ArrayRef<int> Mask, // Additionally, mask size is a replication factor multiplied by vector size, // which further significantly reduces the search space. - // Before doing that, let's perform basic sanity check first. + // Before doing that, let's perform basic correctness checking first. 
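Among the hunks above, Core.cpp adds LLVMAddAlias2, a C-API entry point that takes the aliasee's value type and address space explicitly rather than deriving them from a pointer type. A hedged usage sketch follows; the function type, aliasee, and the alias name "bar" are illustrative:

#include "llvm-c/Core.h"

// Creates an externally visible alias "bar" for the function Fn, whose
// function type FnTy is supplied by the caller instead of being recovered
// from Fn's pointer type.
static LLVMValueRef addFunctionAlias(LLVMModuleRef M, LLVMTypeRef FnTy,
                                     LLVMValueRef Fn) {
  return LLVMAddAlias2(M, FnTy, /*AddrSpace=*/0, Fn, "bar");
}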
int Largest = -1; for (int MaskElt : Mask) { if (MaskElt == UndefMaskElem) diff --git a/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp b/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp index 7552906fd07a..9206cd37a6d1 100644 --- a/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp +++ b/contrib/llvm-project/llvm/lib/IR/IntrinsicInst.cpp @@ -358,13 +358,13 @@ Value *VPIntrinsic::getMemoryPointerParam() const { Optional<unsigned> VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - return None; - -#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS) \ - case Intrinsic::VPID: \ - return POINTERPOS; + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_MEMOP(POINTERPOS, ...) return POINTERPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } + return None; } /// \return The data (payload) operand of this store or scatter. @@ -378,52 +378,51 @@ Value *VPIntrinsic::getMemoryDataParam() const { Optional<unsigned> VPIntrinsic::getMemoryDataParamPos(Intrinsic::ID VPID) { switch (VPID) { default: - return None; - -#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS) \ - case Intrinsic::VPID: \ - return DATAPOS; + break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) return DATAPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } + return None; } bool VPIntrinsic::isVPIntrinsic(Intrinsic::ID ID) { switch (ID) { default: - return false; - + break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ case Intrinsic::VPID: \ - break; + return true; #include "llvm/IR/VPIntrinsics.def" } - return true; + return false; } // Equivalent non-predicated opcode Optional<unsigned> VPIntrinsic::getFunctionalOpcodeForVP(Intrinsic::ID ID) { - Optional<unsigned> FunctionalOC; switch (ID) { default: break; #define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: -#define HANDLE_VP_TO_OPC(OPC) FunctionalOC = Instruction::OPC; -#define END_REGISTER_VP_INTRINSIC(...) break; +#define VP_PROPERTY_FUNCTIONAL_OPC(OPC) return Instruction::OPC; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } - - return FunctionalOC; + return None; } Intrinsic::ID VPIntrinsic::getForOpcode(unsigned IROPC) { switch (IROPC) { default: - return Intrinsic::not_intrinsic; + break; -#define HANDLE_VP_TO_OPC(OPC) case Instruction::OPC: +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) break; +#define VP_PROPERTY_FUNCTIONAL_OPC(OPC) case Instruction::OPC: #define END_REGISTER_VP_INTRINSIC(VPID) return Intrinsic::VPID; #include "llvm/IR/VPIntrinsics.def" } + return Intrinsic::not_intrinsic; } bool VPIntrinsic::canIgnoreVectorLengthParam() const { @@ -516,13 +515,13 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, bool VPReductionIntrinsic::isVPReduction(Intrinsic::ID ID) { switch (ID) { default: - return false; -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ break; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, ...) 
return true; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" } - return true; + return false; } unsigned VPReductionIntrinsic::getVectorParamPos() const { @@ -535,24 +534,26 @@ unsigned VPReductionIntrinsic::getStartParamPos() const { Optional<unsigned> VPReductionIntrinsic::getVectorParamPos(Intrinsic::ID ID) { switch (ID) { -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ - return VECTORPOS; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return VECTORPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" default: - return None; + break; } + return None; } Optional<unsigned> VPReductionIntrinsic::getStartParamPos(Intrinsic::ID ID) { switch (ID) { -#define HANDLE_VP_REDUCTION(VPID, STARTPOS, VECTORPOS) \ - case Intrinsic::VPID: \ - return STARTPOS; +#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID: +#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) return STARTPOS; +#define END_REGISTER_VP_INTRINSIC(VPID) break; #include "llvm/IR/VPIntrinsics.def" default: - return None; + break; } + return None; } Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { diff --git a/contrib/llvm-project/llvm/lib/IR/Operator.cpp b/contrib/llvm-project/llvm/lib/IR/Operator.cpp index cf309ffd6212..d15fcfbc5b9f 100644 --- a/contrib/llvm-project/llvm/lib/IR/Operator.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Operator.cpp @@ -226,4 +226,25 @@ bool GEPOperator::collectOffset( } return true; } + +void FastMathFlags::print(raw_ostream &O) const { + if (all()) + O << " fast"; + else { + if (allowReassoc()) + O << " reassoc"; + if (noNaNs()) + O << " nnan"; + if (noInfs()) + O << " ninf"; + if (noSignedZeros()) + O << " nsz"; + if (allowReciprocal()) + O << " arcp"; + if (allowContract()) + O << " contract"; + if (approxFunc()) + O << " afn"; + } +} } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp b/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp index d0c1517f480b..a03fafec9fac 100644 --- a/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp +++ b/contrib/llvm-project/llvm/lib/IR/PassTimingInfo.cpp @@ -187,7 +187,7 @@ Timer &TimePassesHandler::getPassTimer(StringRef PassID) { Timer *T = new Timer(PassID, FullDesc, TG); Timers.emplace_back(T); - assert(Count == Timers.size() && "sanity check"); + assert(Count == Timers.size() && "Timers vector not adjusted correctly."); return *T; } diff --git a/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp b/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp index 9be6de693ee3..2117527a64f0 100644 --- a/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/IR/SafepointIRVerifier.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// // -// Run a sanity check on the IR to ensure that Safepoints - if they've been -// inserted - were inserted correctly. In particular, look for use of -// non-relocated values after a safepoint. It's primary use is to check the +// Run a basic correctness check on the IR to ensure that Safepoints - if +// they've been inserted - were inserted correctly. In particular, look for use +// of non-relocated values after a safepoint. 
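The FastMathFlags::print helper defined in the Operator.cpp hunk above is what WriteOptimizationInfo now streams instead of spelling out each flag by hand. A short sketch of the output, assuming the usual LLVM headers are on the include path (each flag gets a leading space, and " fast" stands in for all of them):

    #include "llvm/IR/Operator.h"            // FastMathFlags is declared here in this tree
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      FastMathFlags FMF;
      FMF.setNoNaNs();
      FMF.setNoInfs();
      FMF.print(errs());                     // prints " nnan ninf" (print() is the helper added above)
      errs() << '\n';

      FMF.setFast();                         // turns on every flag
      FMF.print(errs());                     // prints " fast"
      errs() << '\n';
      return 0;
    }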
It's primary use is to check the // correctness of safepoint insertion immediately after insertion, but it can // also be used to verify that later transforms have not found a way to break // safepoint semenatics. diff --git a/contrib/llvm-project/llvm/lib/IR/Verifier.cpp b/contrib/llvm-project/llvm/lib/IR/Verifier.cpp index dc4370d4b6ed..154b59835b01 100644 --- a/contrib/llvm-project/llvm/lib/IR/Verifier.cpp +++ b/contrib/llvm-project/llvm/lib/IR/Verifier.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This file defines the function verifier interface, that can be used for some -// sanity checking of input to the system. +// basic correctness checking of input to the system. // // Note that this does not provide full `Java style' security and verifications, // instead it just tries to ensure that code is well-formed. @@ -1604,7 +1604,7 @@ Verifier::visitModuleFlag(const MDNode *Op, Assert(ID, "invalid ID operand in module flag (expected metadata string)", Op->getOperand(1)); - // Sanity check the values for behaviors with additional requirements. + // Check the values for behaviors with additional requirements. switch (MFB) { case Module::Error: case Module::Warning: @@ -5269,24 +5269,32 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Op0ElemTy = cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType(); break; - case Intrinsic::matrix_column_major_load: + case Intrinsic::matrix_column_major_load: { Stride = dyn_cast<ConstantInt>(Call.getArgOperand(1)); NumRows = cast<ConstantInt>(Call.getArgOperand(3)); NumColumns = cast<ConstantInt>(Call.getArgOperand(4)); ResultTy = cast<VectorType>(Call.getType()); - Op0ElemTy = - cast<PointerType>(Call.getArgOperand(0)->getType())->getElementType(); + + PointerType *Op0PtrTy = + cast<PointerType>(Call.getArgOperand(0)->getType()); + if (!Op0PtrTy->isOpaque()) + Op0ElemTy = Op0PtrTy->getElementType(); break; - case Intrinsic::matrix_column_major_store: + } + case Intrinsic::matrix_column_major_store: { Stride = dyn_cast<ConstantInt>(Call.getArgOperand(2)); NumRows = cast<ConstantInt>(Call.getArgOperand(4)); NumColumns = cast<ConstantInt>(Call.getArgOperand(5)); ResultTy = cast<VectorType>(Call.getArgOperand(0)->getType()); Op0ElemTy = cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType(); - Op1ElemTy = - cast<PointerType>(Call.getArgOperand(1)->getType())->getElementType(); + + PointerType *Op1PtrTy = + cast<PointerType>(Call.getArgOperand(1)->getType()); + if (!Op1PtrTy->isOpaque()) + Op1ElemTy = Op1PtrTy->getElementType(); break; + } default: llvm_unreachable("unexpected intrinsic"); } @@ -5295,9 +5303,10 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { ResultTy->getElementType()->isFloatingPointTy(), "Result type must be an integer or floating-point type!", IF); - Assert(ResultTy->getElementType() == Op0ElemTy, - "Vector element type mismatch of the result and first operand " - "vector!", IF); + if (Op0ElemTy) + Assert(ResultTy->getElementType() == Op0ElemTy, + "Vector element type mismatch of the result and first operand " + "vector!", IF); if (Op1ElemTy) Assert(ResultTy->getElementType() == Op1ElemTy, diff --git a/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp index d41c7d3217d7..0d1a864f31ac 100644 --- a/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp +++ b/contrib/llvm-project/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ 
-372,7 +372,7 @@ Error appendToError(Error Err, StringRef After) { /// This function populates a DynamicEntries struct using an ELFT::DynRange. /// After populating the struct, the members are validated with -/// some basic sanity checks. +/// some basic correctness checks. /// /// @param Dyn Target DynamicEntries struct to populate. /// @param DynTable Source dynamic table. diff --git a/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp index 154b2d051f34..2ca921017171 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCAsmStreamer.cpp @@ -1069,16 +1069,14 @@ void MCAsmStreamer::PrintQuotedString(StringRef Data, raw_ostream &OS) const { OS << '"'; if (MAI->hasPairedDoubleQuoteStringConstants()) { - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; + for (unsigned char C : Data) { if (C == '"') OS << "\"\""; else OS << (char)C; } } else { - for (unsigned i = 0, e = Data.size(); i != e; ++i) { - unsigned char C = Data[i]; + for (unsigned char C : Data) { if (C == '"' || C == '\\') { OS << '\\' << (char)C; continue; diff --git a/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp index 1ba999a63113..fbf3c860368a 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCELFStreamer.cpp @@ -646,8 +646,6 @@ void MCELFStreamer::emitBundleAlignMode(unsigned AlignPow2) { void MCELFStreamer::emitBundleLock(bool AlignToEnd) { MCSection &Sec = *getCurrentSectionOnly(); - // Sanity checks - // if (!getAssembler().isBundlingEnabled()) report_fatal_error(".bundle_lock forbidden when bundling is disabled"); @@ -667,7 +665,6 @@ void MCELFStreamer::emitBundleLock(bool AlignToEnd) { void MCELFStreamer::emitBundleUnlock() { MCSection &Sec = *getCurrentSectionOnly(); - // Sanity checks if (!getAssembler().isBundlingEnabled()) report_fatal_error(".bundle_unlock forbidden when bundling is disabled"); else if (!isBundleLocked()) diff --git a/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp index ddc41d0a08ab..e95019c12db7 100644 --- a/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -676,14 +676,14 @@ EndStmt: getContext().getELFSection(SectionName, Type, Flags, Size, GroupName, IsComdat, UniqueID, LinkedToSym); getStreamer().SwitchSection(Section, Subsection); - if (Section->getType() != Type && + // Check that flags are used consistently. However, the GNU assembler permits + // to leave out in subsequent uses of the same sections; for compatibility, + // do likewise. + if (!TypeName.empty() && Section->getType() != Type && !allowSectionTypeMismatch(getContext().getTargetTriple(), SectionName, Type)) Error(loc, "changed section type for " + SectionName + ", expected: 0x" + utohexstr(Section->getType())); - // Check that flags are used consistently. However, the GNU assembler permits - // to leave out in subsequent uses of the same sections; for compatibility, - // do likewise. 
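The PrintQuotedString hunk above keeps two quoting dialects: assemblers with paired double-quote string constants escape a quote by doubling it, everyone else gets backslash escapes for quotes and backslashes (the real routine also escapes non-printable bytes, which is omitted here). A standalone sketch of just that difference; the function names are illustrative:

    #include <iostream>
    #include <string>

    // Paired-double-quote dialect: '"' becomes '""'.
    static std::string quotePaired(const std::string &Data) {
      std::string Out = "\"";
      for (unsigned char C : Data) {
        if (C == '"')
          Out += "\"\"";
        else
          Out += (char)C;
      }
      return Out + "\"";
    }

    // Default dialect: '"' and '\' get a backslash prefix.
    static std::string quoteEscaped(const std::string &Data) {
      std::string Out = "\"";
      for (unsigned char C : Data) {
        if (C == '"' || C == '\\')
          Out += '\\';
        Out += (char)C;
      }
      return Out + "\"";
    }

    int main() {
      std::cout << quotePaired("say \"hi\"") << '\n';   // "say ""hi"""
      std::cout << quoteEscaped("say \"hi\"") << '\n';  // "say \"hi\""
    }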
if ((extraFlags || Size || !TypeName.empty()) && Section->getFlags() != Flags) Error(loc, "changed section flags for " + SectionName + ", expected: 0x" + utohexstr(Section->getFlags())); diff --git a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp index 646f416821ae..73c687331d30 100644 --- a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ using llvm::support::endian::write32le; namespace { +constexpr int OffsetLabelIntervalBits = 20; + using name = SmallString<COFF::NameSize>; enum AuxiliaryType { @@ -120,6 +122,8 @@ public: relocations Relocations; COFFSection(StringRef Name) : Name(std::string(Name)) {} + + SmallVector<COFFSymbol *, 1> OffsetSymbols; }; class WinCOFFObjectWriter : public MCObjectWriter { @@ -149,6 +153,7 @@ public: symbol_list WeakDefaults; bool UseBigObj; + bool UseOffsetLabels = false; bool EmitAddrsigSection = false; MCSectionCOFF *AddrsigSection; @@ -174,7 +179,7 @@ public: COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol); COFFSection *createSection(StringRef Name); - void defineSection(MCSectionCOFF const &Sec); + void defineSection(MCSectionCOFF const &Sec, const MCAsmLayout &Layout); COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol); void DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler, @@ -244,6 +249,11 @@ WinCOFFObjectWriter::WinCOFFObjectWriter( std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) : W(OS, support::little), TargetObjectWriter(std::move(MOTW)) { Header.Machine = TargetObjectWriter->getMachine(); + // Some relocations on ARM64 (the 21 bit ADRP relocations) have a slightly + // limited range for the immediate offset (+/- 1 MB); create extra offset + // label symbols with regular intervals to allow referencing a + // non-temporary symbol that is close enough. + UseOffsetLabels = Header.Machine == COFF::IMAGE_FILE_MACHINE_ARM64; } COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) { @@ -299,7 +309,8 @@ static uint32_t getAlignment(const MCSectionCOFF &Sec) { /// This function takes a section data object from the assembler /// and creates the associated COFF section staging object. -void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) { +void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec, + const MCAsmLayout &Layout) { COFFSection *Section = createSection(MCSec.getName()); COFFSymbol *Symbol = createSymbol(MCSec.getName()); Section->Symbol = Symbol; @@ -329,6 +340,20 @@ void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) { // Bind internal COFF section to MC section. Section->MCSection = &MCSec; SectionMap[&MCSec] = Section; + + if (UseOffsetLabels && !MCSec.getFragmentList().empty()) { + const uint32_t Interval = 1 << OffsetLabelIntervalBits; + uint32_t N = 1; + for (uint32_t Off = Interval, E = Layout.getSectionAddressSize(&MCSec); + Off < E; Off += Interval) { + auto Name = ("$L" + MCSec.getName() + "_" + Twine(N++)).str(); + COFFSymbol *Label = createSymbol(Name); + Label->Section = Section; + Label->Data.StorageClass = COFF::IMAGE_SYM_CLASS_LABEL; + Label->Data.Value = Off; + Section->OffsetSymbols.push_back(Label); + } + } } static uint64_t getSymbolValue(const MCSymbol &Symbol, @@ -688,7 +713,7 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, // "Define" each section & symbol. This creates section & symbol // entries in the staging area. 
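To make the offset-label scheme above concrete: with OffsetLabelIntervalBits = 20 the writer plants a label every 1 MiB of section data, and the relocation code in the next hunk rebases a fixup onto the closest preceding label by shifting the offset. A standalone sketch of that arithmetic; the section size and offsets are made-up numbers:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    constexpr int OffsetLabelIntervalBits = 20;                    // 1 MiB interval, as in the hunk

    int main() {
      const uint64_t SectionSize = 0x340000;                       // 3.25 MiB of section data
      const uint64_t Interval = uint64_t(1) << OffsetLabelIntervalBits;

      // Label creation, as in defineSection(): one label per interval boundary.
      std::vector<uint64_t> OffsetSymbols;                         // values of $L<sec>_1, _2, ...
      for (uint64_t Off = Interval; Off < SectionSize; Off += Interval)
        OffsetSymbols.push_back(Off);                              // 0x100000, 0x200000, 0x300000

      // Selection, as in recordRelocation(): pick the preceding label, rebase the addend.
      uint64_t FixedValue = 0x2A0000;                              // offset of the target in the section
      uint64_t LabelIndex = FixedValue >> OffsetLabelIntervalBits; // == 2
      if (LabelIndex > 0 && !OffsetSymbols.empty()) {
        uint64_t LabelValue = LabelIndex <= OffsetSymbols.size()
                                  ? OffsetSymbols[LabelIndex - 1]
                                  : OffsetSymbols.back();
        FixedValue -= LabelValue;                                  // 0x2A0000 - 0x200000 = 0xA0000
      }
      std::cout << std::hex << "remaining addend: 0x" << FixedValue << '\n';
      // An ADRP against that label only has to span 0xA0000 bytes, comfortably
      // inside the +/- 1 MiB reach of the 21-bit relocation.
    }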
for (const auto &Section : Asm) - defineSection(static_cast<const MCSectionCOFF &>(Section)); + defineSection(static_cast<const MCSectionCOFF &>(Section), Layout); for (const MCSymbol &Symbol : Asm.symbols()) if (!Symbol.isTemporary()) @@ -774,8 +799,23 @@ void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm, assert( SectionMap.find(TargetSection) != SectionMap.end() && "Section must already have been defined in executePostLayoutBinding!"); - Reloc.Symb = SectionMap[TargetSection]->Symbol; + COFFSection *Section = SectionMap[TargetSection]; + Reloc.Symb = Section->Symbol; FixedValue += Layout.getSymbolOffset(A); + // Technically, we should do the final adjustments of FixedValue (below) + // before picking an offset symbol, otherwise we might choose one which + // is slightly too far away. The relocations where it really matters + // (arm64 adrp relocations) don't get any offset though. + if (UseOffsetLabels && !Section->OffsetSymbols.empty()) { + uint64_t LabelIndex = FixedValue >> OffsetLabelIntervalBits; + if (LabelIndex > 0) { + if (LabelIndex <= Section->OffsetSymbols.size()) + Reloc.Symb = Section->OffsetSymbols[LabelIndex - 1]; + else + Reloc.Symb = Section->OffsetSymbols.back(); + FixedValue -= Reloc.Symb->Data.Value; + } + } } else { assert( SymbolMap.find(&A) != SymbolMap.end() && diff --git a/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp b/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp index 0ab845a4c28f..d8283f8d2682 100644 --- a/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/MCA/InstrBuilder.cpp @@ -612,7 +612,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n'); LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n'); - // Sanity check on the instruction descriptor. + // Validation check on the instruction descriptor. if (Error Err = verifyInstrDesc(*ID, MCI)) return std::move(Err); diff --git a/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp b/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp index 6e021d3d9232..2b11f73b19df 100644 --- a/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp +++ b/contrib/llvm-project/llvm/lib/MCA/Stages/ExecuteStage.cpp @@ -188,7 +188,7 @@ Error ExecuteStage::execute(InstRef &IR) { #ifndef NDEBUG // Ensure that the HWS has not stored this instruction in its queues. 
- HWS.sanityCheck(IR); + HWS.instructionCheck(IR); #endif if (IR.getInstruction()->isEliminated()) diff --git a/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp b/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp index 50035d6c7523..cf1f12d9a9a7 100644 --- a/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp +++ b/contrib/llvm-project/llvm/lib/Object/ELFObjectFile.cpp @@ -682,7 +682,7 @@ readDynsymVersionsImpl(const ELFFile<ELFT> &EF, std::vector<VersionEntry> Ret; size_t I = 0; - for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) { + for (const ELFSymbolRef &Sym : Symbols) { ++I; Expected<const typename ELFT::Versym *> VerEntryOrErr = EF.template getEntry<typename ELFT::Versym>(*VerSec, I); @@ -691,7 +691,7 @@ readDynsymVersionsImpl(const ELFFile<ELFT> &EF, " from " + describe(EF, *VerSec) + ": " + toString(VerEntryOrErr.takeError())); - Expected<uint32_t> FlagsOrErr = It->getFlags(); + Expected<uint32_t> FlagsOrErr = Sym.getFlags(); if (!FlagsOrErr) return createError("unable to read flags for symbol with index " + Twine(I) + ": " + toString(FlagsOrErr.takeError())); diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp index 5f38ca13cfc2..66ad16db1ba4 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/COFFEmitter.cpp @@ -476,29 +476,25 @@ static bool writeCOFF(COFFParser &CP, raw_ostream &OS) { assert(OS.tell() == CP.SectionTableStart); // Output section table. - for (std::vector<COFFYAML::Section>::iterator i = CP.Obj.Sections.begin(), - e = CP.Obj.Sections.end(); - i != e; ++i) { - OS.write(i->Header.Name, COFF::NameSize); - OS << binary_le(i->Header.VirtualSize) - << binary_le(i->Header.VirtualAddress) - << binary_le(i->Header.SizeOfRawData) - << binary_le(i->Header.PointerToRawData) - << binary_le(i->Header.PointerToRelocations) - << binary_le(i->Header.PointerToLineNumbers) - << binary_le(i->Header.NumberOfRelocations) - << binary_le(i->Header.NumberOfLineNumbers) - << binary_le(i->Header.Characteristics); + for (const COFFYAML::Section &S : CP.Obj.Sections) { + OS.write(S.Header.Name, COFF::NameSize); + OS << binary_le(S.Header.VirtualSize) + << binary_le(S.Header.VirtualAddress) + << binary_le(S.Header.SizeOfRawData) + << binary_le(S.Header.PointerToRawData) + << binary_le(S.Header.PointerToRelocations) + << binary_le(S.Header.PointerToLineNumbers) + << binary_le(S.Header.NumberOfRelocations) + << binary_le(S.Header.NumberOfLineNumbers) + << binary_le(S.Header.Characteristics); } assert(OS.tell() == CP.SectionTableStart + CP.SectionTableSize); unsigned CurSymbol = 0; StringMap<unsigned> SymbolTableIndexMap; - for (std::vector<COFFYAML::Symbol>::iterator I = CP.Obj.Symbols.begin(), - E = CP.Obj.Symbols.end(); - I != E; ++I) { - SymbolTableIndexMap[I->Name] = CurSymbol; - CurSymbol += 1 + I->Header.NumberOfAuxSymbols; + for (const COFFYAML::Symbol &Sym : CP.Obj.Symbols) { + SymbolTableIndexMap[Sym.Name] = CurSymbol; + CurSymbol += 1 + Sym.Header.NumberOfAuxSymbols; } // Output section data. 
diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp index fdf9aeae1622..e0dde4433d24 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -155,6 +155,10 @@ void ScalarEnumerationTraits<ELFYAML::ELF_NT>::enumeration( ECase(NT_FREEBSD_PROCSTAT_OSREL); ECase(NT_FREEBSD_PROCSTAT_PSSTRINGS); ECase(NT_FREEBSD_PROCSTAT_AUXV); + // NetBSD core note types. + ECase(NT_NETBSDCORE_PROCINFO); + ECase(NT_NETBSDCORE_AUXV); + ECase(NT_NETBSDCORE_LWPSTATUS); // OpenBSD core note types. ECase(NT_OPENBSD_PROCINFO); ECase(NT_OPENBSD_AUXV); diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp index c653c29ec9a7..e5ffb12df434 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -54,6 +54,7 @@ private: void writeNameList(raw_ostream &OS); void writeStringTable(raw_ostream &OS); void writeExportTrie(raw_ostream &OS); + void writeDynamicSymbolTable(raw_ostream &OS); void dumpExportEntry(raw_ostream &OS, MachOYAML::ExportEntry &Entry); void ZeroToOffset(raw_ostream &OS, size_t offset); @@ -482,6 +483,7 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { MachO::dyld_info_command *DyldInfoOnlyCmd = 0; MachO::symtab_command *SymtabCmd = 0; + MachO::dysymtab_command *DSymtabCmd = 0; for (auto &LC : Obj.LoadCommands) { switch (LC.Data.load_command_data.cmd) { case MachO::LC_SYMTAB: @@ -504,6 +506,11 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) { WriteQueue.push_back(std::make_pair(DyldInfoOnlyCmd->export_off, &MachOWriter::writeExportTrie)); break; + case MachO::LC_DYSYMTAB: + DSymtabCmd = &LC.Data.dysymtab_command_data; + WriteQueue.push_back(std::make_pair( + DSymtabCmd->indirectsymoff, &MachOWriter::writeDynamicSymbolTable)); + break; } } @@ -556,6 +563,12 @@ void MachOWriter::writeStringTable(raw_ostream &OS) { } } +void MachOWriter::writeDynamicSymbolTable(raw_ostream &OS) { + for (auto Data : Obj.LinkEdit.IndirectSymbols) + OS.write(reinterpret_cast<const char *>(&Data), + sizeof(yaml::Hex32::BaseType)); +} + class UniversalWriter { public: UniversalWriter(yaml::YamlObjectFile &ObjectFile) diff --git a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp index c9562bd72258..f32009458110 100644 --- a/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp +++ b/contrib/llvm-project/llvm/lib/ObjectYAML/MachOYAML.cpp @@ -164,6 +164,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping( IO.mapOptional("ExportTrie", LinkEditData.ExportTrie); IO.mapOptional("NameList", LinkEditData.NameList); IO.mapOptional("StringTable", LinkEditData.StringTable); + IO.mapOptional("IndirectSymbols", LinkEditData.IndirectSymbols); } void MappingTraits<MachOYAML::RebaseOpcode>::mapping( diff --git a/contrib/llvm-project/llvm/lib/Option/OptTable.cpp b/contrib/llvm-project/llvm/lib/Option/OptTable.cpp index 37c2fcbab181..19e05b9272bb 100644 --- a/contrib/llvm-project/llvm/lib/Option/OptTable.cpp +++ b/contrib/llvm-project/llvm/lib/Option/OptTable.cpp @@ -150,10 +150,9 @@ OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase) for (StringSet<>::const_iterator I = PrefixesUnion.begin(), E = PrefixesUnion.end(); I != E; ++I) { StringRef Prefix = I->getKey(); - for (StringRef::const_iterator C = Prefix.begin(), CE = Prefix.end(); - C != CE; ++C) - if 
(!is_contained(PrefixChars, *C)) - PrefixChars.push_back(*C); + for (char C : Prefix) + if (!is_contained(PrefixChars, C)) + PrefixChars.push_back(C); } } diff --git a/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp b/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp index ac5dfdbdd540..de1b0ace7876 100644 --- a/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/contrib/llvm-project/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1765,6 +1765,8 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, if (LTOPreLink) addRequiredLTOPreLinkPasses(MPM); + MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); + return MPM; } diff --git a/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp b/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp index 8e6be6730ea4..27a6c519ff82 100644 --- a/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/contrib/llvm-project/llvm/lib/Passes/StandardInstrumentations.cpp @@ -225,8 +225,8 @@ std::string doSystemDiff(StringRef Before, StringRef After, return "Unable to read result."; // Clean up. - for (unsigned I = 0; I < NumFiles; ++I) { - std::error_code EC = sys::fs::remove(FileName[I]); + for (const std::string &I : FileName) { + std::error_code EC = sys::fs::remove(I); if (EC) return "Unable to remove temporary file."; } diff --git a/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index 94bd4807041d..c6691e321b3c 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -83,7 +83,6 @@ Error RawCoverageReader::readIntMax(uint64_t &Result, uint64_t MaxPlus1) { Error RawCoverageReader::readSize(uint64_t &Result) { if (auto Err = readULEB128(Result)) return Err; - // Sanity check the number. 
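RawCoverageReader::readSize above decodes a ULEB128 length and, as the check on the next line shows, rejects anything larger than the remaining coverage data; only the comment was dropped. For reference, a small sketch of decoding such a length with LLVM's helper (the bytes are the classic DWARF example value):

    #include "llvm/Support/LEB128.h"
    #include <cstdint>
    #include <iostream>

    int main() {
      const uint8_t Buf[] = {0xE5, 0x8E, 0x26};       // ULEB128 encoding of 624485
      unsigned N = 0;
      uint64_t Size = llvm::decodeULEB128(Buf, &N);
      std::cout << Size << " decoded from " << N << " bytes\n";   // 624485 decoded from 3 bytes

      const uint64_t RemainingData = 128;             // bytes left in the mapping buffer
      if (Size > RemainingData)
        std::cout << "malformed coverage mapping\n";  // the rejection readSize performs
    }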
if (Result > Data.size()) return make_error<CoverageMapError>(coveragemap_error::malformed); return Error::success(); diff --git a/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp index 1168ad27fe52..ab3487ecffe8 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/InstrProf.cpp @@ -657,19 +657,18 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, Input.sortByTargetValues(); auto I = ValueData.begin(); auto IE = ValueData.end(); - for (auto J = Input.ValueData.begin(), JE = Input.ValueData.end(); J != JE; - ++J) { - while (I != IE && I->Value < J->Value) + for (const InstrProfValueData &J : Input.ValueData) { + while (I != IE && I->Value < J.Value) ++I; - if (I != IE && I->Value == J->Value) { + if (I != IE && I->Value == J.Value) { bool Overflowed; - I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed); + I->Count = SaturatingMultiplyAdd(J.Count, Weight, I->Count, &Overflowed); if (Overflowed) Warn(instrprof_error::counter_overflow); ++I; continue; } - ValueData.insert(I, *J); + ValueData.insert(I, J); } } diff --git a/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp index b4e8025dbef9..885c1fe49240 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/InstrProfReader.cpp @@ -62,7 +62,6 @@ InstrProfReader::create(const Twine &Path) { Expected<std::unique_ptr<InstrProfReader>> InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { - // Sanity check the buffer. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); @@ -113,7 +112,6 @@ IndexedInstrProfReader::create(const Twine &Path, const Twine &RemappingPath) { Expected<std::unique_ptr<IndexedInstrProfReader>> IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer, std::unique_ptr<MemoryBuffer> RemappingBuffer) { - // Sanity check the buffer. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); diff --git a/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp new file mode 100644 index 000000000000..f8d13c74fac3 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/ProfileData/RawMemProfReader.cpp @@ -0,0 +1,121 @@ +//===- RawMemProfReader.cpp - Instrumented memory profiling reader --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains support for reading MemProf profiling data. 
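A note on the InstrProfValueSiteRecord::merge hunk above (the MemProf reader listing continues below): both value-data lists are sorted by Value, matching entries are combined with a weighted, saturating multiply-add, and entries only present in the input are spliced in as-is. A standalone sketch of the same walk; the container and the saturating helper are simplified stand-ins for LLVM's own:

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    struct ValueData { uint64_t Value, Count; };      // illustrative stand-in

    // Count = A * W + B, clamped to UINT64_MAX (__uint128_t extension used for brevity).
    static uint64_t satMulAdd(uint64_t A, uint64_t W, uint64_t B) {
      __uint128_t R = (__uint128_t)A * W + B;
      uint64_t Max = std::numeric_limits<uint64_t>::max();
      return R > Max ? Max : (uint64_t)R;
    }

    // Merge Input into Dst; both are sorted by Value and stay that way.
    static void merge(std::vector<ValueData> &Dst,
                      const std::vector<ValueData> &Input, uint64_t Weight) {
      auto I = Dst.begin();
      for (const ValueData &J : Input) {
        while (I != Dst.end() && I->Value < J.Value)
          ++I;
        if (I != Dst.end() && I->Value == J.Value) {
          I->Count = satMulAdd(J.Count, Weight, I->Count);
          ++I;
          continue;
        }
        I = Dst.insert(I, J);   // re-acquire I: std::vector::insert invalidates iterators
        ++I;
      }
    }

    int main() {
      std::vector<ValueData> Dst = {{10, 5}, {30, 1}};
      merge(Dst, {{10, 2}, {20, 4}}, /*Weight=*/3);
      for (const ValueData &VD : Dst)
        std::cout << VD.Value << ": " << VD.Count << '\n';   // 10: 11, 20: 4, 30: 1
    }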
+// +//===----------------------------------------------------------------------===// + +#include <cstdint> +#include <type_traits> + +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProfData.inc" +#include "llvm/ProfileData/RawMemProfReader.h" + +namespace llvm { +namespace memprof { +namespace { + +struct Summary { + uint64_t Version; + uint64_t TotalSizeBytes; + uint64_t NumSegments; + uint64_t NumMIBInfo; + uint64_t NumStackOffsets; +}; + +template <class T = uint64_t> inline T alignedRead(const char *Ptr) { + static_assert(std::is_pod<T>::value, "Not a pod type."); + assert(reinterpret_cast<size_t>(Ptr) % sizeof(T) == 0 && "Unaligned Read"); + return *reinterpret_cast<const T *>(Ptr); +} + +Summary computeSummary(const char *Start) { + auto *H = reinterpret_cast<const Header *>(Start); + + // Check alignment while reading the number of items in each section. + return Summary{ + H->Version, + H->TotalSize, + alignedRead(Start + H->SegmentOffset), + alignedRead(Start + H->MIBOffset), + alignedRead(Start + H->StackOffset), + }; +} + +} // namespace + +Expected<std::unique_ptr<RawMemProfReader>> +RawMemProfReader::create(const Twine &Path) { + auto BufferOr = MemoryBuffer::getFileOrSTDIN(Path, /*IsText=*/true); + if (std::error_code EC = BufferOr.getError()) + return errorCodeToError(EC); + + std::unique_ptr<MemoryBuffer> Buffer(BufferOr.get().release()); + + if (Buffer->getBufferSize() == 0) + return make_error<InstrProfError>(instrprof_error::empty_raw_profile); + + if (!RawMemProfReader::hasFormat(*Buffer)) + return make_error<InstrProfError>(instrprof_error::bad_magic); + + if (Buffer->getBufferSize() < sizeof(Header)) { + return make_error<InstrProfError>(instrprof_error::truncated); + } + + // The size of the buffer can be > header total size since we allow repeated + // serialization of memprof profiles to the same file. + uint64_t TotalSize = 0; + const char *Next = Buffer->getBufferStart(); + while (Next < Buffer->getBufferEnd()) { + auto *H = reinterpret_cast<const Header *>(Next); + if (H->Version != MEMPROF_RAW_VERSION) { + return make_error<InstrProfError>(instrprof_error::unsupported_version); + } + + TotalSize += H->TotalSize; + Next += H->TotalSize; + } + + if (Buffer->getBufferSize() != TotalSize) { + return make_error<InstrProfError>(instrprof_error::malformed); + } + + return std::make_unique<RawMemProfReader>(std::move(Buffer)); +} + +bool RawMemProfReader::hasFormat(const MemoryBuffer &Buffer) { + if (Buffer.getBufferSize() < sizeof(uint64_t)) + return false; + // Aligned read to sanity check that the buffer was allocated with at least 8b + // alignment. + const uint64_t Magic = alignedRead(Buffer.getBufferStart()); + return Magic == MEMPROF_RAW_MAGIC_64; +} + +void RawMemProfReader::printSummaries(raw_ostream &OS) const { + int Count = 0; + const char *Next = DataBuffer->getBufferStart(); + while (Next < DataBuffer->getBufferEnd()) { + auto Summary = computeSummary(Next); + OS << "MemProf Profile " << ++Count << "\n"; + OS << " Version: " << Summary.Version << "\n"; + OS << " TotalSizeBytes: " << Summary.TotalSizeBytes << "\n"; + OS << " NumSegments: " << Summary.NumSegments << "\n"; + OS << " NumMIBInfo: " << Summary.NumMIBInfo << "\n"; + OS << " NumStackOffsets: " << Summary.NumStackOffsets << "\n"; + // TODO: Print the build ids once we can record them using the + // sanitizer_procmaps library for linux. 
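RawMemProfReader::create above accepts a buffer holding one or more raw profiles serialized back to back and requires the chained TotalSize fields to cover the buffer exactly. A standalone sketch of that walk; the header layout is an illustrative stand-in for the real one in MemProfData.inc:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Only the fields the walk needs are modelled here.
    struct FakeHeader {
      uint64_t Magic;
      uint64_t Version;
      uint64_t TotalSize;   // size of this profile, header included
    };

    static bool coversBufferExactly(const char *Start, uint64_t BufferSize) {
      uint64_t Total = 0;
      const char *Next = Start;
      while (Next < Start + BufferSize) {
        FakeHeader H;
        std::memcpy(&H, Next, sizeof(H));   // the real reader does aligned in-place reads
        if (H.TotalSize == 0)
          return false;                     // guard not in the original; avoids looping forever
        Total += H.TotalSize;
        Next += H.TotalSize;
      }
      return Total == BufferSize;           // anything else is a malformed profile
    }

    int main() {
      std::vector<char> Buf(96, 0);         // two fake profiles: 64 + 32 bytes
      FakeHeader H1{0xABCD, 1, 64}, H2{0xABCD, 1, 32};
      std::memcpy(Buf.data(), &H1, sizeof(H1));
      std::memcpy(Buf.data() + 64, &H2, sizeof(H2));
      std::cout << coversBufferExactly(Buf.data(), Buf.size()) << '\n';   // 1
    }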
+ + auto *H = reinterpret_cast<const Header *>(Next); + Next += H->TotalSize; + } +} + +} // namespace memprof +} // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp index c99a19020511..eefb7c2ba627 100644 --- a/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/contrib/llvm-project/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1709,7 +1709,7 @@ setupMemoryBuffer(const Twine &Filename) { return EC; auto Buffer = std::move(BufferOrErr.get()); - // Sanity check the file. + // Check the file. if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint32_t>::max()) return sampleprof_error::too_large; diff --git a/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp b/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp index b3136a91e7f5..a3e41ccd199c 100644 --- a/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/AArch64TargetParser.cpp @@ -240,52 +240,4 @@ AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) { return C.ArchID; } return ArchKind::INVALID; -} - -// Parse a branch protection specification, which has the form -// standard | none | [bti,pac-ret[+b-key,+leaf]*] -// Returns true on success, with individual elements of the specification -// returned in `PBP`. Returns false in error, with `Err` containing -// an erroneous part of the spec. -bool AArch64::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, - StringRef &Err) { - PBP = {"none", "a_key", false}; - if (Spec == "none") - return true; // defaults are ok - - if (Spec == "standard") { - PBP.Scope = "non-leaf"; - PBP.BranchTargetEnforcement = true; - return true; - } - - SmallVector<StringRef, 4> Opts; - Spec.split(Opts, "+"); - for (int I = 0, E = Opts.size(); I != E; ++I) { - StringRef Opt = Opts[I].trim(); - if (Opt == "bti") { - PBP.BranchTargetEnforcement = true; - continue; - } - if (Opt == "pac-ret") { - PBP.Scope = "non-leaf"; - for (; I + 1 != E; ++I) { - StringRef PACOpt = Opts[I + 1].trim(); - if (PACOpt == "leaf") - PBP.Scope = "all"; - else if (PACOpt == "b-key") - PBP.Key = "b_key"; - else - break; - } - continue; - } - if (Opt == "") - Err = "<empty>"; - else - Err = Opt; - return false; - } - - return true; -} +}
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp b/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp index 459691923af8..241cfb1eedbe 100644 --- a/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ARMAttributeParser.cpp @@ -59,6 +59,10 @@ const ARMAttributeParser::DisplayHandler ARMAttributeParser::displayRoutines[] = ATTRIBUTE_HANDLER(DSP_extension), ATTRIBUTE_HANDLER(T2EE_use), ATTRIBUTE_HANDLER(Virtualization_use), + ATTRIBUTE_HANDLER(PAC_extension), + ATTRIBUTE_HANDLER(BTI_extension), + ATTRIBUTE_HANDLER(PACRET_use), + ATTRIBUTE_HANDLER(BTI_use), ATTRIBUTE_HANDLER(nodefaults), }; @@ -350,6 +354,28 @@ Error ARMAttributeParser::Virtualization_use(AttrType tag) { return parseStringAttribute("Virtualization_use", tag, makeArrayRef(strings)); } +Error ARMAttributeParser::PAC_extension(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Permitted", "Permitted in NOP space", + "Permitted"}; + return parseStringAttribute("PAC_extension", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::BTI_extension(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Permitted", "Permitted in NOP space", + "Permitted"}; + return parseStringAttribute("BTI_extension", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::PACRET_use(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Used", "Used"}; + return parseStringAttribute("PACRET_use", tag, makeArrayRef(strings)); +} + +Error ARMAttributeParser::BTI_use(ARMBuildAttrs::AttrType tag) { + static const char *strings[] = {"Not Used", "Used"}; + return parseStringAttribute("BTI_use", tag, makeArrayRef(strings)); +} + Error ARMAttributeParser::nodefaults(AttrType tag) { uint64_t value = de.getULEB128(cursor); printAttribute(tag, value, "Unspecified Tags UNDEFINED"); diff --git a/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp b/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp index f20521f2a2d4..815cfc62a4b0 100644 --- a/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ARMBuildAttrs.cpp @@ -50,6 +50,10 @@ static const TagNameItem tagData[] = { {ARMBuildAttrs::MPextension_use, "Tag_MPextension_use"}, {ARMBuildAttrs::DIV_use, "Tag_DIV_use"}, {ARMBuildAttrs::DSP_extension, "Tag_DSP_extension"}, + {ARMBuildAttrs::PAC_extension, "Tag_PAC_extension"}, + {ARMBuildAttrs::BTI_extension, "Tag_BTI_extension"}, + {ARMBuildAttrs::BTI_use, "Tag_BTI_use"}, + {ARMBuildAttrs::PACRET_use, "Tag_PACRET_use"}, {ARMBuildAttrs::nodefaults, "Tag_nodefaults"}, {ARMBuildAttrs::also_compatible_with, "Tag_also_compatible_with"}, {ARMBuildAttrs::T2EE_use, "Tag_T2EE_use"}, diff --git a/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp b/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp index e64934aa90cc..5b7004c86f5a 100644 --- a/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp +++ b/contrib/llvm-project/llvm/lib/Support/CommandLine.cpp @@ -2656,10 +2656,13 @@ cl::getRegisteredSubcommands() { void cl::HideUnrelatedOptions(cl::OptionCategory &Category, SubCommand &Sub) { initCommonOptions(); for (auto &I : Sub.OptionsMap) { + bool Unrelated = true; for (auto &Cat : I.second->Categories) { - if (Cat != &Category && Cat != &CommonOptions->GenericCategory) - I.second->setHiddenFlag(cl::ReallyHidden); + if (Cat == &Category || Cat == &CommonOptions->GenericCategory) + Unrelated = false; } + if (Unrelated) + 
I.second->setHiddenFlag(cl::ReallyHidden); } } @@ -2667,11 +2670,14 @@ void cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *> Categories, SubCommand &Sub) { initCommonOptions(); for (auto &I : Sub.OptionsMap) { + bool Unrelated = true; for (auto &Cat : I.second->Categories) { - if (!is_contained(Categories, Cat) && - Cat != &CommonOptions->GenericCategory) - I.second->setHiddenFlag(cl::ReallyHidden); + if (is_contained(Categories, Cat) || + Cat == &CommonOptions->GenericCategory) + Unrelated = false; } + if (Unrelated) + I.second->setHiddenFlag(cl::ReallyHidden); } } diff --git a/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp b/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp new file mode 100644 index 000000000000..68ba56d1fe50 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Support/HTTPClient.cpp @@ -0,0 +1,97 @@ +//===-- llvm/Support/HTTPClient.cpp - HTTP client library -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the methods of the HTTPRequest, HTTPClient, and +/// BufferedHTTPResponseHandler classes. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/HTTPClient.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; + +HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } + +bool operator==(const HTTPRequest &A, const HTTPRequest &B) { + return A.Url == B.Url && A.Method == B.Method && + A.FollowRedirects == B.FollowRedirects; +} + +HTTPResponseHandler::~HTTPResponseHandler() = default; + +static inline bool parseContentLengthHeader(StringRef LineRef, + size_t &ContentLength) { + // Content-Length is a mandatory header, and the only one we handle. + return LineRef.consume_front("Content-Length: ") && + to_integer(LineRef.trim(), ContentLength, 10); +} + +Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { + if (ResponseBuffer.Body) + return Error::success(); + + size_t ContentLength; + if (parseContentLengthHeader(HeaderLine, ContentLength)) + ResponseBuffer.Body = + WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); + + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { + if (!ResponseBuffer.Body) + return createStringError(errc::io_error, + "Unallocated response buffer. 
HTTP Body data " + "received before Content-Length header."); + if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) + return createStringError(errc::io_error, + "Content size exceeds buffer size."); + memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), + BodyChunk.size()); + Offset += BodyChunk.size(); + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { + ResponseBuffer.Code = Code; + return Error::success(); +} + +Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { + BufferedHTTPResponseHandler Handler; + if (Error Err = perform(Request, Handler)) + return std::move(Err); + return std::move(Handler.ResponseBuffer); +} + +Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { + HTTPRequest Request(Url); + return perform(Request); +} + +HTTPClient::HTTPClient() = default; + +HTTPClient::~HTTPClient() = default; + +bool HTTPClient::isAvailable() { return false; } + +void HTTPClient::cleanup() {} + +void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + llvm_unreachable("No HTTP Client implementation available."); +} diff --git a/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp b/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp index 90483817c302..554e3248524c 100644 --- a/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp +++ b/contrib/llvm-project/llvm/lib/Support/KnownBits.cpp @@ -421,11 +421,10 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, "Self multiplication knownbits mismatch"); // Compute a conservative estimate for high known-0 bits. - unsigned LeadZ = - std::max(LHS.countMinLeadingZeros() + RHS.countMinLeadingZeros(), - BitWidth) - - BitWidth; - LeadZ = std::min(LeadZ, BitWidth); + unsigned LHSLeadZ = LHS.countMinLeadingZeros(); + unsigned RHSLeadZ = RHS.countMinLeadingZeros(); + unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth; + assert(LeadZ <= BitWidth && "More zeros than bits?"); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and diff --git a/contrib/llvm-project/llvm/lib/Support/Regex.cpp b/contrib/llvm-project/llvm/lib/Support/Regex.cpp index 0d5cc1c00db1..7a804a1a2297 100644 --- a/contrib/llvm-project/llvm/lib/Support/Regex.cpp +++ b/contrib/llvm-project/llvm/lib/Support/Regex.cpp @@ -218,10 +218,10 @@ bool Regex::isLiteralERE(StringRef Str) { std::string Regex::escape(StringRef String) { std::string RegexStr; - for (unsigned i = 0, e = String.size(); i != e; ++i) { - if (strchr(RegexMetachars, String[i])) + for (char C : String) { + if (strchr(RegexMetachars, C)) RegexStr += '\\'; - RegexStr += String[i]; + RegexStr += C; } return RegexStr; diff --git a/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp b/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp index 8abf9f7ce0f1..5683d7005584 100644 --- a/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp +++ b/contrib/llvm-project/llvm/lib/Support/StringExtras.cpp @@ -60,8 +60,7 @@ void llvm::SplitString(StringRef Source, } void llvm::printEscapedString(StringRef Name, raw_ostream &Out) { - for (unsigned i = 0, e = Name.size(); i != e; ++i) { - unsigned char C = Name[i]; + for (unsigned char C : Name) { if (C == '\\') Out << '\\' << C; else if (isPrint(C) && C != '"') diff --git a/contrib/llvm-project/llvm/lib/Support/StringRef.cpp 
b/contrib/llvm-project/llvm/lib/Support/StringRef.cpp index c532a1abe906..652303fdb6a0 100644 --- a/contrib/llvm-project/llvm/lib/Support/StringRef.cpp +++ b/contrib/llvm-project/llvm/lib/Support/StringRef.cpp @@ -227,8 +227,8 @@ size_t StringRef::rfind_insensitive(StringRef Str) const { StringRef::size_type StringRef::find_first_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (CharBits.test((unsigned char)Data[i])) @@ -252,8 +252,8 @@ StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { StringRef::size_type StringRef::find_first_not_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length), e = Length; i != e; ++i) if (!CharBits.test((unsigned char)Data[i])) @@ -268,8 +268,8 @@ StringRef::size_type StringRef::find_first_not_of(StringRef Chars, StringRef::size_type StringRef::find_last_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0; i != Chars.size(); ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) if (CharBits.test((unsigned char)Data[i])) @@ -293,8 +293,8 @@ StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { StringRef::size_type StringRef::find_last_not_of(StringRef Chars, size_t From) const { std::bitset<1 << CHAR_BIT> CharBits; - for (size_type i = 0, e = Chars.size(); i != e; ++i) - CharBits.set((unsigned char)Chars[i]); + for (char C : Chars) + CharBits.set((unsigned char)C); for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) if (!CharBits.test((unsigned char)Data[i])) diff --git a/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp b/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp index 1dadce4b9040..4acc23dd455b 100644 --- a/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp +++ b/contrib/llvm-project/llvm/lib/Support/TargetParser.cpp @@ -333,3 +333,51 @@ bool getCPUFeaturesExceptStdExt(CPUKind Kind, } // namespace RISCV } // namespace llvm + +// Parse a branch protection specification, which has the form +// standard | none | [bti,pac-ret[+b-key,+leaf]*] +// Returns true on success, with individual elements of the specification +// returned in `PBP`. Returns false in error, with `Err` containing +// an erroneous part of the spec. 
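The grammar documented above and implemented in the hunk that follows maps a branch-protection string onto a scope, a signing key, and a BTI flag. A usage sketch with a few representative specs; the signature and field names come straight from the hunk, while the header spelling and the exact namespace of ParsedBranchProtection are assumptions about this tree:

    #include "llvm/Support/TargetParser.h"   // assumed declaration point of ARM::parseBranchProtection
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static void demo(StringRef Spec) {
      ARM::ParsedBranchProtection PBP;       // Scope / Key / BranchTargetEnforcement
      StringRef Err;
      if (!ARM::parseBranchProtection(Spec, PBP, Err)) {
        errs() << Spec << ": invalid part '" << Err << "'\n";
        return;
      }
      errs() << Spec << ": Scope=" << PBP.Scope << " Key=" << PBP.Key
             << " BTI=" << PBP.BranchTargetEnforcement << '\n';
    }

    int main() {
      demo("none");               // Scope=none     Key=a_key BTI=0 (the defaults)
      demo("standard");           // Scope=non-leaf Key=a_key BTI=1
      demo("bti+pac-ret");        // Scope=non-leaf Key=a_key BTI=1
      demo("pac-ret+leaf+b-key"); // Scope=all      Key=b_key BTI=0
      demo("pac-ret+retaa");      // rejected: "retaa" is not part of this grammar
      return 0;
    }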
+bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, + StringRef &Err) { + PBP = {"none", "a_key", false}; + if (Spec == "none") + return true; // defaults are ok + + if (Spec == "standard") { + PBP.Scope = "non-leaf"; + PBP.BranchTargetEnforcement = true; + return true; + } + + SmallVector<StringRef, 4> Opts; + Spec.split(Opts, "+"); + for (int I = 0, E = Opts.size(); I != E; ++I) { + StringRef Opt = Opts[I].trim(); + if (Opt == "bti") { + PBP.BranchTargetEnforcement = true; + continue; + } + if (Opt == "pac-ret") { + PBP.Scope = "non-leaf"; + for (; I + 1 != E; ++I) { + StringRef PACOpt = Opts[I + 1].trim(); + if (PACOpt == "leaf") + PBP.Scope = "all"; + else if (PACOpt == "b-key") + PBP.Key = "b_key"; + else + break; + } + continue; + } + if (Opt == "") + Err = "<empty>"; + else + Err = Opt; + return false; + } + + return true; +} diff --git a/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp b/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp index 81926d8071b2..c11e16d3cf98 100644 --- a/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp +++ b/contrib/llvm-project/llvm/lib/Support/ThreadPool.cpp @@ -29,7 +29,7 @@ ThreadPool::ThreadPool(ThreadPoolStrategy S) Threads.emplace_back([S, ThreadID, this] { S.apply_thread_strategy(ThreadID); while (true) { - PackagedTaskTy Task; + std::function<void()> Task; { std::unique_lock<std::mutex> LockGuard(QueueLock); // Wait for tasks to be pushed in the queue @@ -80,23 +80,6 @@ bool ThreadPool::isWorkerThread() const { return false; } -std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) { - /// Wrap the Task in a packaged_task to return a future object. - PackagedTaskTy PackagedTask(std::move(Task)); - auto Future = PackagedTask.get_future(); - { - // Lock the queue and push the new task - std::unique_lock<std::mutex> LockGuard(QueueLock); - - // Don't allow enqueueing after disabling the pool - assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); - - Tasks.push(std::move(PackagedTask)); - } - QueueCondition.notify_one(); - return Future.share(); -} - // The destructor joins all threads, waiting for completion. ThreadPool::~ThreadPool() { { @@ -128,16 +111,6 @@ void ThreadPool::wait() { } } -std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) { - // Get a Future with launch::deferred execution using std::async - auto Future = std::async(std::launch::deferred, std::move(Task)).share(); - // Wrap the future so that both ThreadPool::wait() can operate and the - // returned future can be sync'ed on. 
- PackagedTaskTy PackagedTask([Future]() { Future.get(); }); - Tasks.push(std::move(PackagedTask)); - return Future; -} - ThreadPool::~ThreadPool() { wait(); } #endif diff --git a/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp b/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp index 2acac63ce843..25079fe33edb 100644 --- a/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp +++ b/contrib/llvm-project/llvm/lib/TableGen/TGLexer.cpp @@ -1017,12 +1017,10 @@ void TGLexer::prepSkipToLineEnd() { } bool TGLexer::prepIsProcessingEnabled() { - for (auto I = PrepIncludeStack.back()->rbegin(), - E = PrepIncludeStack.back()->rend(); - I != E; ++I) { - if (!I->IsDefined) + for (const PreprocessorControlDesc &I : + llvm::reverse(*PrepIncludeStack.back())) + if (!I.IsDefined) return false; - } return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 9f527a17d390..aeebb49675b2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -818,18 +818,9 @@ void AArch64AsmPrinter::emitJumpTableInfo() { const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); if (JT.empty()) return; - const Function &F = MF->getFunction(); const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - bool JTInDiffSection = - !STI->isTargetCOFF() || - !TLOF.shouldPutJumpTableInFunctionSection( - MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32, - F); - if (JTInDiffSection) { - // Drop it in the readonly section. - MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM); - OutStreamer->SwitchSection(ReadOnlySec); - } + MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); + OutStreamer->SwitchSection(ReadOnlySec); auto AFI = MF->getInfo<AArch64FunctionInfo>(); for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index 533ab3b05de9..ff4a4dfc1b95 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -88,12 +88,9 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, // If this is already the flag setting version of the instruction (e.g., SUBS) // just make sure the implicit-def of NZCV isn't marked dead. 
if (IsFlagSetting) { - for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); - I != E; ++I) { - MachineOperand &MO = MI.getOperand(I); + for (MachineOperand &MO : MI.implicit_operands()) if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) MO.setIsDead(false); - } return &MI; } bool Is64Bit; @@ -104,8 +101,8 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpc), NewDestReg); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); return MIB; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 4c04e04a7d3c..ee6e670fe3cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -102,9 +102,8 @@ INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI) { const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; - ++i) { - const MachineOperand &MO = OldMI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) { assert(MO.isReg() && MO.getReg()); if (MO.isUse()) UseMI.add(MO); @@ -733,8 +732,9 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( MOP.getReg(), /*Def=*/false, /*Implicit=*/true)); RegMaskStartIdx++; } - for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx) - OriginalCall->addOperand(MI.getOperand(RegMaskStartIdx)); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), RegMaskStartIdx)) + OriginalCall->addOperand(MO); auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) .addReg(AArch64::FP, RegState::Define) diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6e9e61c8e7ac..72461aa1f772 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -890,7 +890,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ABS); setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -930,6 +929,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::VECREDUCE_ADD); setTargetDAGCombine(ISD::STEP_VECTOR); + setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::GlobalAddress); // In case of strict alignment, avoid an excessive number of byte wide stores. 
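Several AArch64 hunks above (the convertToFlagSetting and transferImpOps loops, and expandCALL_RVMARKER) swap manual index loops for llvm::drop_begin, which adapts any range so iteration skips its first N elements. A small standalone illustration:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      SmallVector<int, 8> Operands = {100, 101, 102, 103, 104};

      // Skip just the first element (drop_begin's default)...
      for (int Op : drop_begin(Operands))
        errs() << Op << ' ';                 // 101 102 103 104
      errs() << '\n';

      // ...or skip a computed prefix, the way transferImpOps skips the
      // Desc.getNumOperands() explicit operands.
      unsigned NumExplicit = 3;
      for (int Op : drop_begin(Operands, NumExplicit))
        errs() << Op << ' ';                 // 103 104
      errs() << '\n';
      return 0;
    }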
@@ -1323,6 +1324,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); @@ -1504,6 +1506,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { } } +bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, + EVT OpVT) const { + // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). + if (!Subtarget->hasSVE()) + return true; + + // We can only support legal predicate result types. + if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && + ResVT != MVT::nxv16i1) + return true; + + // The whilelo instruction only works with i32 or i64 scalar inputs. + if (OpVT != MVT::i32 && OpVT != MVT::i64) + return true; + + return false; +} + void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); @@ -1528,7 +1548,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setCondCodeAction(ISD::SETUNE, VT, Expand); } - // Mark integer truncating stores as having custom lowering + // Mark integer truncating stores/extending loads as having custom lowering if (VT.isInteger()) { MVT InnerVT = VT.changeVectorElementType(MVT::i8); while (InnerVT != VT) { @@ -1540,6 +1560,18 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { } } + // Mark floating-point truncating stores/extending loads as having custom + // lowering + if (VT.isFloatingPoint()) { + MVT InnerVT = VT.changeVectorElementType(MVT::f16); + while (InnerVT != VT) { + setTruncStoreAction(VT, InnerVT, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); + InnerVT = InnerVT.changeVectorElementType( + MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); + } + } + // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); @@ -1950,6 +1982,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) + MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) @@ -2316,6 +2349,8 @@ static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG); +static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, + EVT VT); /// isZerosVector - Check whether SDNode N is a zero-filled vector. 
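For context on the shouldExpandGetActiveLaneMask hook above and the direct whilelo selection that follows, here is a scalar model of what llvm.get.active.lane.mask(base, n) computes. The helper name is invented for illustration and the code is not part of the patch; SVE's WHILELO produces this predicate in a single instruction, which is why the hook only asks for expansion when the result is not a legal SVE predicate type or the scalar inputs are not i32/i64.

#include <cassert>
#include <cstdint>
#include <vector>

// Lane i of get.active.lane.mask(Base, N) is true iff Base + i < N, with the
// comparison treated as unsigned and the addition assumed not to wrap.
static std::vector<bool> activeLaneMask(uint64_t Base, uint64_t N,
                                        unsigned NumLanes) {
  std::vector<bool> Mask(NumLanes);
  for (unsigned I = 0; I != NumLanes; ++I)
    Mask[I] = Base + I < N;
  return Mask;
}

int main() {
  // With a trip count of 10 and four lanes, the final iteration starting at
  // Base = 8 enables only the first two lanes.
  std::vector<bool> Tail = activeLaneMask(8, 10, 4);
  assert(Tail[0] && Tail[1] && !Tail[2] && !Tail[3]);
  return 0;
}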
static bool isZerosVector(const SDNode *N) { @@ -4288,6 +4323,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } + case Intrinsic::get_active_lane_mask: { + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID, + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -4506,7 +4547,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, } InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } @@ -4618,7 +4659,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } else if (VT.isFloatingPoint()) { // Handle FP data by casting the data so an integer scatter can be used. @@ -10963,8 +11004,40 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, return SDValue(); } +static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { + if (Op.getOpcode() != AArch64ISD::DUP && + Op.getOpcode() != ISD::SPLAT_VECTOR && + Op.getOpcode() != ISD::BUILD_VECTOR) + return false; + + if (Op.getOpcode() == ISD::BUILD_VECTOR && + !isAllConstantBuildVector(Op, SplatVal)) + return false; + + if (Op.getOpcode() != ISD::BUILD_VECTOR && + !isa<ConstantSDNode>(Op->getOperand(0))) + return false; + + SplatVal = Op->getConstantOperandVal(0); + if (Op.getValueType().getVectorElementType() != MVT::i64) + SplatVal = (int32_t)SplatVal; + + Negated = false; + if (isPowerOf2_64(SplatVal)) + return true; + + Negated = true; + if (isPowerOf2_64(-SplatVal)) { + SplatVal = -SplatVal; + return true; + } + + return false; +} + SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + SDLoc dl(Op); if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); @@ -10974,6 +11047,19 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + bool Negated; + uint64_t SplatVal; + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + SDValue Pg = getPredicateForScalableVector(DAG, dl, VT); + SDValue Res = + DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0), + DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32)); + if (Negated) + Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); + + return Res; + } + if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) return LowerToPredicatedOp(Op, DAG, PredOpcode); @@ -10987,7 +11073,6 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { else llvm_unreachable("Unexpected Custom DIV operation"); - SDLoc dl(Op); unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; unsigned UnpkHi = Signed ? 
AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); @@ -11924,6 +12009,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2); } +static bool isSplatShuffle(Value *V) { + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) + return is_splat(Shuf->getShuffleMask()); + return false; +} + /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). @@ -11934,12 +12025,24 @@ bool AArch64TargetLowering::shouldSinkOperands( if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { + case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: - if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) - return false; - Ops.push_back(&II->getOperandUse(0)); - Ops.push_back(&II->getOperandUse(1)); - return true; + if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) { + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; + } + LLVM_FALLTHROUGH; + + case Intrinsic::aarch64_neon_sqdmull: + case Intrinsic::aarch64_neon_sqdmulh: + case Intrinsic::aarch64_neon_sqrdmulh: + // Sink splats for index lane variants + if (isSplatShuffle(II->getOperand(0))) + Ops.push_back(&II->getOperandUse(0)); + if (isSplatShuffle(II->getOperand(1))) + Ops.push_back(&II->getOperandUse(1)); + return !Ops.empty(); case Intrinsic::aarch64_neon_pmull64: if (!areOperandsOfVmullHighP64(II->getArgOperand(0), @@ -12961,8 +13064,14 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV - // fold (sdiv X, pow2) EVT VT = N->getValueType(0); + + // For scalable and fixed types, mark them as cheap so we can handle it much + // later. This allows us to handle larger than legal types. + if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) + return SDValue(N, 0); + + // fold (sdiv X, pow2) if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) return SDValue(); @@ -13858,34 +13967,6 @@ static SDValue performANDCombine(SDNode *N, return SDValue(); } -static SDValue performSRLCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the - // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) - // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 
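The srl(bswap) combine being removed here, and the equivalent top16Zero/top32Zero isel patterns added to AArch64InstrInfo.td further down, rest on a simple identity: once the high half of x is known zero, shifting bswap(x) right by half the width, rotating it by the same amount, and byte-reversing each half of x all give the same value, so the srl can be matched straight to REV16/REV32. A small self-checking sketch of the 32-bit case follows; the helper names are made up for illustration and __builtin_bswap32 assumes a GCC/Clang-style compiler.

#include <cassert>
#include <cstdint>

// REV16 (32-bit form): byte-swap each 16-bit half of the register.
static uint32_t rev16(uint32_t X) {
  return ((X & 0x00FF00FFu) << 8) | ((X & 0xFF00FF00u) >> 8);
}

static uint32_t rotr32(uint32_t X, unsigned R) {
  return (X >> R) | (X << (32 - R));
}

int main() {
  // For any X whose top 16 bits are zero, the three forms agree, which is
  // what lets (srl (bswap X), 16) be selected directly as REV16.
  for (uint32_t X = 0; X <= 0xFFFFu; X += 0x101u) {
    uint32_t Swapped = __builtin_bswap32(X);
    assert((Swapped >> 16) == rotr32(Swapped, 16));
    assert((Swapped >> 16) == rev16(X));
  }
  return 0;
}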
- SDValue N0 = N->getOperand(0); - if (N0.getOpcode() == ISD::BSWAP) { - SDLoc DL(N); - SDValue N1 = N->getOperand(1); - SDValue N00 = N0.getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { - uint64_t ShiftAmt = C->getZExtValue(); - if (VT == MVT::i32 && ShiftAmt == 16 && - DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); - if (VT == MVT::i64 && ShiftAmt == 32 && - DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) - return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); - } - } - return SDValue(); -} - // Attempt to form urhadd(OpA, OpB) from // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). @@ -14031,6 +14112,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); + if (VT.isScalableVector()) + return SDValue(); + // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), @@ -15089,6 +15173,9 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_sve_uqsub_x: return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_sve_asrd: + return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), N->getOperand(3)); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), @@ -15883,6 +15970,22 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { + StoreSDNode *ST = cast<StoreSDNode>(N); + SDValue Chain = ST->getChain(); + SDValue Value = ST->getValue(); + SDValue Ptr = ST->getBasePtr(); + + // If this is an FP_ROUND followed by a store, fold this into a truncating + // store. We can do this even if this is already a truncstore. + // We purposefully don't care about legality of the nodes here as we know + // they can be split down into something legal. + if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && + Value.getNode()->hasOneUse() && ST->isUnindexed() && + Subtarget->useSVEForFixedLengthVectors() && + Value.getValueType().isFixedLengthVector()) + return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + ST->getMemoryVT(), ST->getMemOperand()); + if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; @@ -17225,6 +17328,37 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getBitcast(Ty, Trunc); } +SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. + if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) + return SDValue(); + + // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) + // We purposefully don't care about legality of the nodes here as we know + // they can be split down into something legal. 
+ if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && + N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && + VT.isFixedLengthVector()) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, + LN0->getChain(), LN0->getBasePtr(), + N0.getValueType(), LN0->getMemOperand()); + DCI.CombineTo(N, ExtLoad); + DCI.CombineTo(N0.getNode(), + DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), + ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))), + ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + + return SDValue(); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17253,8 +17387,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performORCombine(N, DCI, Subtarget); case ISD::AND: return performANDCombine(N, DCI); - case ISD::SRL: - return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -17283,6 +17415,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performSTORECombine(N, DCI, DAG, Subtarget); case ISD::VECTOR_SPLICE: return performSVESpliceCombine(N, DAG); + case ISD::FP_EXTEND: + return performFPExtendCombine(N, DAG, DCI, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: @@ -18414,6 +18548,15 @@ bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } +bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + // v8f16 without fp16 need to be extended to v8f32, which is more difficult to + // legalize. + if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16()) + return false; + return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); +} + bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } @@ -18591,12 +18734,29 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( SDLoc DL(Op); EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT LoadVT = ContainerVT; + EVT MemVT = Load->getMemoryVT(); + + auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); + + if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { + LoadVT = ContainerVT.changeTypeToInteger(); + MemVT = MemVT.changeTypeToInteger(); + } auto NewLoad = DAG.getMaskedLoad( - ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), - getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT), - Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), - Load->getExtensionType()); + LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, + DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), + Load->getAddressingMode(), Load->getExtensionType()); + + if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { + EVT ExtendVT = ContainerVT.changeVectorElementType( + Load->getMemoryVT().getVectorElementType()); + + NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG); + NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, + Pg, NewLoad, DAG.getUNDEF(ContainerVT)); + } auto Result = convertFromScalableVector(DAG, VT, NewLoad); SDValue MergedValues[2] = {Result, Load->getChain()}; @@ -18609,12 +18769,15 @@ static SDValue convertFixedMaskToScalableVector(SDValue 
Mask, EVT InVT = Mask.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Pg; + auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); auto Op2 = DAG.getConstant(0, DL, ContainerVT); - auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); - EVT CmpVT = Pg.getValueType(); - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); } @@ -18668,13 +18831,26 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT MemVT = Store->getMemoryVT(); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); - return DAG.getMaskedStore( - Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(), - getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(), - Store->getMemOperand(), Store->getAddressingMode(), - Store->isTruncatingStore()); + + if (VT.isFloatingPoint() && Store->isTruncatingStore()) { + EVT TruncVT = ContainerVT.changeVectorElementType( + Store->getMemoryVT().getVectorElementType()); + MemVT = MemVT.changeTypeToInteger(); + NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg, + NewValue, DAG.getTargetConstant(0, DL, MVT::i64), + DAG.getUNDEF(TruncVT)); + NewValue = + getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG); + } + + return DAG.getMaskedStore(Store->getChain(), DL, NewValue, + Store->getBasePtr(), Store->getOffset(), Pg, MemVT, + Store->getMemOperand(), Store->getAddressingMode(), + Store->isTruncatingStore()); } SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( @@ -18706,6 +18882,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( bool Signed = Op.getOpcode() == ISD::SDIV; unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + bool Negated; + uint64_t SplatVal; + if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); + SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32); + + SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT); + SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2); + if (Negated) + Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res); + + return convertFromScalableVector(DAG, VT, Res); + } + // Scalable vector i32/i64 DIV is supported. if (EltVT == MVT::i32 || EltVT == MVT::i64) return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 392e22b68366..ea884cdccd28 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -104,6 +104,8 @@ enum NodeType : unsigned { // Unpredicated vector instructions BIC, + SRAD_MERGE_OP1, + // Predicated instructions with the result of inactive lanes provided by the // last operand. 
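The new isPow2Splat path in LowerDIV and LowerFixedLengthVectorIntDivideToSVE above maps a signed divide by plus or minus 2^k onto AArch64ISD::SRAD_MERGE_OP1, i.e. SVE's ASRD, an arithmetic shift right that rounds toward zero like C signed division; a negative power-of-two divisor just negates the result afterwards. A scalar model of that rounding behavior follows (the helper name is invented for illustration, and the sketch assumes the usual arithmetic right shift of negative values).

#include <cassert>
#include <cstdint>

// Divide X by 2^Shift, rounding toward zero: add 2^Shift - 1 before shifting
// when X is negative. This is the behavior the SRAD lowering relies on.
static int64_t asrd(int64_t X, unsigned Shift) {
  int64_t Bias = (X >> 63) & ((int64_t(1) << Shift) - 1);
  return (X + Bias) >> Shift;
}

int main() {
  assert(asrd(7, 2) == 7 / 4);      //  1
  assert(asrd(-7, 2) == -7 / 4);    // -1, not -2: rounds toward zero
  assert(-asrd(-7, 2) == -7 / -4);  //  1: divisor -4 is divide by 4, then negate
  return 0;
}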
FABS_MERGE_PASSTHRU, @@ -774,6 +776,8 @@ public: bool preferIncOfAddToSubOfNot(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; @@ -842,6 +846,8 @@ public: EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown = false) const override; + bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td index db8e0c5dac4a..decee117d2d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -437,6 +437,18 @@ def non_temporal_store : cast<MaskedStoreSDNode>(N)->isNonTemporal(); }]>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 GPR32:$src), [{ + return SDValue(N,0)->getValueType(0) == MVT::i32 && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + +// top32Zero - answer true if the upper 32 bits of $src are 0, false otherwise +def top32Zero: PatLeaf<(i64 GPR64:$src), [{ + return SDValue(N,0)->getValueType(0) == MVT::i64 && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 32)); + }]>; + // Node definitions. def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>; @@ -2046,6 +2058,10 @@ def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>; def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; +// Match (srl (bswap x), C) -> revC if the upper bswap bits are known zero. +def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; +def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; + //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. 
//===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 67d8fbb45cf5..25d53f4ab065 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -199,6 +199,13 @@ def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>; +def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3,i32>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2> +]>; + +def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>; + def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>, SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4> @@ -1575,7 +1582,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>; defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", AArch64asrd_m1>; defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; @@ -1586,7 +1593,7 @@ let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; - defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>; } // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos let Predicates = [HasSVEorStreamingSVE] in { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 63d6fa5bbb26..34015d2dbd49 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -833,17 +833,12 @@ static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); }; - // The OpMultiplier variable should always point to the dup (if any), so - // swap if necessary. 
- if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand)) - std::swap(OpMultiplier, OpMultiplicand); - if (IsUnitSplat(OpMultiplier)) { - // [f]mul pg (dupx 1) %n => %n + // [f]mul pg %n, (dupx 1) => %n OpMultiplicand->takeName(&II); return IC.replaceInstUsesWith(II, OpMultiplicand); } else if (IsUnitDup(OpMultiplier)) { - // [f]mul pg (dup pg 1) %n => %n + // [f]mul pg %n, (dup pg 1) => %n auto *DupInst = cast<IntrinsicInst>(OpMultiplier); auto *DupPg = DupInst->getOperand(1); // TODO: this is naive. The optimization is still valid if DupPg @@ -2142,6 +2137,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::SelectICmp: case RecurKind::SelectFCmp: + case RecurKind::FMulAdd: return true; default: return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index e090d87d59a2..3d9a626d3ac3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1920,35 +1920,6 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { - case TargetOpcode::G_SHL: - case TargetOpcode::G_ASHR: - case TargetOpcode::G_LSHR: { - // These shifts are legalized to have 64 bit shift amounts because we want - // to take advantage of the existing imported selection patterns that assume - // the immediates are s64s. However, if the shifted type is 32 bits and for - // some reason we receive input GMIR that has an s64 shift amount that's not - // a G_CONSTANT, insert a truncate so that we can still select the s32 - // register-register variant. - Register SrcReg = I.getOperand(1).getReg(); - Register ShiftReg = I.getOperand(2).getReg(); - const LLT ShiftTy = MRI.getType(ShiftReg); - const LLT SrcTy = MRI.getType(SrcReg); - if (SrcTy.isVector()) - return false; - assert(!ShiftTy.isVector() && "unexpected vector shift ty"); - if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64) - return false; - auto *AmtMI = MRI.getVRegDef(ShiftReg); - assert(AmtMI && "could not find a vreg definition for shift amount"); - if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) { - // Insert a subregister copy to implement a 64->32 trunc - auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) - .addReg(ShiftReg, 0, AArch64::sub_32); - MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); - I.getOperand(2).setReg(Trunc.getReg(0)); - } - return true; - } case TargetOpcode::G_STORE: { bool Changed = contractCrossBankCopyIntoStore(I, MRI); MachineOperand &SrcOp = I.getOperand(0); @@ -2950,6 +2921,28 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (Opcode == TargetOpcode::G_SHL && MRI.getType(I.getOperand(0).getReg()).isVector()) return selectVectorSHL(I, MRI); + + // These shifts were legalized to have 64 bit shift amounts because we + // want to take advantage of the selection patterns that assume the + // immediates are s64s, however, selectBinaryOp will assume both operands + // will have the same bit size. 
+ { + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); + const LLT ShiftTy = MRI.getType(ShiftReg); + const LLT SrcTy = MRI.getType(SrcReg); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + ShiftTy.getSizeInBits() == 64) { + assert(!ShiftTy.isVector() && "unexpected vector shift ty"); + assert(MRI.getVRegDef(ShiftReg) && + "could not find a vreg definition for shift amount"); + // Insert a subregister copy to implement a 64->32 trunc + auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) + .addReg(ShiftReg, 0, AArch64::sub_32); + MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(2).setReg(Trunc.getReg(0)); + } + } LLVM_FALLTHROUGH; case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: @@ -6452,8 +6445,7 @@ static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder MIB(MI); // Go through each operand and ensure it has the same regbank. - for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { - MachineOperand &MO = MI.getOperand(OpIdx); + for (MachineOperand &MO : llvm::drop_begin(MI.operands())) { if (!MO.isReg()) continue; Register OpReg = MO.getReg(); @@ -6511,8 +6503,7 @@ void AArch64InstructionSelector::processPHIs(MachineFunction &MF) { // %endbb: // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2 bool HasGPROp = false, HasFPROp = false; - for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) { - const auto &MO = MI->getOperand(OpIdx); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) { if (!MO.isReg()) continue; const LLT &Ty = MRI.getType(MO.getReg()); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index f2a470857d21..78c0e90b1384 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -177,8 +177,8 @@ public: // We can't just use EmitIntValue here, as that will emit a data mapping // symbol, and swap the endianness on big-endian systems (instructions are // always little-endian). - for (unsigned I = 0; I < 4; ++I) { - Buffer[I] = uint8_t(Inst); + for (char &C : Buffer) { + C = uint8_t(Inst); Inst >>= 8; } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index cf1a60643efd..92552c3d41d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -101,8 +101,8 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) { // We can't just use EmitIntValue here, as that will swap the // endianness on big-endian systems (instructions are always // little-endian). 
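The two emitInst loops rewritten here decompose the 32-bit instruction word byte by byte precisely so the encoding stays little-endian even on a big-endian host. A standalone sketch of that decomposition, with an invented helper name mirroring the loop shown in the hunks:

#include <cassert>
#include <cstdint>

// Write Inst into Buffer lowest byte first, regardless of host byte order.
static void encodeInstLE(uint32_t Inst, char (&Buffer)[4]) {
  for (char &C : Buffer) {
    C = static_cast<char>(static_cast<uint8_t>(Inst));
    Inst >>= 8;
  }
}

int main() {
  char Buf[4];
  encodeInstLE(0xD503201F, Buf); // AArch64 NOP
  const uint8_t Expected[4] = {0x1F, 0x20, 0x03, 0xD5};
  for (int I = 0; I != 4; ++I)
    assert(static_cast<uint8_t>(Buf[I]) == Expected[I]);
  return 0;
}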
- for (unsigned I = 0; I < 4; ++I) { - Buffer[I] = uint8_t(Inst); + for (char &C : Buffer) { + C = uint8_t(Inst); Inst >>= 8; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index df2f9a0fa3a9..c7c5ff7bcbe7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -26,6 +26,14 @@ def uchar_to_float : GICombineRule< [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]), (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>; + +def rcp_sqrt_to_rsq : GICombineRule< + (defs root:$rcp, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp, + [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; + + def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">; def cvt_f32_ubyteN : GICombineRule< @@ -86,7 +94,8 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, - uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> { + uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, + rcp_sqrt_to_rsq]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index cee56ee97294..8236e6672247 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -654,6 +654,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectMAD_64_32(N); return; } + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return SelectMUL_LOHI(N); case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); @@ -719,6 +722,18 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { Term->getMetadata("structurizecfg.uniform"); } +bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N, + unsigned ShAmtBits) const { + assert(N->getOpcode() == ISD::AND); + + const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); + if (RHS.countTrailingOnes() >= ShAmtBits) + return true; + + const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero; + return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits; +} + static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1) { if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST && @@ -1001,6 +1016,32 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { + SDLoc SL(N); + bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + unsigned Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + + SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); + SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); + SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops); + if (!SDValue(N, 0).use_empty()) { + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); + SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, + MVT::i32, SDValue(Mad, 0), Sub0); + ReplaceUses(SDValue(N, 0), SDValue(Lo, 0)); + } + if (!SDValue(N, 1).use_empty()) { + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32); + SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, + MVT::i32, SDValue(Mad, 0), Sub1); + ReplaceUses(SDValue(N, 1), SDValue(Hi, 0)); + } + CurDAG->RemoveDeadNode(N); +} + bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const { if (!isUInt<16>(Offset)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index c1d9673f067e..d638d9877a9b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -136,6 +136,10 @@ private: bool isUniformLoad(const SDNode *N) const; bool isUniformBr(const SDNode *N) const; + // Returns true if ISD::AND SDNode `N`'s masking of the shift amount operand's + // `ShAmtBits` bits is unneeded. + bool isUnneededShiftMask(const SDNode *N, unsigned ShAmtBits) const; + bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS, SDValue &RHS) const; @@ -231,6 +235,7 @@ private: void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); void SelectMAD_64_32(SDNode *N); + void SelectMUL_LOHI(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 523fa2d3724b..54177564afbc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -594,6 +594,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SMUL_LOHI); + setTargetDAGCombine(ISD::UMUL_LOHI); setTargetDAGCombine(ISD::MULHU); setTargetDAGCombine(ISD::MULHS); setTargetDAGCombine(ISD::SELECT); @@ -3462,6 +3464,50 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +SDValue +AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // SimplifyDemandedBits has the annoying habit of turning useful zero_extends + // in the source into any_extends if the result of the mul is truncated. Since + // we can assume the high bits are whatever we want, use the underlying value + // to avoid the unknown high bits from interfering. 
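The performMulLoHiCombine added here turns a 32-bit smul_lohi/umul_lohi into the MUL_U24/MULHI_U24 pair (or the signed equivalents) when both inputs are known to fit in 24 bits: the full product then fits in 48 bits, so, as I read the combine, the two 24-bit multiply results are exactly the low and high 32-bit halves of the wide product. A quick scalar check of that bound, written for this note:

#include <cassert>
#include <cstdint>

int main() {
  // Both operands fit in 24 bits, so the full product fits in 48 bits:
  // (2^24 - 1)^2 < 2^48.
  uint32_t A = 0x00FFFFFF, B = 0x00ABCDEF;
  uint64_t Full = uint64_t(A) * B;
  uint32_t Lo = uint32_t(Full);        // low half, as MUL_U24 would produce
  uint32_t Hi = uint32_t(Full >> 32);  // high half, as MULHI_U24 would produce
  assert(Full < (uint64_t(1) << 48));
  assert(((uint64_t(Hi) << 32) | Lo) == Full);
  return 0;
}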
+ if (N0.getOpcode() == ISD::ANY_EXTEND) + N0 = N0.getOperand(0); + if (N1.getOpcode() == ISD::ANY_EXTEND) + N1 = N1.getOperand(0); + + // Try to use two fast 24-bit multiplies (one for each half of the result) + // instead of one slow extending multiply. + unsigned LoOpcode, HiOpcode; + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_U24; + HiOpcode = AMDGPUISD::MULHI_U24; + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_I24; + HiOpcode = AMDGPUISD::MULHI_I24; + } else { + return SDValue(); + } + + SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); + SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); + DCI.CombineTo(N, Lo, Hi); + return SDValue(N, 0); +} + SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); @@ -4103,6 +4149,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performTruncateCombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI); + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return performMulLoHiCombine(N, DCI); case ISD::MULHS: return performMulhsCombine(N, DCI); case ISD::MULHU: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 03632ac18598..daaca8737c5d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -91,6 +91,7 @@ protected: SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 28cb2fc57ac7..e16bead81b65 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3880,6 +3880,22 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, return KnownBits->signBitIsZero(Base); } +bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, + unsigned ShAmtBits) const { + assert(MI.getOpcode() == TargetOpcode::G_AND); + + Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI); + if (!RHS) + return false; + + if (RHS->countTrailingOnes() >= ShAmtBits) + return true; + + const APInt &LHSKnownZeros = + KnownBits->getKnownZeroes(MI.getOperand(1).getReg()); + return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 
b70e6883bae2..26996e42af53 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -300,6 +300,10 @@ private: bool isInlineImmediate64(int64_t Imm) const; bool isInlineImmediate(const APFloat &Imm) const; + // Returns true if TargetOpcode::G_AND MachineInstr `MI`'s masking of the + // shift amount operand's `ShAmtBits` bits is unneeded. + bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bad9f6265b36..0528b552f475 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -242,25 +242,41 @@ def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>; //===----------------------------------------------------------------------===// // Constrained shift PatFrags. + +def csh_mask_16 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 4); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 4); }]; + } + +def csh_mask_32 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 5); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 5); }]; + } + +def csh_mask_64 : PatFrag<(ops node:$src0), (and node:$src0, imm), + [{ return isUnneededShiftMask(N, 6); }]> { + let GISelPredicateCode = [{ return isUnneededShiftMask(MI, 6); }]; + } + foreach width = [16, 32, 64] in { -defvar mask = !sub(width, 1); +defvar csh_mask = !cast<SDPatternOperator>("csh_mask_"#width); def cshl_#width : PatFrags<(ops node:$src0, node:$src1), - [(shl node:$src0, node:$src1), (shl node:$src0, (and node:$src1, mask))]>; + [(shl node:$src0, node:$src1), (shl node:$src0, (csh_mask node:$src1))]>; defvar cshl = !cast<SDPatternOperator>("cshl_"#width); def cshl_#width#_oneuse : HasOneUseBinOp<cshl>; def clshl_rev_#width : PatFrag <(ops node:$src0, node:$src1), (cshl $src1, $src0)>; def csrl_#width : PatFrags<(ops node:$src0, node:$src1), - [(srl node:$src0, node:$src1), (srl node:$src0, (and node:$src1, mask))]>; + [(srl node:$src0, node:$src1), (srl node:$src0, (csh_mask node:$src1))]>; defvar csrl = !cast<SDPatternOperator>("csrl_"#width); def csrl_#width#_oneuse : HasOneUseBinOp<csrl>; def clshr_rev_#width : PatFrag <(ops node:$src0, node:$src1), (csrl $src1, $src0)>; def csra_#width : PatFrags<(ops node:$src0, node:$src1), - [(sra node:$src0, node:$src1), (sra node:$src0, (and node:$src1, mask))]>; + [(sra node:$src0, node:$src1), (sra node:$src0, (csh_mask node:$src1))]>; defvar csra = !cast<SDPatternOperator>("csra_"#width); def csra_#width#_oneuse : HasOneUseBinOp<csra>; def cashr_rev_#width : PatFrag <(ops node:$src0, node:$src1), @@ -696,11 +712,6 @@ class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < (RcpInst $src) >; -class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) ->; - // Instructions which select to the same v_min_f* def fminnum_like : PatFrags<(ops node:$src0, node:$src1), [(fminnum_ieee node:$src0, node:$src1), diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 
fc984d2dda64..1479933a2850 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" @@ -58,6 +59,9 @@ public: bool matchUCharToFloat(MachineInstr &MI); void applyUCharToFloat(MachineInstr &MI); + bool matchRcpSqrtToRsq(MachineInstr &MI, + std::function<void(MachineIRBuilder &)> &MatchInfo); + // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. struct CvtF32UByteMatchInfo { @@ -203,6 +207,48 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) { MI.eraseFromParent(); } +bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + + auto getRcpSrc = [=](const MachineInstr &MI) { + MachineInstr *ResMI = nullptr; + if (MI.getOpcode() == TargetOpcode::G_INTRINSIC && + MI.getIntrinsicID() == Intrinsic::amdgcn_rcp) + ResMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + + return ResMI; + }; + + auto getSqrtSrc = [=](const MachineInstr &MI) { + MachineInstr *SqrtSrcMI = nullptr; + mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); + return SqrtSrcMI; + }; + + MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr; + // rcp(sqrt(x)) + if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) { + MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) { + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + .addUse(SqrtSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); + }; + return true; + } + + // sqrt(rcp(x)) + if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) { + MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) { + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + .addUse(RcpSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); + }; + return true; + } + + return false; +} + bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN( MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) { Register SrcReg = MI.getOperand(1).getReg(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index d560d2043f42..7c4eb71882c7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -280,10 +280,10 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { } LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() << '\n'); - for (size_t I = 0; I < Str.size(); ++I) { + for (char C : Str) { // Rest of the C escape sequences (e.g. 
\') are handled correctly // by the MDParser - switch (Str[I]) { + switch (C) { case '\a': Sizes << "\\a"; break; @@ -308,7 +308,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { Sizes << "\\72"; break; default: - Sizes << Str[I]; + Sizes << C; break; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index d7dc9ee4117b..12b5830ef930 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -45,6 +45,7 @@ public: TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); + Register getAsVgpr(Register Reg); struct MinMaxMedOpc { unsigned Min, Max, Med; @@ -69,6 +70,23 @@ bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; } +Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) { + if (isVgprRegBank(Reg)) + return Reg; + + // Search for existing copy of Reg to vgpr. + for (MachineInstr &Use : MRI.use_instructions(Reg)) { + Register Def = Use.getOperand(0).getReg(); + if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def)) + return Def; + } + + // Copy Reg to vgpr. + Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); + MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + return VgprReg; +} + AMDGPURegBankCombinerHelper::MinMaxMedOpc AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { switch (Opc) { @@ -134,7 +152,9 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, - {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags()); + {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1), + getAsVgpr(MatchInfo.Val2)}, + MI.getFlags()); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index ab3ce980c3f6..5988403c0a29 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3189,10 +3189,10 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, const MachineInstr &MI) const { unsigned RegBank = AMDGPU::InvalidRegBankID; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { RegBank = regBankUnion(RegBank, Bank->getID()); if (RegBank == AMDGPU::VGPRRegBankID) @@ -3206,10 +3206,10 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { - if (!MI.getOperand(i).isReg()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { if (Bank->getID() != 
AMDGPU::SGPRRegBankID) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 1a9255f3240f..712f6dece911 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -706,9 +706,7 @@ bool AMDGPUCFGStructurizer::prepare() { // Remove unconditional branch instr. // Add dummy exit block iff there are multiple returns. - for (SmallVectorImpl<MachineBasicBlock *>::const_iterator - It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; + for (MachineBasicBlock *MBB : OrderedBlks) { removeUnconditionalBranch(MBB); removeRedundantConditionalBranch(MBB); if (isReturnBlock(MBB)) { @@ -851,14 +849,10 @@ bool AMDGPUCFGStructurizer::run() { void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; - MachineBasicBlock *MBB; for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { const std::vector<MachineBasicBlock *> &SccNext = *It; - for (std::vector<MachineBasicBlock *>::const_iterator - blockIter = SccNext.begin(), blockEnd = SccNext.end(); - blockIter != blockEnd; ++blockIter) { - MBB = *blockIter; + for (MachineBasicBlock *MBB : SccNext) { OrderedBlks.push_back(MBB); recordSccnum(MBB, SccNum); } @@ -1601,11 +1595,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( FuncRep->push_back(DummyExitBlk); //insert to function insertInstrEnd(DummyExitBlk, R600::RETURN); - for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), - E = RetMBB.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - MachineInstr *MI = getReturnInstr(MBB); - if (MI) + for (MachineBasicBlock *MBB : RetMBB) { + if (MachineInstr *MI = getReturnInstr(MBB)) MI->eraseFromParent(); MBB->addSuccessor(DummyExitBlk); LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td index f4ddbf1131c3..d18dab0554bd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -48,8 +48,6 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 -def : RsqPat<RECIPSQRT_IEEE_cm, f32>; - def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 12224cb3f797..a9a3421e8192 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -126,7 +126,6 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -def : RsqPat<RECIPSQRT_IEEE_eg, f32>; def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>; def SIN_eg : SIN_Common<0x8D>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ff5d0b0af6a4..0f8dd0b3bf58 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1442,12 +1442,10 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { bool FullReg; const MachineInstr *MI1; - auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1, - this](const MachineInstr &MI) { + auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1, + this](const MachineInstr &MI) { if (!IsMFMAFn(MI)) return false; - if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) - return false; Register DstReg = MI.getOperand(0).getReg(); FullReg = (DstReg == Reg); MI1 = &MI; @@ -1458,8 +1456,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); - int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn, - MaxWaitStates); + int NumWaitStates = + getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates); if (NumWaitStates == std::numeric_limits<int>::max()) continue; @@ -1619,12 +1617,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const MachineInstr *MFMA = nullptr; unsigned Reg; - auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA, - this](const MachineInstr &MI) { + auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) { if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) return false; - if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI)) - return false; MFMA = &MI; return true; }; @@ -1675,8 +1670,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } MFMA = nullptr; - WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, - MaxWaitStates); + WaitStatesSinceDef = + getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); if (!MFMA) continue; @@ -1750,8 +1745,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { WaitStatesSinceDef); MFMA = nullptr; - WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, - MaxWaitStates); + WaitStatesSinceDef = + getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates); if (MFMA) { int NeedWaitStates = MaxWaitStates; switch (TSchedModel.computeInstrLatency(MFMA)) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 3456f9a6156c..82c09378acac 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -74,11 +74,11 @@ unsigned GCNRegPressure::getRegKind(Register Reg, assert(Reg.isVirtual()); const auto RC = MRI.getRegClass(Reg); auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); - return STI->isSGPRClass(RC) ? - (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : - STI->hasAGPRs(RC) ? - (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) : - (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); + return STI->isSGPRClass(RC) + ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) + : STI->isAGPRClass(RC) + ? (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) + : (STI->getRegSizeInBits(*RC) == 32 ? 
VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 1d93165f9eec..715fd69fc7ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -177,9 +177,7 @@ bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); TII = ST.getInstrInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator LatestCFAlu = E; while (I != E) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index d5eaa33ef964..b9ca7f928d56 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -301,9 +301,7 @@ public: const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); TII = ST.getInstrInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(); if (I != MBB.end() && I->getOpcode() == R600::CF_ALU) continue; // BB was already parsed diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 838a497b4df1..194879fef53c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -73,9 +73,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I = MBB.begin(); while (I != MBB.end()) { MachineInstr &MI = *I; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 0215eb9f9fea..bd757e9e3d70 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -285,9 +285,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); - for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { - NewMI.add(MI.getOperand(i)); - } + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + NewMI.add(MO); } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td index 4487864888b6..b3da2fdefacc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1265,7 +1265,6 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; 
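The C++ hunks above (and several that follow) replace hand-rolled iterator loops with range-based for loops, using llvm::make_early_inc_range where instructions are erased mid-walk and llvm::drop_begin to skip the first operand. A minimal, self-contained sketch of these idioms, assuming nothing beyond core LLVM CodeGen headers (the function names and the isKill() filter are illustrative only, not taken from the patch):

#include "llvm/ADT/STLExtras.h"            // make_early_inc_range, drop_begin
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

static void eraseKills(llvm::MachineFunction &MF) {
  for (llvm::MachineBasicBlock &MBB : MF) {          // was: explicit begin()/end() iterators
    // make_early_inc_range advances the iterator before the body runs,
    // so erasing the current instruction is safe.
    for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
      if (MI.isKill())
        MI.eraseFromParent();
  }
}

static void copyTailOperands(const llvm::MachineInstr &MI,
                             llvm::MachineInstrBuilder &NewMI) {
  // was: for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i)
  for (const llvm::MachineOperand &MO : llvm::drop_begin(MI.operands()))
    NewMI.add(MO);
}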
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; - def : RsqPat<RECIPSQRT_IEEE_r600, f32>; def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>; def R600_ExportSwz : ExportSwzInst { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 36acfafa72aa..6aee2f591b56 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -124,11 +124,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { DAG->dumpNode(*SU); } else { dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; + for (const SUnit &S : DAG->SUnits) if (!S.isScheduled) DAG->dumpNode(S); - } }); return SU; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 1a723279dc9f..72cf48c04e7f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -323,14 +323,12 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { TII = ST.getInstrInfo(); MRI = &Fn.getRegInfo(); - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = &*MBB; + for (MachineBasicBlock &MB : Fn) { PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); - for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + for (MachineBasicBlock::iterator MII = MB.begin(), MIIE = MB.end(); MII != MIIE; ++MII) { MachineInstr &MI = *MII; if (MI.getOpcode() != R600::REG_SEQUENCE) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index e858bba2983c..beb0aad86e89 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -343,20 +343,11 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // dependence between Insn 0 and Insn 2. This can lead to incorrect // packetization // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); - while (MI != End) { - if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF || - (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; - } - ++MI; + for (MachineBasicBlock &MBB : Fn) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + if (MI.isKill() || MI.getOpcode() == R600::IMPLICIT_DEF || + (MI.getOpcode() == R600::CF_ALU && !MI.getOperand(8).getImm())) + MBB.erase(MI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h index 777744f08cde..580e4bc417a4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h @@ -18,7 +18,8 @@ namespace llvm { enum SIRCFlags : uint8_t { // For vector registers. 
HasVGPR = 1 << 0, - HasAGPR = 1 << 1 + HasAGPR = 1 << 1, + HasSGPR = 1 << 2 }; // enum SIRCFlags namespace SIInstrFlags { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index cf93a63f26a0..f54778535b7c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -127,11 +127,11 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() { static bool hasVectorOperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) continue; - if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) + if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg()))) return true; } return false; @@ -259,7 +259,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, // VGPRz = REG_SEQUENCE VGPRx, sub0 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); - bool IsAGPR = TRI->hasAGPRs(DstRC); + bool IsAGPR = TRI->isAGPRClass(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { Register SrcReg = MI.getOperand(I).getReg(); @@ -853,7 +853,7 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) { Register PHIRes = MI.getOperand(0).getReg(); const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); - if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { + if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) { LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0)); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a3a0e9c9b9ac..200e00ee5521 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1586,17 +1586,9 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - switch (OpInfo.RegClass) { - case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH; - case AMDGPU::AV_160RegClassID: - break; - default: + if (!TRI->isVectorSuperClass( + TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass))) return false; - } const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)); auto Dst = MRI->createVirtualRegister(NewDstRC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 882b9a203755..4706c74be721 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1364,6 +1364,34 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( return false; } +bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const { + + const GCNSubtarget &ST = 
MF.getSubtarget<GCNSubtarget>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint64_t EstStackSize = MFI.estimateStackSize(MF); + uint64_t MaxOffset = EstStackSize - 1; + + // We need the emergency stack slots to be allocated in range of the + // MUBUF/flat scratch immediate offset from the base register, so assign these + // first at the incoming SP position. + // + // TODO: We could try sorting the objects to find a hole in the first bytes + // rather than allocating as close to possible. This could save a lot of space + // on frames with alignment requirements. + if (ST.enableFlatScratch()) { + const SIInstrInfo *TII = ST.getInstrInfo(); + if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) + return false; + } else { + if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) + return false; + } + + return true; +} + MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 951ea79b2809..56fbb875ffd9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -43,6 +43,9 @@ public: const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override; + bool isSupportedStackID(TargetStackID::Value ID) const override; void processFunctionBeforeFrameFinalized( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 519c5b936536..35b72f5d201b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -809,6 +809,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); + if (Subtarget->hasMad64_32()) { + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); + } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); @@ -919,6 +924,16 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, !hasFP32Denormals(DAG.getMachineFunction()); } +bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && + DestTy.getScalarSizeInBits() == 32 && + SrcTy.getScalarSizeInBits() == 16 && + // TODO: This probably only requires no input flushing? + !hasFP32Denormals(*MI.getMF()); +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. 
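The hunk above makes ISD::SMUL_LOHI / ISD::UMUL_LOHI custom-lowered when the subtarget has v_mad_[iu]64_[iu]32. For orientation, the node simply yields both halves of the widening multiply; a plain C++ model of the unsigned case (a sketch for illustration, not part of the patch):

#include <cstdint>
#include <utility>

// ISD::UMUL_LOHI on i32: return {low 32 bits, high 32 bits} of the full
// 64-bit product.  With V_MAD_U64_U32 this maps to one instruction (zero
// addend); per the comments added to lowerXMUL_LOHI below, uniform multiplies
// on subtargets with S_MUL_HI_U32 are instead expanded to
// S_MUL_I32 + S_MUL_HI_U32.
static std::pair<uint32_t, uint32_t> umul_lohi(uint32_t A, uint32_t B) {
  uint64_t Prod = uint64_t(A) * uint64_t(B);
  return {uint32_t(Prod), uint32_t(Prod >> 32)};   // {low half, high half}
}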
@@ -4290,8 +4305,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstrBuilder MIB; MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : MI.operands()) + MIB.add(MO); MIB.cloneMemRefs(MI); MI.eraseFromParent(); @@ -4457,6 +4472,8 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { return true; } +bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } + EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { @@ -4522,6 +4539,34 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + LLT Ty) const { + switch (Ty.getScalarSizeInBits()) { + case 16: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); + case 32: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); + case 64: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); + default: + break; + } + + return false; +} + +bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { + if (!Ty.isScalar()) + return false; + + if (Ty.getScalarSizeInBits() == 16) + return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF()); + if (Ty.getScalarSizeInBits() == 32) + return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF()); + + return false; +} + bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const { // TODO: Check future ftz flag @@ -4691,6 +4736,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMULO: case ISD::UMULO: return lowerXMULO(Op, DAG); + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } @@ -5304,6 +5352,21 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({ Result, Overflow }, SL); } +SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) { + // Select to V_MAD_[IU]64_[IU]32. + return Op; + } + if (Subtarget->hasSMulHi()) { + // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. + return SDValue(); + } + // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to + // calculate the high part, so we might as well do the whole thing with + // V_MAD_[IU]64_[IU]32. 
+ return Op; +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->isTrapHandlerEnabled() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) @@ -9790,10 +9853,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Subtarget->supportsMinMaxDenormModes() || denormalsEnabledForType(MRI.getType(Reg), MF)) return true; - for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) { - if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1)) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) + if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) return false; - } return true; } default: @@ -11460,15 +11522,15 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (I == -1) break; MachineOperand &Op = MI.getOperand(I); - if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && - OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || - !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg())) + if (!Op.isReg() || !Op.getReg().isVirtual()) + continue; + auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); + if (!TRI->hasAGPRs(RC)) continue; auto *Src = MRI.getUniqueVRegDef(Op.getReg()); if (!Src || !Src->isCopy() || !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) continue; - auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); auto *NewRC = TRI->getEquivalentVGPRClass(RC); // All uses of agpr64 and agpr32 can also accept vgpr except for // v_accvgpr_read, but we do not produce agpr reads during selection, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1e48c96ad3c8..1315cc15dd02 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -135,6 +135,7 @@ private: SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -252,6 +253,9 @@ public: bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override; + bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy, + LLT SrcTy) const override; + bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, @@ -377,6 +381,7 @@ public: bool hasBitPreservingFPLogic(EVT VT) const override; bool enableAggressiveFMAFusion(EVT VT) const override; + bool enableAggressiveFMAFusion(LLT Ty) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; @@ -384,7 +389,10 @@ public: bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; + bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f4e5771d2a2a..c9d9dd1fb82c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -150,6 +150,8 @@ enum VmemType { VMEM_NOSAMPLER, // MIMG instructions with a sampler. VMEM_SAMPLER, + // BVH instructions + VMEM_BVH }; VmemType getVmemType(const MachineInstr &Inst) { @@ -157,9 +159,10 @@ VmemType getVmemType(const MachineInstr &Inst) { if (!SIInstrInfo::isMIMG(Inst)) return VMEM_NOSAMPLER; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); - return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler - ? VMEM_SAMPLER - : VMEM_NOSAMPLER; + const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + return BaseInfo->BVH ? VMEM_BVH + : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; } void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4a928123b68f..92f5322b8ad2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -898,10 +898,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (RI.hasAGPRs(RC)) { + if (RI.isAGPRClass(RC)) { Opcode = (RI.hasVGPRs(SrcRC)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; - } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { + } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && (RI.isProperlyAlignedRC(*RC) && @@ -1205,7 +1205,7 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (RI.hasAGPRs(DstRC)) + if (RI.isAGPRClass(DstRC)) return AMDGPU::COPY; if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; @@ -1435,6 +1435,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, FrameInfo.getObjectAlign(FrameIndex)); unsigned SpillSize = TRI->getSpillSize(*RC); + MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); @@ -1448,7 +1449,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. if (SrcReg.isVirtual() && SpillSize == 4) { - MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } @@ -1463,10 +1463,21 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); + if (RI.isVectorSuperClass(RC)) { + // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an + // equivalent VGPR register beforehand. Regalloc might want to introduce + // AV spills only to be relevant until rewriter at which they become + // either spills of VGPRs or AGPRs. 
+ Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) + .addReg(SrcReg, RegState::Kill); + SrcReg = TmpVReg; + } + BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1598,13 +1609,26 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); + unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); + + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); + Register TmpReg = DestReg; + if (IsVectorSuperClass) { + // For AV classes, insert the spill restore to a VGPR followed by a copy + // into an equivalent AV register. + MachineRegisterInfo &MRI = MF->getRegInfo(); + DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + } BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); + + if (IsVectorSuperClass) + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) + .addReg(DestReg, RegState::Kill); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2802,12 +2826,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } if (Is16Bit) { - if (isVGPRCopy) - return false; // Do not clobber vgpr_hi16 + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 - if (DstReg.isVirtual() && - UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) - return false; + if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; UseMI.getOperand(0).setSubReg(0); if (DstReg.isPhysical()) { @@ -3896,9 +3919,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // verification is broken anyway if (ST.needsAlignedVGPRs()) { const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); - const bool IsVGPR = RI.hasVGPRs(RC); - const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); - if ((IsVGPR || IsAGPR) && MO.getSubReg()) { + if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, MO.getSubReg()); RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); @@ -5522,13 +5543,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { VRC = &AMDGPU::VReg_1RegClass; } else - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) : RI.getEquivalentVGPRClass(SRC); } else { - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(VRC) - : RI.getEquivalentVGPRClass(VRC); + VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) + ? 
RI.getEquivalentAGPRClass(VRC) + : RI.getEquivalentVGPRClass(VRC); } RC = VRC; } else { @@ -7065,8 +7086,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::STRICT_WWM: case AMDGPU::STRICT_WQM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); - if (RI.hasAGPRs(SrcRC)) { - if (RI.hasAGPRs(NewDstRC)) + if (RI.isAGPRClass(SrcRC)) { + if (RI.isAGPRClass(NewDstRC)) return nullptr; switch (Inst.getOpcode()) { @@ -7082,7 +7103,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) + if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8c24268e379e..47ee83eb9351 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2246,7 +2246,7 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExtSDWA9 = 0; } -class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> { +class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> { let NeedPatGen = mode; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index d5f9cb8ba493..d55d8da8699a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -827,10 +827,6 @@ def : Pat < let OtherPredicates = [UnsafeFPMath] in { -//defm : RsqPat<V_RSQ_F32_e32, f32>; - -def : RsqPat<V_RSQ_F32_e32, f32>; - // Convert (x - floor(x)) to fract(x) def : GCNPat < (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), @@ -1372,61 +1368,48 @@ def : GCNPat < >; } + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ -// Prevent expanding both fneg and fabs. -// TODO: Add IgnoredBySelectionDAG bit? 
-let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG - def : GCNPat < - (fneg (fabs (f32 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; def : GCNPat < - (fabs (f32 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) >; def : GCNPat < - (fneg (f32 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) >; def : GCNPat < - (fneg (f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) >; def : GCNPat < - (fneg (f16 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) ->; - -def : GCNPat < - (fabs (f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) >; def : GCNPat < - (fneg (fabs (f16 SReg_32:$src))), + (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; def : GCNPat < - (fneg (fabs (f16 VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit ->; - -def : GCNPat < - (fneg (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) >; def : GCNPat < - (fabs (v2f16 SReg_32:$src)), + (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) >; @@ -1435,51 +1418,20 @@ def : GCNPat < // fabs is not reported as free because there is modifier for it in // VOP3P instructions, so it is turned into the bit op. def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fneg (v2f16 (fabs SReg_32:$src))), + (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; -// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled - // def : GCNPat < -// (fneg (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x80000000)))), -// sub1) -// >; - -// def : GCNPat < -// (fneg (fabs (f64 SReg_64:$src))), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. -// sub1) -// >; - -// FIXME: Use S_BITSET0_B32/B64? -// def : GCNPat < -// (fabs (f64 SReg_64:$src)), -// (REG_SEQUENCE SReg_64, -// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), -// sub0, -// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), -// (i32 (S_MOV_B32 (i32 0x7fffffff)))), -// sub1) -// >; // COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead // of the real value. 
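(The v2f32 GCNPat immediately below is the pattern the COPY_TO_REGCLASS note above refers to.) For orientation, the immediates these fneg/fabs patterns feed to S_XOR/S_AND/S_OR and their VALU counterparts are ordinary IEEE-754 sign-bit masks; a minimal C++ illustration (uses C++20 std::bit_cast purely for brevity, not part of the patch):

#include <bit>
#include <cstdint>

// f32:  fneg(x) = x ^ 0x80000000, fabs(x) = x & 0x7fffffff,
//       fneg(fabs(x)) = x | 0x80000000.
// f16 uses 0x8000 / 0x7fff; packed v2f16 uses 0x80008000 / 0x7fff7fff.
static float fnegViaBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
}
static float fabsViaBits(float X) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(X) & 0x7fffffffu);
}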
def : GCNPat < - (fneg (v2f32 SReg_64:$src)), + (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), (v2f32 (REG_SEQUENCE SReg_64, (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), (i32 (S_MOV_B32 (i32 0x80000000)))), @@ -1489,36 +1441,103 @@ def : GCNPat < SReg_32)), sub1)) >; -} // End let AddedComplexity = 1 +def : GCNPat < + (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x7fffffff)))), + SReg_32)), sub1)) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub1)) +>; + +// FIXME: Use S_BITSET0_B32/B64? +def : GCNPat < + (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), + sub1) +>; + +def : GCNPat < + (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), + sub0, + (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), + (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. 
+ sub1) +>; + + +def : GCNPat < + (fneg (fabs (f32 VGPR_32:$src))), + (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit +>; def : GCNPat < (fabs (f32 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) >; def : GCNPat < (fneg (f32 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) >; def : GCNPat < (fabs (f16 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit >; def : GCNPat < (fneg (v2f16 VGPR_32:$src)), - (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < (fabs (v2f16 VGPR_32:$src)), - (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) >; def : GCNPat < (fneg (v2f16 (fabs VGPR_32:$src))), - (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit + (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < @@ -1526,30 +1545,28 @@ def : GCNPat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. + (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (i32 (V_MOV_B32_e32 (i32 0x80000000)))), + (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; -// TODO: Use SGPR for constant def : GCNPat < (fneg (fabs (f64 VReg_64:$src))), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), sub1) >; @@ -1681,14 +1698,9 @@ def : GCNPat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -let OtherPredicates = [isNotGFX90APlus] in -// FIXME: Should use _e64 and select source modifiers. 
-def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; - -let OtherPredicates = [isGFX90APlus] in def : GCNPat < - (fpow f32:$src0, f32:$src1), - (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0)) + (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))), + (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0)) >; def : GCNPat < diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 34cbb49dcd16..f4d9002e930e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1609,7 +1609,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, } unsigned BitWidth = 32 * (CI.Width + Paired.Width); - return TRI->hasAGPRs(getDataRegClass(*CI.I)) + return TRI->isAGPRClass(getDataRegClass(*CI.I)) ? TRI->getAGPRClassForBitWidth(BitWidth) : TRI->getVGPRClassForBitWidth(BitWidth); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 29f072ca1e6c..fff4f6729c99 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -795,6 +795,8 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set L1 cache policy to MISS_EVICT. + // Note: there is no L2 cache bypass policy at the ISA level. Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -837,8 +839,10 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically + /// bypassed, and the GLC bit is instead used to indicate if they are + /// return or no-return. + /// Note: there is no L2 cache coherent bypass control at the ISA level. return Changed; } @@ -860,6 +864,9 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -875,7 +882,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1097,6 +1105,8 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass( switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L1 cache policy to MISS_LRU. + // Note: there is no L2 cache bypass policy at the ISA level. 
Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -1206,6 +1216,9 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -1221,7 +1234,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1380,12 +1394,11 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( bool Changed = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - /// TODO Do not set glc for rmw atomic operations as they - /// implicitly bypass the L0/L1 caches. - switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L0 and L1 cache policies to MISS_EVICT. + // Note: there is no L2 cache coherent bypass control at the ISA level. Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); break; @@ -1434,6 +1447,9 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { + // Set L0 and L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); @@ -1450,8 +1466,14 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( } if (IsNonTemporal) { - // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions. + // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT + // and L2 cache policy to STREAM. + // For stores setting both GLC and SLC configures L0 and L1 cache policy + // to MISS_EVICT and the L2 cache policy to STREAM. 
+ if (Op == SIMemOp::STORE) + Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); + return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 6a698348d389..da41a5e2478a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1170,7 +1170,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || - !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) + !TRI->isVSSuperClass(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index d1b8e217471e..b0e45dd3e3e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -291,20 +291,19 @@ bool SIPreEmitPeephole::mustRetainExeczBranch( MBBI != End && MBBI != ToI; ++MBBI) { const MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + for (const MachineInstr &MI : MBB) { // When a uniform loop is inside non-uniform control flow, the branch // leaving the loop might never be taken when EXEC = 0. // Hence we should retain cbranch out of the loop lest it become infinite. - if (I->isConditionalBranch()) + if (MI.isConditionalBranch()) return true; - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) + if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) return true; // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) + if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) || + TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT) return true; ++NumInstr; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index bfbe84f696f8..a1d9a23a5084 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -402,6 +402,62 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const { return CSR_AMDGPU_NoRegs_RegMask; } +const TargetRegisterClass * +SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + // FIXME: Should have a helper function like getEquivalentVGPRClass to get the + // equivalent AV class. If used one, the verifier will crash after + // RegBankSelect in the GISel flow. The aligned regclasses are not fully given + // until Instruction selection. 
+ if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && + (isVGPRClass(RC) || isAGPRClass(RC))) { + if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) + return &AMDGPU::AV_32RegClass; + if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) + return &AMDGPU::AV_64RegClass; + if (RC == &AMDGPU::VReg_64_Align2RegClass || + RC == &AMDGPU::AReg_64_Align2RegClass) + return &AMDGPU::AV_64_Align2RegClass; + if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) + return &AMDGPU::AV_96RegClass; + if (RC == &AMDGPU::VReg_96_Align2RegClass || + RC == &AMDGPU::AReg_96_Align2RegClass) + return &AMDGPU::AV_96_Align2RegClass; + if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) + return &AMDGPU::AV_128RegClass; + if (RC == &AMDGPU::VReg_128_Align2RegClass || + RC == &AMDGPU::AReg_128_Align2RegClass) + return &AMDGPU::AV_128_Align2RegClass; + if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) + return &AMDGPU::AV_160RegClass; + if (RC == &AMDGPU::VReg_160_Align2RegClass || + RC == &AMDGPU::AReg_160_Align2RegClass) + return &AMDGPU::AV_160_Align2RegClass; + if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) + return &AMDGPU::AV_192RegClass; + if (RC == &AMDGPU::VReg_192_Align2RegClass || + RC == &AMDGPU::AReg_192_Align2RegClass) + return &AMDGPU::AV_192_Align2RegClass; + if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) + return &AMDGPU::AV_256RegClass; + if (RC == &AMDGPU::VReg_256_Align2RegClass || + RC == &AMDGPU::AReg_256_Align2RegClass) + return &AMDGPU::AV_256_Align2RegClass; + if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) + return &AMDGPU::AV_512RegClass; + if (RC == &AMDGPU::VReg_512_Align2RegClass || + RC == &AMDGPU::AReg_512_Align2RegClass) + return &AMDGPU::AV_512_Align2RegClass; + if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) + return &AMDGPU::AV_1024RegClass; + if (RC == &AMDGPU::VReg_1024_Align2RegClass || + RC == &AMDGPU::AReg_1024_Align2RegClass) + return &AMDGPU::AV_1024_Align2RegClass; + } + + return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); +} + Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const SIFrameLowering *TFI = MF.getSubtarget<GCNSubtarget>().getFrameLowering(); @@ -994,10 +1050,22 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, unsigned Dst = IsStore ? Reg : ValueReg; unsigned Src = IsStore ? ValueReg : Reg; - unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 - : AMDGPU::V_ACCVGPR_READ_B32_e64; + bool IsVGPR = TRI->isVGPR(MRI, Reg); + DebugLoc DL = MI->getDebugLoc(); + if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { + // Spiller during regalloc may restore a spilled register to its superclass. + // It could result in AGPR spills restored to VGPRs or the other way around, + // making the src and dst with identical regclasses at this point. It just + // needs a copy in such cases. + auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); + CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); + return CopyMIB; + } + unsigned Opc = (IsStore ^ IsVGPR) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 + : AMDGPU::V_ACCVGPR_READ_B32_e64; - auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) .addReg(Src, getKillRegState(IsKill)); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); return MIB; @@ -1099,7 +1167,7 @@ void SIRegisterInfo::buildSpillLoadStore( const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. - const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC); + const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; // Always use 4 byte operations for AGPRs because we need to scavenge @@ -2163,6 +2231,65 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { : getAnyAGPRClassForBitWidth(BitWidth); } +static const TargetRegisterClass * +getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AV_64RegClass; + if (BitWidth <= 96) + return &AMDGPU::AV_96RegClass; + if (BitWidth <= 128) + return &AMDGPU::AV_128RegClass; + if (BitWidth <= 160) + return &AMDGPU::AV_160RegClass; + if (BitWidth <= 192) + return &AMDGPU::AV_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::AV_224RegClass; + if (BitWidth <= 256) + return &AMDGPU::AV_256RegClass; + if (BitWidth <= 512) + return &AMDGPU::AV_512RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AV_1024RegClass; + + return nullptr; +} + +static const TargetRegisterClass * +getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 64) + return &AMDGPU::AV_64_Align2RegClass; + if (BitWidth <= 96) + return &AMDGPU::AV_96_Align2RegClass; + if (BitWidth <= 128) + return &AMDGPU::AV_128_Align2RegClass; + if (BitWidth <= 160) + return &AMDGPU::AV_160_Align2RegClass; + if (BitWidth <= 192) + return &AMDGPU::AV_192_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::AV_224_Align2RegClass; + if (BitWidth <= 256) + return &AMDGPU::AV_256_Align2RegClass; + if (BitWidth <= 512) + return &AMDGPU::AV_512_Align2RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AV_1024_Align2RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { + if (BitWidth <= 16) + return &AMDGPU::VGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::AV_32RegClass; + return ST.needsAlignedVGPRs() + ? getAlignedVectorSuperClassForBitWidth(BitWidth) + : getAnyVectorSuperClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth <= 16) @@ -2305,15 +2432,14 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( // We can assume that each lane corresponds to one 32-bit register. 
unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; - if (isSGPRClass(RC)) { - if (Size == 32) - RC = &AMDGPU::SGPR_32RegClass; - else - RC = getSGPRClassForBitWidth(Size); - } else if (hasAGPRs(RC)) { + if (isAGPRClass(RC)) { RC = getAGPRClassForBitWidth(Size); - } else { + } else if (isVGPRClass(RC)) { RC = getVGPRClassForBitWidth(Size); + } else if (isVectorSuperClass(RC)) { + RC = getVectorSuperClassForBitWidth(Size); + } else { + RC = getSGPRClassForBitWidth(Size); } assert(RC && "Invalid sub-register class size"); return RC; @@ -2626,10 +2752,13 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { if (!ST.needsAlignedVGPRs()) return true; - if (hasVGPRs(&RC)) + if (isVGPRClass(&RC)) return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC))); - if (hasAGPRs(&RC)) + if (isAGPRClass(&RC)) return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC))); + if (isVectorSuperClass(&RC)) + return RC.hasSuperClassEq( + getVectorSuperClassForBitWidth(getRegSizeInBits(RC))); return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 8d90ddb1cf4c..f1fe0a1d9329 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -77,6 +77,10 @@ public: return 100; } + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; @@ -156,6 +160,10 @@ public: const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const; LLVM_READONLY + const TargetRegisterClass * + getVectorSuperClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// Return the 'base' register class for this register. @@ -164,7 +172,7 @@ public: /// \returns true if this class contains only SGPR registers static bool isSGPRClass(const TargetRegisterClass *RC) { - return !hasVGPRs(RC) && !hasAGPRs(RC); + return hasSGPRs(RC) && !hasVGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class ID contains only SGPR registers @@ -176,12 +184,22 @@ public: /// \returns true if this class contains only VGPR registers static bool isVGPRClass(const TargetRegisterClass *RC) { - return hasVGPRs(RC) && !hasAGPRs(RC); + return hasVGPRs(RC) && !hasAGPRs(RC) && !hasSGPRs(RC); } /// \returns true if this class contains only AGPR registers static bool isAGPRClass(const TargetRegisterClass *RC) { - return hasAGPRs(RC) && !hasVGPRs(RC); + return hasAGPRs(RC) && !hasVGPRs(RC) && !hasSGPRs(RC); + } + + /// \returns true only if this class contains both VGPR and AGPR registers + bool isVectorSuperClass(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) && hasAGPRs(RC) && !hasSGPRs(RC); + } + + /// \returns true only if this class contains both VGPR and SGPR registers + bool isVSSuperClass(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) && hasSGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class contains VGPR registers. @@ -194,6 +212,11 @@ public: return RC->TSFlags & SIRCFlags::HasAGPR; } + /// \returns true if this class contains SGPR registers. 
+ static bool hasSGPRs(const TargetRegisterClass *RC) { + return RC->TSFlags & SIRCFlags::HasSGPR; + } + /// \returns true if this class contains any vector registers. static bool hasVectorRegisters(const TargetRegisterClass *RC) { return hasVGPRs(RC) || hasAGPRs(RC); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index cf1d90484228..340e2b48e5cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -133,9 +133,13 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> field bit HasVGPR = 0; field bit HasAGPR = 0; + // For scalar register classes. + field bit HasSGPR = 0; + // These need to be kept in sync with the enum SIRCFlags. let TSFlags{0} = HasVGPR; let TSFlags{1} = HasAGPR; + let TSFlags{2} = HasSGPR; } multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1, @@ -307,45 +311,51 @@ foreach Index = 0...255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// -def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { +def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> { let CopyCost = -1; let isAllocatable = 0; + let HasSGPR = 1; } -def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { +def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } -def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { +def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { let CopyCost = 1; let Size = 16; let isAllocatable = 0; + let HasSGPR = 1; } // TODO: Do we need to set DwarfRegAlias on register tuples? -def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { let AllocationPriority = 9; let Size = 16; let GeneratePressureSet = 0; + let HasSGPR = 1; } -def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_HI16", 0, 105))> { let isAllocatable = 0; let Size = 16; let GeneratePressureSet = 0; + let HasSGPR = 1; } // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
let AllocationPriority = 9; let GeneratePressureSet = 0; + let HasSGPR = 1; } // SGPR 64-bit registers @@ -376,16 +386,18 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s" def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">; // Trap handler TMP 32-bit registers -def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, +def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 15))> { let isAllocatable = 0; + let HasSGPR = 1; } // Trap handler TMP 16-bit registers -def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "TTMP%u_LO16", 0, 15))> { let Size = 16; let isAllocatable = 0; + let HasSGPR = 1; } // Trap handler TMP 64-bit registers @@ -598,16 +610,18 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">; // Register classes used as source and destination //===----------------------------------------------------------------------===// -def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; + let HasSGPR = 1; } -def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; + let HasSGPR = 1; } def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, @@ -616,10 +630,10 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, let CopyCost = -1; } -let GeneratePressureSet = 0 in { +let GeneratePressureSet = 0, HasSGPR = 1 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, @@ -627,7 +641,7 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 let AllocationPriority = 10; } -def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, @@ -637,29 +651,29 @@ def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16, let AllocationPriority = 10; } -def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 10; } -def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; let AllocationPriority = 10; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 10; } -def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; let AllocationPriority = 10; } -def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, +def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; let AllocationPriority = 10; @@ -667,65 +681,75 @@ def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, +def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 10; + let HasSGPR = 1; } let GeneratePressureSet = 0 in { -def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; + let HasSGPR = 1; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, +def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 11; + let HasSGPR = 1; } // CCR (call clobbered registers) SGPR 64-bit registers -def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, +def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 16))> { let CopyCost = SGPR_64.CopyCost; let AllocationPriority = 
SGPR_64.AllocationPriority; + let HasSGPR = 1; } // Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC -def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, +def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc (shl SGPR_64, 15), 1), // s[30:31] (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] let CopyCost = SGPR_64.CopyCost; let AllocationPriority = SGPR_64.AllocationPriority; + let HasSGPR = 1; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, +def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; + let HasSGPR = 1; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; + let HasSGPR = 1; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, +def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 13; + let HasSGPR = 1; } -def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32, +def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, (add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } -def SReg_1 : RegisterClass<"AMDGPU", [i1], 32, +def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add SReg_1_XEXEC, EXEC, EXEC_LO)> { let CopyCost = 1; let isAllocatable = 0; + let HasSGPR = 1; } multiclass SRegClass<int numRegs, int priority, @@ -738,18 +762,18 @@ multiclass SRegClass<int numRegs, int priority, defvar sgprName = !strconcat("SGPR_", suffix); defvar ttmpName = !strconcat("TTMP_", suffix); - let AllocationPriority = priority, CopyCost = copyCost in { - def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> { + let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in { + def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> { } if hasTTMP then { - def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> { + def "" # ttmpName : SIRegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> { let isAllocatable = 0; } } def SReg_ # suffix : - RegisterClass<"AMDGPU", regTypes, 32, + SIRegisterClass<"AMDGPU", regTypes, 32, !con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]), !if(hasTTMP, !dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]), @@ -855,44 +879,45 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; let HasVGPR = 1; + let HasSGPR = 1; } def VS_64 : SIRegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; let HasVGPR = 1; + let HasSGPR = 1; } -def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, - (add AGPR_32, VGPR_32)> { - let isAllocatable = 0; - let HasVGPR = 1; - let HasAGPR = 1; -} - -def AV_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, - (add AReg_64, VReg_64)> { - let isAllocatable = 0; +def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; } } // End GeneratePressureSet = 0 -let HasVGPR = 1, HasAGPR = 1 in { -def AV_96 : SIRegisterClass<"AMDGPU", VReg_96.RegTypes, 32, - (add AReg_96, VReg_96)> { - let isAllocatable = 0; -} +// Define 
a register tuple class, along with one requiring an even +// aligned base register. +multiclass AVRegClass<int numRegs, list<ValueType> regTypes, + dag vregList, dag aregList> { + let HasVGPR = 1, HasAGPR = 1 in { + // Define the regular class. + def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; -def AV_128 : SIRegisterClass<"AMDGPU", VReg_128.RegTypes, 32, - (add AReg_128, VReg_128)> { - let isAllocatable = 0; + // Define 2-aligned variant + def _Align2 : VRegClassBase<numRegs, regTypes, + (add (decimate vregList, 2), + (decimate aregList, 2))>; + } } -def AV_160 : SIRegisterClass<"AMDGPU", VReg_160.RegTypes, 32, - (add AReg_160, VReg_160)> { - let isAllocatable = 0; -} -} // End HasVGPR = 1, HasAGPR = 1 +defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>; +defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>; +defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>; +defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>; +defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>; +defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>; +defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>; +defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>; +defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>; //===----------------------------------------------------------------------===// // Register operands diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td index 0792b303b830..18d424a3bc9f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td @@ -93,16 +93,16 @@ def HWBranch : ProcResource<1> { let BufferSize = 1; } def HWExport : ProcResource<1> { - let BufferSize = 7; // Taken from S_WAITCNT + let BufferSize = 1; } def HWLGKM : ProcResource<1> { - let BufferSize = 31; // Taken from S_WAITCNT + let BufferSize = 1; } def HWSALU : ProcResource<1> { let BufferSize = 1; } def HWVMEM : ProcResource<1> { - let BufferSize = 15; // Taken from S_WAITCNT + let BufferSize = 1; } def HWVALU : ProcResource<1> { let BufferSize = 1; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 6f63f686635a..46012e5d7d97 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -487,6 +487,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector<MachineInstr *, 4> SetInactiveInstrs; SmallVector<MachineInstr *, 4> SoftWQMInstrs; + bool HasImplicitDerivatives = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -497,8 +499,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, MachineBasicBlock &MBB = **BI; BlockInfo &BBI = Blocks[&MBB]; - for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { - MachineInstr &MI = *II; + for (MachineInstr &MI : MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -507,6 +508,11 @@ char 
SIWholeQuadMode::scanInstructions(MachineFunction &MF, // If LOD is not supported WQM is not needed. if (!ST->hasExtendedImageInsts()) continue; + // Only generate implicit WQM if implicit derivatives are required. + // This avoids inserting unintended WQM if a shader type without + // implicit derivatives uses an image sampling instruction. + if (!HasImplicitDerivatives) + continue; // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9da7b9f5145d..d20eaaaa65e8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1626,13 +1626,14 @@ unsigned getRegBitWidth(unsigned RCID) { return 32; case AMDGPU::SGPR_64RegClassID: case AMDGPU::VS_64RegClassID: - case AMDGPU::AV_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: case AMDGPU::AReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: case AMDGPU::VReg_64_Align2RegClassID: case AMDGPU::AReg_64_Align2RegClassID: + case AMDGPU::AV_64RegClassID: + case AMDGPU::AV_64_Align2RegClassID: return 64; case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: @@ -1641,6 +1642,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_96_Align2RegClassID: case AMDGPU::AReg_96_Align2RegClassID: case AMDGPU::AV_96RegClassID: + case AMDGPU::AV_96_Align2RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: @@ -1649,6 +1651,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_128_Align2RegClassID: case AMDGPU::AReg_128_Align2RegClassID: case AMDGPU::AV_128RegClassID: + case AMDGPU::AV_128_Align2RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: @@ -1657,6 +1660,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VReg_160_Align2RegClassID: case AMDGPU::AReg_160_Align2RegClassID: case AMDGPU::AV_160RegClassID: + case AMDGPU::AV_160_Align2RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: @@ -1664,6 +1668,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_192RegClassID: case AMDGPU::VReg_192_Align2RegClassID: case AMDGPU::AReg_192_Align2RegClassID: + case AMDGPU::AV_192RegClassID: + case AMDGPU::AV_192_Align2RegClassID: return 192; case AMDGPU::SGPR_224RegClassID: case AMDGPU::SReg_224RegClassID: @@ -1671,6 +1677,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_224RegClassID: case AMDGPU::VReg_224_Align2RegClassID: case AMDGPU::AReg_224_Align2RegClassID: + case AMDGPU::AV_224RegClassID: + case AMDGPU::AV_224_Align2RegClassID: return 224; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: @@ -1678,6 +1686,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_256RegClassID: case AMDGPU::VReg_256_Align2RegClassID: case AMDGPU::AReg_256_Align2RegClassID: + case AMDGPU::AV_256RegClassID: + case AMDGPU::AV_256_Align2RegClassID: return 256; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: @@ -1685,6 +1695,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_512RegClassID: case AMDGPU::VReg_512_Align2RegClassID: case AMDGPU::AReg_512_Align2RegClassID: + case AMDGPU::AV_512RegClassID: + case AMDGPU::AV_512_Align2RegClassID: return 512; case 
AMDGPU::SGPR_1024RegClassID: case AMDGPU::SReg_1024RegClassID: @@ -1692,6 +1704,8 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::AReg_1024RegClassID: case AMDGPU::VReg_1024_Align2RegClassID: case AMDGPU::AReg_1024_Align2RegClassID: + case AMDGPU::AV_1024RegClassID: + case AMDGPU::AV_1024_Align2RegClassID: return 1024; default: llvm_unreachable("Unexpected register class"); diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp index 62462b77eccf..50ba9fe75232 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCMCInstLower.cpp @@ -104,8 +104,7 @@ MCOperand ARCMCInstLower::LowerOperand(const MachineOperand &MO, void ARCMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h index 5500783f74db..1d5e45aec06c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h @@ -44,6 +44,7 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMBranchTargetsPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -66,6 +67,7 @@ void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +void initializeARMBranchTargetsPass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td index 8cbd80f1bf65..e03dd597eb65 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td @@ -442,6 +442,10 @@ def FeatureFixCMSE_CVE_2021_35465 : SubtargetFeature<"fix-cmse-cve-2021-35465", "Mitigate against the cve-2021-35465 " "security vulnurability">; +def FeaturePACBTI : SubtargetFeature<"pacbti", "HasPACBTI", "true", + "Enable Pointer Authentication and Branch " + "Target Identification">; + //===----------------------------------------------------------------------===// // ARM architecture class // diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 9901b86b0e87..6a88ac485e69 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -763,6 +763,32 @@ void ARMAsmPrinter::emitAttributes() { int EnumBuildAttr = EnumWidth == 1 ? 
1 : 2; ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); } + + auto *PACValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("sign-return-address")); + if (PACValue && PACValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_PAC_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::PAC_extension, + ARMBuildAttrs::AllowPACInNOPSpace); + } + ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + } + + auto *BTIValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("branch-target-enforcement")); + if (BTIValue && BTIValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_BTI_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::BTI_extension, + ARMBuildAttrs::AllowBTIInNOPSpace); + } + ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); + } } } @@ -1535,17 +1561,17 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst.addExpr(BranchTarget); } - if (Opc == ARM::t2BFic) { - const MCExpr *ElseLabel = MCSymbolRefExpr::create( - getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getIndex(), OutContext), - OutContext); - MCInst.addExpr(ElseLabel); - MCInst.addImm(MI->getOperand(3).getImm()); - } else { - MCInst.addImm(MI->getOperand(2).getImm()) - .addReg(MI->getOperand(3).getReg()); - } + if (Opc == ARM::t2BFic) { + const MCExpr *ElseLabel = MCSymbolRefExpr::create( + getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getIndex(), OutContext), + OutContext); + MCInst.addExpr(ElseLabel); + MCInst.addImm(MI->getOperand(3).getImm()); + } else { + MCInst.addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg()); + } EmitToStreamer(*OutStreamer, MCInst); return; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2d981be4cfc1..2a12947d24a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -310,8 +310,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, // Transfer LiveVariables states, kill / dead info. 
if (LV) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) { Register Reg = MO.getReg(); @@ -634,8 +633,7 @@ bool ARMBaseInstrInfo::ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred, bool SkipDead) const { bool Found = false; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { bool ClobbersCPSR = MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR); bool IsCPSR = MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR; if (ClobbersCPSR || IsCPSR) { @@ -732,8 +730,7 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const { namespace llvm { template <> bool IsCPSRDead<MachineInstr>(const MachineInstr *MI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isUndef() || MO.isUse()) continue; if (MO.getReg() != ARM::CPSR) @@ -1860,15 +1857,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const { unsigned Opcode = MI0.getOpcode(); - if (Opcode == ARM::t2LDRpci || - Opcode == ARM::t2LDRpci_pic || - Opcode == ARM::tLDRpci || - Opcode == ARM::tLDRpci_pic || - Opcode == ARM::LDRLIT_ga_pcrel || - Opcode == ARM::LDRLIT_ga_pcrel_ldr || - Opcode == ARM::tLDRLIT_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel_ldr || + if (Opcode == ARM::t2LDRpci || Opcode == ARM::t2LDRpci_pic || + Opcode == ARM::tLDRpci || Opcode == ARM::tLDRpci_pic || + Opcode == ARM::LDRLIT_ga_pcrel || Opcode == ARM::LDRLIT_ga_pcrel_ldr || + Opcode == ARM::tLDRLIT_ga_pcrel || Opcode == ARM::t2LDRLIT_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel || Opcode == ARM::MOV_ga_pcrel_ldr || Opcode == ARM::t2MOV_ga_pcrel) { if (MI1.getOpcode() != Opcode) return false; @@ -1880,11 +1873,9 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0, if (MO0.getOffset() != MO1.getOffset()) return false; - if (Opcode == ARM::LDRLIT_ga_pcrel || - Opcode == ARM::LDRLIT_ga_pcrel_ldr || - Opcode == ARM::tLDRLIT_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel || - Opcode == ARM::MOV_ga_pcrel_ldr || + if (Opcode == ARM::LDRLIT_ga_pcrel || Opcode == ARM::LDRLIT_ga_pcrel_ldr || + Opcode == ARM::tLDRLIT_ga_pcrel || Opcode == ARM::t2LDRLIT_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel || Opcode == ARM::MOV_ga_pcrel_ldr || Opcode == ARM::t2MOV_ga_pcrel) // Ignore the PC labels. return MO0.getGlobal() == MO1.getGlobal(); @@ -2312,8 +2303,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading CPSR. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 1)) { // Reject frame index operands, PEI can't handle the predicated pseudos. 
if (MO.isFI() || MO.isCPI() || MO.isJTI()) return nullptr; @@ -4857,11 +4847,10 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, if (MI.getOpcode() == ARM::tPUSH || MI.getOpcode() == ARM::tPOP || MI.getOpcode() == ARM::tPOP_RET) { - for (int i = 2, e = MI.getNumOperands(); i < e; ++i) { - if (MI.getOperand(i).isImplicit() || - !MI.getOperand(i).isReg()) + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), 2)) { + if (MO.isImplicit() || !MO.isReg()) continue; - Register Reg = MI.getOperand(i).getReg(); + Register Reg = MO.getReg(); if (Reg < ARM::R0 || Reg > ARM::R7) { if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) && !(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) { @@ -5748,17 +5737,17 @@ enum MachineOutlinerMBBFlags { }; struct OutlinerCosts { - const int CallTailCall; - const int FrameTailCall; - const int CallThunk; - const int FrameThunk; - const int CallNoLRSave; - const int FrameNoLRSave; - const int CallRegSave; - const int FrameRegSave; - const int CallDefault; - const int FrameDefault; - const int SaveRestoreLROnStack; + int CallTailCall; + int FrameTailCall; + int CallThunk; + int FrameThunk; + int CallNoLRSave; + int FrameNoLRSave; + int CallRegSave; + int FrameRegSave; + int CallDefault; + int FrameDefault; + int SaveRestoreLROnStack; OutlinerCosts(const ARMSubtarget &target) : CallTailCall(target.isThumb() ? 4 : 4), @@ -5879,6 +5868,24 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( return outliner::OutlinedFunction(); } + // Partition the candidates in two sets: one with BTI enabled and one with BTI + // disabled. Remove the candidates from the smaller set. We expect the + // majority of the candidates to be in consensus with regard to branch target + // enforcement with just a few oddballs, but if they are the same number + // prefer the non-BTI ones for outlining, since they have less overhead. + auto NoBTI = + llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { + const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); + return AFI.branchTargetEnforcement(); + }); + if (std::distance(RepeatedSequenceLocs.begin(), NoBTI) > + std::distance(NoBTI, RepeatedSequenceLocs.end())) + RepeatedSequenceLocs.erase(NoBTI, RepeatedSequenceLocs.end()); + else + RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoBTI); + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + // At this point, we have only "safe" candidates to outline. Figure out // frame + call instruction information. @@ -5892,6 +5899,16 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( }; OutlinerCosts Costs(Subtarget); + const auto &SomeMFI = + *RepeatedSequenceLocs.front().getMF()->getInfo<ARMFunctionInfo>(); + // Adjust costs to account for the BTI instructions. + if (SomeMFI.branchTargetEnforcement()) { + Costs.FrameDefault += 4; + Costs.FrameNoLRSave += 4; + Costs.FrameRegSave += 4; + Costs.FrameTailCall += 4; + Costs.FrameThunk += 4; + } unsigned FrameID = MachineOutlinerDefault; unsigned NumBytesToCreateFrame = Costs.FrameDefault; @@ -6004,16 +6021,18 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, // Stack might be involved but addressing mode doesn't handle any offset. 
// Rq: AddrModeT1_[1|2|4] don't operate on SP - if (AddrMode == ARMII::AddrMode1 // Arithmetic instructions - || AddrMode == ARMII::AddrMode4 // Load/Store Multiple - || AddrMode == ARMII::AddrMode6 // Neon Load/Store Multiple - || AddrMode == ARMII::AddrModeT2_so // SP can't be used as based register - || AddrMode == ARMII::AddrModeT2_pc // PCrel access - || AddrMode == ARMII::AddrMode2 // Used by PRE and POST indexed LD/ST - || AddrMode == ARMII::AddrModeT2_i7 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s2 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s4 // v8.1-M sys regs VLDR/VSTR - || AddrMode == ARMII::AddrModeNone) + if (AddrMode == ARMII::AddrMode1 || // Arithmetic instructions + AddrMode == ARMII::AddrMode4 || // Load/Store Multiple + AddrMode == ARMII::AddrMode6 || // Neon Load/Store Multiple + AddrMode == ARMII::AddrModeT2_so || // SP can't be used as based register + AddrMode == ARMII::AddrModeT2_pc || // PCrel access + AddrMode == ARMII::AddrMode2 || // Used by PRE and POST indexed LD/ST + AddrMode == ARMII::AddrModeT2_i7 || // v8.1-M MVE + AddrMode == ARMII::AddrModeT2_i7s2 || // v8.1-M MVE + AddrMode == ARMII::AddrModeT2_i7s4 || // v8.1-M sys regs VLDR/VSTR + AddrMode == ARMII::AddrModeNone || + AddrMode == ARMII::AddrModeT2_i8 || // Pre/Post inc instructions + AddrMode == ARMII::AddrModeT2_i8neg) // Always negative imm return false; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -6051,7 +6070,7 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, NumBits = 8; Scale = 2; break; - case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8pos: NumBits = 8; break; case ARMII::AddrModeT2_i8s4: @@ -6089,7 +6108,18 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, } return false; +} + +void ARMBaseInstrInfo::mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const { + outliner::Candidate &C = Candidates.front(); + // branch-target-enforcement is guaranteed to be consistent between all + // candidates, so we only need to look at one. 
+ const Function &CFn = C.getMF()->getFunction(); + if (CFn.hasFnAttribute("branch-target-enforcement")) + F.addFnAttr(CFn.getFnAttribute("branch-target-enforcement")); + ARMGenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); } bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom( diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index db9320962e81..5fa912ae35d7 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -349,6 +349,8 @@ public: bool OutlineFromLinkOnceODRs) const override; outliner::OutlinedFunction getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; + void mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const override; outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override; bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, @@ -877,19 +879,23 @@ inline bool isLegalAddressImm(unsigned Opcode, int Imm, unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); switch (AddrMode) { case ARMII::AddrModeT2_i7: - return std::abs(Imm) < (((1 << 7) * 1) - 1); + return std::abs(Imm) < ((1 << 7) * 1); case ARMII::AddrModeT2_i7s2: - return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0; + return std::abs(Imm) < ((1 << 7) * 2) && Imm % 2 == 0; case ARMII::AddrModeT2_i7s4: - return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0; + return std::abs(Imm) < ((1 << 7) * 4) && Imm % 4 == 0; case ARMII::AddrModeT2_i8: - return std::abs(Imm) < (((1 << 8) * 1) - 1); - case ARMII::AddrMode2: - return std::abs(Imm) < (((1 << 12) * 1) - 1); - case ARMII::AddrModeT2_i12: - return Imm >= 0 && Imm < (((1 << 12) * 1) - 1); + return std::abs(Imm) < ((1 << 8) * 1); + case ARMII::AddrModeT2_i8pos: + return Imm >= 0 && Imm < ((1 << 8) * 1); + case ARMII::AddrModeT2_i8neg: + return Imm < 0 && -Imm < ((1 << 8) * 1); case ARMII::AddrModeT2_i8s4: - return std::abs(Imm) < (((1 << 8) * 4) - 1) && Imm % 4 == 0; + return std::abs(Imm) < ((1 << 8) * 4) && Imm % 4 == 0; + case ARMII::AddrModeT2_i12: + return Imm >= 0 && Imm < ((1 << 12) * 1); + case ARMII::AddrMode2: + return std::abs(Imm) < ((1 << 12) * 1); default: llvm_unreachable("Unhandled Addressing mode"); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp new file mode 100644 index 000000000000..1091c1f970fa --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBranchTargets.cpp @@ -0,0 +1,135 @@ +//===-- ARMBranchTargets.cpp -- Harden code using v8.1-M BTI extension -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass inserts BTI instructions at the start of every function and basic +// block which could be indirectly called. The hardware will (when enabled) +// trap when an indirect branch or call instruction targets an instruction +// which is not a valid BTI instruction. This is intended to guard against +// control-flow hijacking attacks. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-branch-targets" +#define ARM_BRANCH_TARGETS_NAME "ARM Branch Targets" + +namespace { +class ARMBranchTargets : public MachineFunctionPass { +public: + static char ID; + ARMBranchTargets() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return ARM_BRANCH_TARGETS_NAME; } + +private: + void addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, bool IsFirstBB); +}; +} // end anonymous namespace + +char ARMBranchTargets::ID = 0; + +INITIALIZE_PASS(ARMBranchTargets, "arm-branch-targets", ARM_BRANCH_TARGETS_NAME, + false, false) + +void ARMBranchTargets::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +FunctionPass *llvm::createARMBranchTargetsPass() { + return new ARMBranchTargets(); +} + +bool ARMBranchTargets::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + return false; + + LLVM_DEBUG(dbgs() << "********** ARM Branch Targets **********\n" + << "********** Function: " << MF.getName() << '\n'); + const ARMInstrInfo &TII = + *static_cast<const ARMInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + // LLVM does not consider basic blocks which are the targets of jump tables + // to be address-taken (the address can't escape anywhere else), but they are + // used for indirect branches, so need BTI instructions. + SmallPtrSet<const MachineBasicBlock *, 8> JumpTableTargets; + if (const MachineJumpTableInfo *JTI = MF.getJumpTableInfo()) + for (const MachineJumpTableEntry &JTE : JTI->getJumpTables()) + for (const MachineBasicBlock *MBB : JTE.MBBs) + JumpTableTargets.insert(MBB); + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + bool NeedBTI = false; + bool IsFirstBB = &MBB == &MF.front(); + + // Every function can potentially be called indirectly (even if it has + // static linkage, due to linker-generated veneers). + if (IsFirstBB) + NeedBTI = true; + + // If the block itself is address-taken, or is an exception landing pad, it + // could be indirectly branched to. + if (MBB.hasAddressTaken() || MBB.isEHPad() || JumpTableTargets.count(&MBB)) + NeedBTI = true; + + if (NeedBTI) { + addBTI(TII, MBB, IsFirstBB); + MadeChange = true; + } + } + + return MadeChange; +} + +/// Insert a BTI/PACBTI instruction into a given basic block \c MBB. If +/// \c IsFirstBB is true (meaning that this is the first BB in a function) try +/// to find a PAC instruction and replace it with PACBTI. Otherwise just insert +/// a BTI instruction. +/// The point of insertion is in the beginning of the BB, immediately after meta +/// instructions (such labels in exception handling landing pads). 
+void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, + bool IsFirstBB) { + // Which instruction to insert: BTI or PACBTI + unsigned OpCode = ARM::t2BTI; + + // Skip meta instructions, including EH labels + auto MBBI = llvm::find_if_not(MBB.instrs(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); + + // If this is the first BB in a function, check if it starts with a PAC + // instruction and in that case remove the PAC instruction. + if (IsFirstBB) { + if (MBBI != MBB.instr_end() && MBBI->getOpcode() == ARM::t2PAC) { + LLVM_DEBUG(dbgs() << "Removing a 'PAC' instr from BB '" << MBB.getName() + << "' to replace with PACBTI\n"); + OpCode = ARM::t2PACBTI; + auto NextMBBI = std::next(MBBI); + MBBI->eraseFromParent(); + MBBI = NextMBBI; + } + } + + LLVM_DEBUG(dbgs() << "Inserting a '" + << (OpCode == ARM::t2BTI ? "BTI" : "PACBTI") + << "' instr into BB '" << MBB.getName() << "'\n"); + // Finally, insert a new instruction (either PAC or PACBTI) + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 121558276c3e..c2ca4708c208 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -184,6 +184,9 @@ namespace { /// base address. DenseMap<int, int> JumpTableUserIndices; + // Maps a MachineBasicBlock to the number of jump table entries. + DenseMap<const MachineBasicBlock *, int> BlockJumpTableRefCount; + /// ImmBranch - One per immediate branch, keeping the machine instruction /// pointer, conditional or unconditional, the max displacement, /// and (if isCond is true) the corresponding unconditional branch @@ -274,7 +277,10 @@ namespace { unsigned &DeadSize, bool &CanDeleteLEA, bool &BaseRegKill); bool optimizeThumb2JumpTables(); + void fixupBTI(unsigned JTI, MachineBasicBlock &OldBB, + MachineBasicBlock &NewBB); - MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB, + MachineBasicBlock *adjustJTTargetBlockForward(unsigned JTI, + MachineBasicBlock *BB, MachineBasicBlock *JTBB); unsigned getUserOffset(CPUser&) const; @@ -518,6 +524,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { CPEntries.clear(); JumpTableEntryIndices.clear(); JumpTableUserIndices.clear(); + BlockJumpTableRefCount.clear(); ImmBranches.clear(); PushPopMIs.clear(); T2JumpTables.clear(); @@ -720,6 +727,14 @@ Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { return MCP->getConstants()[CPI].getAlign(); } +// Exception landing pads, blocks that have their address taken, and function +// entry blocks will always be (potential) indirect jump targets, regardless of +// whether or not they are referenced by jump tables. +static bool isAlwaysIndirectTarget(const MachineBasicBlock &MBB) { + return MBB.isEHPad() || MBB.hasAddressTaken() || + &MBB == &MBB.getParent()->front(); +} + /// scanFunctionJumpTables - Do a scan of the function, building up /// information about the sizes of each block and the locations of all /// the jump tables.
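The BlockJumpTableRefCount bookkeeping introduced above acts as a per-block reference count: blocks that are always valid indirect-branch targets get a saturated count so their BTI is never removed, while ordinary jump-table targets are counted per entry and may lose their BTI once the count drops to zero (see the hunks that follow). A minimal standalone sketch of that counting scheme, assuming simplified stand-ins (Block, JumpTable and countJumpTableRefs are illustrative names, not the MachineBasicBlock/MachineJumpTableInfo types the patch actually uses):

#include <climits>
#include <map>
#include <vector>

struct Block {
  bool IsEHPad = false;       // exception landing pad
  bool AddressTaken = false;  // block address escapes
  bool IsEntry = false;       // function entry block
};
using JumpTable = std::vector<const Block *>;

// Count how many jump-table entries reference each block. Blocks that are
// always indirect targets get a saturated count, so their BTI can never be
// dropped; other targets lose their BTI once the count reaches zero.
std::map<const Block *, int>
countJumpTableRefs(const std::vector<JumpTable> &Tables) {
  std::map<const Block *, int> RefCount;
  for (const JumpTable &JT : Tables)
    for (const Block *B : JT) {
      if (B->IsEHPad || B->AddressTaken || B->IsEntry)
        RefCount[B] = INT_MAX; // effectively infinite, never removable
      else
        ++RefCount[B];
    }
  return RefCount;
}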
@@ -730,6 +745,20 @@ void ARMConstantIslands::scanFunctionJumpTables() { (I.getOpcode() == ARM::t2BR_JT || I.getOpcode() == ARM::tBR_JTr)) T2JumpTables.push_back(&I); } + + if (!MF->getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + return; + + if (const MachineJumpTableInfo *JTI = MF->getJumpTableInfo()) + for (const MachineJumpTableEntry &JTE : JTI->getJumpTables()) + for (const MachineBasicBlock *MBB : JTE.MBBs) { + if (isAlwaysIndirectTarget(*MBB)) + // Set the reference count essentially to infinity, it will never + // reach zero and the BTI Instruction will never be removed. + BlockJumpTableRefCount[MBB] = std::numeric_limits<int>::max(); + else + ++BlockJumpTableRefCount[MBB]; + } } /// initializeFunctionInfo - Do the initial scan of the function, building up @@ -1219,9 +1248,9 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) { // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... @@ -1601,9 +1630,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex, BBUtils->adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. - for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) - if (UserMI->getOperand(i).isCPI()) { - UserMI->getOperand(i).setIndex(ID); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(ID); break; } @@ -2211,8 +2240,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { unsigned JTOffset = BBUtils->getOffsetOf(MI) + 4; const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; BBInfoVector &BBInfo = BBUtils->getBBInfo(); - for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { - MachineBasicBlock *MBB = JTBBs[j]; + for (MachineBasicBlock *MBB : JTBBs) { unsigned DstOffset = BBInfo[MBB->getNumber()].Offset; // Negative offset is not ok. FIXME: We should change BB layout to make // sure all the branches are forward. @@ -2405,17 +2433,16 @@ bool ARMConstantIslands::reorderThumb2JumpTables() { // and try to adjust them such that that's true. int JTNumber = MI->getParent()->getNumber(); const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; - for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { - MachineBasicBlock *MBB = JTBBs[j]; + for (MachineBasicBlock *MBB : JTBBs) { int DTNumber = MBB->getNumber(); if (DTNumber < JTNumber) { // The destination precedes the switch. Try to move the block forward // so we have a positive offset. 
MachineBasicBlock *NewBB = - adjustJTTargetBlockForward(MBB, MI->getParent()); + adjustJTTargetBlockForward(JTI, MBB, MI->getParent()); if (NewBB) - MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB); + MJTI->ReplaceMBBInJumpTable(JTI, MBB, NewBB); MadeChange = true; } } @@ -2424,8 +2451,40 @@ bool ARMConstantIslands::reorderThumb2JumpTables() { return MadeChange; } -MachineBasicBlock *ARMConstantIslands:: -adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { +void ARMConstantIslands::fixupBTI(unsigned JTI, MachineBasicBlock &OldBB, + MachineBasicBlock &NewBB) { + assert(isThumb2 && "BTI in Thumb1?"); + + // Insert a BTI instruction into NewBB + BuildMI(NewBB, NewBB.begin(), DebugLoc(), TII->get(ARM::t2BTI)); + + // Update jump table reference counts. + const MachineJumpTableInfo &MJTI = *MF->getJumpTableInfo(); + const MachineJumpTableEntry &JTE = MJTI.getJumpTables()[JTI]; + for (const MachineBasicBlock *MBB : JTE.MBBs) { + if (MBB != &OldBB) + continue; + --BlockJumpTableRefCount[MBB]; + ++BlockJumpTableRefCount[&NewBB]; + } + + // If the old basic block reference count dropped to zero, remove + // the BTI instruction at its beginning. + if (BlockJumpTableRefCount[&OldBB] > 0) + return; + + // Skip meta instructions + auto BTIPos = llvm::find_if_not(OldBB.instrs(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); + assert(BTIPos->getOpcode() == ARM::t2BTI && + "BasicBlock is mentioned in a jump table but does not start with BTI"); + if (BTIPos->getOpcode() == ARM::t2BTI) + BTIPos->eraseFromParent(); +} + +MachineBasicBlock *ARMConstantIslands::adjustJTTargetBlockForward( + unsigned JTI, MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target.
This is a very simple @@ -2483,6 +2542,9 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { NewBB->addSuccessor(BB); JTBB->replaceSuccessor(BB, NewBB); + if (MF->getInfo<ARMFunctionInfo>()->branchTargetEnforcement()) + fixupBTI(JTI, *BB, *NewBB); + ++NumJTInserted; return NewBB; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index a8f09969e948..7a35f252b22a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -125,9 +125,8 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI) { const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); - i != e; ++i) { - const MachineOperand &MO = OldMI.getOperand(i); + for (const MachineOperand &MO : + llvm::drop_begin(OldMI.operands(), Desc.getNumOperands())) { assert(MO.isReg() && MO.getReg()); if (MO.isUse()) UseMI.add(MO); @@ -2252,8 +2251,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, .add(predOps(ARMCC::AL)) .addReg(JumpReg, RegState::Kill); - for (int I = 1, E = MI.getNumOperands(); I != E; ++I) - NewCall->addOperand(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + NewCall->addOperand(MO); if (MI.isCandidateForCallSiteEntry()) MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr()); @@ -2524,17 +2523,21 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::LDRLIT_ga_pcrel: case ARM::LDRLIT_ga_pcrel_ldr: case ARM::tLDRLIT_ga_abs: + case ARM::t2LDRLIT_ga_pcrel: case ARM::tLDRLIT_ga_pcrel: { Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO1 = MI.getOperand(1); auto Flags = MO1.getTargetFlags(); const GlobalValue *GV = MO1.getGlobal(); - bool IsARM = - Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs; + bool IsARM = Opcode != ARM::tLDRLIT_ga_pcrel && + Opcode != ARM::tLDRLIT_ga_abs && + Opcode != ARM::t2LDRLIT_ga_pcrel; bool IsPIC = Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs; unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci; + if (Opcode == ARM::t2LDRLIT_ga_pcrel) + LDRLITOpc = ARM::t2LDRpci; unsigned PICAddOpc = IsARM ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) @@ -3065,7 +3068,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL)); } MIB.cloneMemRefs(MI); - for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); MI.eraseFromParent(); return true; } @@ -3080,8 +3084,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, Opcode == ARM::LOADDUAL ? RegState::Define : 0) .addReg(TRI->getSubReg(PairReg, ARM::gsub_1), Opcode == ARM::LOADDUAL ? 
RegState::Define : 0); - for (unsigned i = 1; i < MI.getNumOperands(); i++) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); MIB.add(predOps(ARMCC::AL)); MIB.cloneMemRefs(MI); MI.eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 025e43444f9c..b866cf952ff1 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -523,9 +523,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } // Determine spill area sizes. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -1317,11 +1317,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // Mark the D-register spill slots as properly aligned. Since MFI computes // stack slot layout backwards, this can actually mean that the d-reg stack // slot offsets can be wrong. The offset for d8 will always be correct. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned DNum = CSI[i].getReg() - ARM::D8; + for (const CalleeSavedInfo &I : CSI) { + unsigned DNum = I.getReg() - ARM::D8; if (DNum > NumAlignedDPRCS2Regs - 1) continue; - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered // registers will be 8-byte aligned. MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16)); @@ -1488,9 +1488,9 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, // Find the frame index assigned to d8. int D8SpillFI = 0; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) - if (CSI[i].getReg() == ARM::D8) { - D8SpillFI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) + if (I.getReg() == ARM::D8) { + D8SpillFI = I.getFrameIdx(); break; } @@ -1693,7 +1693,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, // Default 12 bit limit. 
break; case ARMII::AddrMode3: - case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8neg: Limit = std::min(Limit, (1U << 8) - 1); break; case ARMII::AddrMode5FP16: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 2b83a292db76..bb2859c766c2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3274,7 +3274,8 @@ bool ARMDAGToDAGISel::tryFP_TO_INT(SDNode *N, SDLoc dl) { return false; unsigned int ScalarBits = Type.getScalarSizeInBits(); - bool IsUnsigned = N->getOpcode() == ISD::FP_TO_UINT; + bool IsUnsigned = N->getOpcode() == ISD::FP_TO_UINT || + N->getOpcode() == ISD::FP_TO_UINT_SAT; SDNode *Node = N->getOperand(0).getNode(); // floating-point to fixed-point with one fractional bit gets turned into an @@ -3764,6 +3765,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { break; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT_SAT: + case ISD::FP_TO_SINT_SAT: if (tryFP_TO_INT(N, dl)) return; break; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp index e7e10ce07a44..33d115945614 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1016,6 +1016,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FP_EXTEND); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SETCC); + } + if (Subtarget->hasMVEFloatOps()) { + setTargetDAGCombine(ISD::FADD); } if (!Subtarget->hasFP64()) { @@ -10587,10 +10591,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; - for (SmallVectorImpl<MachineBasicBlock*>::iterator - II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { - LPadList.push_back(*II); - InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); + for (MachineBasicBlock *MBB : MBBList) { + LPadList.push_back(MBB); + InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end()); } } @@ -10879,9 +10882,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; - for (std::vector<MachineBasicBlock*>::iterator - I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { - MachineBasicBlock *CurMBB = *I; + for (MachineBasicBlock *CurMBB : LPadList) { if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } @@ -10943,9 +10944,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. - for (SmallVectorImpl<MachineBasicBlock*>::iterator - I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsEHPad(false); + for (MachineBasicBlock *MBBLPad : MBBLPads) + MBBLPad->setIsEHPad(false); // The instruction is gone now. 
MI.eraseFromParent(); @@ -11771,8 +11771,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); - for (unsigned i = 0; i < MI.getNumOperands(); ++i) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : MI.operands()) + MIB.add(MO); MI.eraseFromParent(); return BB; } @@ -13083,6 +13083,65 @@ static SDValue PerformVSELECTCombine(SDNode *N, return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); } +// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n +static SDValue PerformVSetCCToVCTPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + EVT VT = N->getValueType(0); + + if (!Subtarget->hasMVEIntegerOps() || + !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + if (CC == ISD::SETUGE) { + std::swap(Op0, Op1); + CC = ISD::SETULT; + } + + if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 || + Op0.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // Check first operand is BuildVector of 0,1,2,... + for (unsigned I = 0; I < VT.getVectorNumElements(); I++) { + if (!Op0.getOperand(I).isUndef() && + !(isa<ConstantSDNode>(Op0.getOperand(I)) && + Op0.getConstantOperandVal(I) == I)) + return SDValue(); + } + + // The second is a Splat of Op1S + SDValue Op1S = DCI.DAG.getSplatValue(Op1); + if (!Op1S) + return SDValue(); + + unsigned Opc; + switch (VT.getVectorNumElements()) { + case 2: + Opc = Intrinsic::arm_mve_vctp64; + break; + case 4: + Opc = Intrinsic::arm_mve_vctp32; + break; + case 8: + Opc = Intrinsic::arm_mve_vctp16; + break; + case 16: + Opc = Intrinsic::arm_mve_vctp8; + break; + default: + return SDValue(); + } + + SDLoc DL(N); + return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DCI.DAG.getConstant(Opc, DL, MVT::i32), + DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32)); +} + static SDValue PerformABSCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -13427,6 +13486,26 @@ bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } +bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) + return false; + + switch (FPVT.getSimpleVT().SimpleTy) { + case MVT::f16: + return Subtarget->hasVFP2Base(); + case MVT::f32: + return Subtarget->hasVFP2Base(); + case MVT::f64: + return Subtarget->hasFP64(); + case MVT::v4f32: + case MVT::v8f16: + return Subtarget->hasMVEFloatOps(); + default: + return false; + } +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -14485,6 +14564,52 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// Check that N is CMPZ(CSINC(0, 0, CC, X)), return X if valid. 
+static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { + if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) + return SDValue(); + SDValue CSInc = Cmp->getOperand(0); + if (CSInc.getOpcode() != ARMISD::CSINC || + !isNullConstant(CSInc.getOperand(0)) || + !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) + return SDValue(); + CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2); + return CSInc.getOperand(3); +} + +static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) { + // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in + // t92: glue = ARMISD::CMPZ t74, 0 + // t93: i32 = ARMISD::CSINC 0, 0, 1, t92 + // t96: glue = ARMISD::CMPZ t93, 0 + // t114: i32 = ARMISD::CSINV 0, 0, 0, t96 + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N, Cond)) + if (Cond == ARMCC::EQ) + return C; + return SDValue(); +} + +static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) { + // Fold away an unneccessary CMPZ/CSINC + // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) -> + // if C1==EQ -> CSXYZ A, B, C2, D + // if C1==NE -> CSXYZ A, B, NOT(C2), D + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) { + if (N->getConstantOperandVal(2) == ARMCC::EQ) + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getConstant(Cond, SDLoc(N), MVT::i32), C); + if (N->getConstantOperandVal(2) == ARMCC::NE) + return DAG.getNode( + N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C); + } + return SDValue(); +} + /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, @@ -16411,6 +16536,42 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, return FixConv; } +static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEFloatOps()) + return SDValue(); + + // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x) + // The second form can be more easily turned into a predicated vadd, and + // possibly combined into a fma to become a predicated vfma. + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // The identity element for a fadd is -0.0, which these VMOV's represent. + auto isNegativeZeroSplat = [&](SDValue Op) { + if (Op.getOpcode() != ISD::BITCAST || + Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM) + return false; + if (VT == MVT::v4f32 && Op.getOperand(0).getConstantOperandVal(0) == 1664) + return true; + if (VT == MVT::v8f16 && Op.getOperand(0).getConstantOperandVal(0) == 2688) + return true; + return false; + }; + + if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT) + std::swap(Op0, Op1); + + if (Op1.getOpcode() != ISD::VSELECT || + !isNegativeZeroSplat(Op1.getOperand(2))) + return SDValue(); + SDValue FAdd = + DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), N->getFlags()); + return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0); +} + /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. 
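The PerformFAddVSelectCombine hunk above rests on -0.0 being the identity element of fadd, so selecting -0.0 for the inactive lanes leaves those lanes equal to x after the add. A small standalone check of the scalar analogue of that rewrite, as a sketch only (plain C++; the helper names before/after are illustrative and not taken from the patch):

#include <cassert>

// Scalar analogue of the lane-wise rewrite:
//   x + (c ? y : -0.0f)  ==  c ? (x + y) : x
// because x + (-0.0f) == x for every x, including x == +0.0f.
static float before(bool c, float x, float y) { return x + (c ? y : -0.0f); }
static float after(bool c, float x, float y) { return c ? (x + y) : x; }

int main() {
  const float Xs[] = {0.0f, -0.0f, 1.5f, -3.25f};
  const float Ys[] = {0.0f, 2.0f, -7.5f};
  for (float x : Xs)
    for (float y : Ys)
      for (bool c : {false, true})
        assert(before(c, x, y) == after(c, x, y));
  return 0;
}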
@@ -17049,18 +17210,6 @@ static SDValue PerformShiftCombine(SDNode *N, const ARMSubtarget *ST) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { - // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high - // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. - SDValue N1 = N->getOperand(1); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { - SDValue N0 = N->getOperand(0); - if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && - DAG.MaskedValueIsZero(N0.getOperand(0), - APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); - } - } if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && N->getOperand(0)->getOpcode() == ISD::AND && @@ -18173,6 +18322,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); + case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -18205,6 +18355,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FADD: + return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -18228,6 +18380,12 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); + case ARMISD::CMPZ: + return PerformCMPZCombine(N, DCI.DAG); + case ARMISD::CSINC: + case ARMISD::CSINV: + case ARMISD::CSNEG: + return PerformCSETCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI, Subtarget); case ARMISD::VLD1DUP: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h index 0fddd58e178e..e3b422358cae 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h @@ -736,6 +736,8 @@ class VectorType; bool preferIncOfAddToSubOfNot(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td index de351372abf2..ff5afd787c82 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -103,15 +103,17 @@ def AddrModeT1_4 : AddrMode<9>; def AddrModeT1_s : AddrMode<10>; def AddrModeT2_i12 : AddrMode<11>; def AddrModeT2_i8 : AddrMode<12>; -def AddrModeT2_so : AddrMode<13>; -def AddrModeT2_pc : AddrMode<14>; -def AddrModeT2_i8s4 : AddrMode<15>; -def AddrMode_i12 : AddrMode<16>; -def AddrMode5FP16 : AddrMode<17>; -def AddrModeT2_ldrex : AddrMode<18>; -def AddrModeT2_i7s4 : AddrMode<19>; -def AddrModeT2_i7s2 : AddrMode<20>; -def AddrModeT2_i7 : AddrMode<21>; +def AddrModeT2_i8pos : AddrMode<13>; +def AddrModeT2_i8neg : AddrMode<14>; +def 
AddrModeT2_so : AddrMode<15>; +def AddrModeT2_pc : AddrMode<16>; +def AddrModeT2_i8s4 : AddrMode<17>; +def AddrMode_i12 : AddrMode<18>; +def AddrMode5FP16 : AddrMode<19>; +def AddrModeT2_ldrex : AddrMode<20>; +def AddrModeT2_i7s4 : AddrMode<21>; +def AddrModeT2_i7s2 : AddrMode<22>; +def AddrModeT2_i7 : AddrMode<23>; // Load / store index mode. class IndexMode<bits<2> val> { @@ -1392,9 +1394,12 @@ class T2I<dag oops, dag iops, InstrItinClass itin, class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>; -class T2Ii8<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>; +class T2Ii8p<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8pos, 4, itin, opc, asm, "", pattern>; +class T2Ii8n<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8neg, 4, itin, opc, asm, "", pattern>; class T2Iso<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td index 7d0bc756e882..1c1db473f866 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -420,6 +420,12 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; }], hi16>; +// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise +def top16Zero: PatLeaf<(i32 GPR:$src), [{ + return !SDValue(N,0)->getValueType(0).isVector() && + CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); + }]>; + class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; @@ -4748,6 +4754,8 @@ def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)), (REV16 (LDRH addrmode3:$addr))>; def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr), (STRH (REV16 GPR:$Rn), addrmode3:$addr)>; +def : ARMV6Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (REV16 GPR:$Rn)>; let AddedComplexity = 5 in def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td index 697730037277..f53814a80e01 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3621,21 +3621,24 @@ class MVE_VMUL_fp<string iname, string suffix, bits<2> size, list<dag> pattern=[ let validForTailPredication = 1; } -multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { +multiclass MVE_VMULT_fp_m<string iname, MVEVectorVTInfo VTI, SDNode Op, + Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME), IdentityVec>; } } -multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI> - : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated>; +multiclass MVE_VMUL_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VMULT_fp_m<"vmul", VTI, fmul, int_arm_mve_mul_predicated, IdentityVec>; + +def ARMimmOneF: PatLeaf<(bitconvert (v4f32 (ARMvmovFPImm (i32 112))))>; // 1.0 float +def ARMimmOneH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2620))))>; // 1.0 half -defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32>; -defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16>; +defm MVE_VMULf32 : MVE_VMUL_fp_m<MVE_v4f32, ARMimmOneF>; +defm MVE_VMULf16 : MVE_VMUL_fp_m<MVE_v8f16, ARMimmOneH>; class MVE_VCMLA<string suffix, bits<2> size> : MVEFloatArithNeon<"vcmla", suffix, size{1}, (outs MQPR:$Qd), @@ -3747,27 +3750,30 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size, 0, 1, bit_21> { let validForTailPredication = 1; } defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), IdentityVec>; } } -multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI> - : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated>; -multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI> - : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated>; +multiclass MVE_VADD_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VADDSUB_fp_m<"vadd", 0, VTI, fadd, int_arm_mve_add_predicated, IdentityVec>; +multiclass MVE_VSUB_fp_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> + : MVE_VADDSUB_fp_m<"vsub", 1, VTI, fsub, int_arm_mve_sub_predicated, IdentityVec>; -defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32>; -defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16>; +def ARMimmMinusZeroF: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 1664))))>; // -0.0 float +def ARMimmMinusZeroH: PatLeaf<(bitconvert (v8i16 (ARMvmovImm (i32 2688))))>; // -0.0 half -defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32>; -defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16>; +defm MVE_VADDf32 : MVE_VADD_fp_m<MVE_v4f32, ARMimmMinusZeroF>; +defm MVE_VADDf16 : MVE_VADD_fp_m<MVE_v8f16, ARMimmMinusZeroH>; + +defm MVE_VSUBf32 : MVE_VSUB_fp_m<MVE_v4f32, ARMimmAllZerosV>; +defm MVE_VSUBf16 : MVE_VSUB_fp_m<MVE_v8f16, ARMimmAllZerosV>; class MVE_VCADD<string suffix, bits<2> size, string cstr=""> : MVEFloatArithNeon<"vcadd", suffix, size{1}, (outs MQPR:$Qd), @@ -5373,22 +5379,22 @@ defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract, VTI.Size>; defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? 
), - !cast<Instruction>(NAME)>; + !cast<Instruction>(NAME), IdentityVec>; } let Predicates = [HasMVEFloat] in { defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd, - int_arm_mve_add_predicated>; + int_arm_mve_add_predicated, ARMimmMinusZeroF>; defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd, - int_arm_mve_add_predicated>; + int_arm_mve_add_predicated, ARMimmMinusZeroH>; defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub, - int_arm_mve_sub_predicated>; + int_arm_mve_sub_predicated, ARMimmAllZerosV>; defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub, - int_arm_mve_sub_predicated>; + int_arm_mve_sub_predicated, ARMimmAllZerosV>; } class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, @@ -5567,16 +5573,16 @@ defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>; defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>; defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; -multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> { +multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI, SDPatternOperator IdentityVec> { let validForTailPredication = 1 in def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>; defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ), - !cast<Instruction>(NAME)>; + !cast<Instruction>(NAME), IdentityVec>; } let Predicates = [HasMVEFloat] in { - defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16>; - defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32>; + defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16, ARMimmOneH>; + defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32, ARMimmOneF>; } class MVE_VFMAMLA_qr<string iname, string suffix, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td index bf717a4056e9..f09ad8167600 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1576,6 +1576,8 @@ def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), (tREV16 (tLDRHi t_addrmode_is2:$addr))>; def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)), (tREV16 (tLDRHr t_addrmode_rr:$addr))>; +def : T1Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (tREV16 tGPR:$Rn)>; def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), t_addrmode_is2:$addr), (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td index 783db9dde17f..4471317f4ea4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -1191,9 +1191,9 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, let DecoderMethod = "DecodeT2LoadImm12"; } - def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, - opc, "\t$Rt, $addr", - [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>, + def i8 : T2Ii8n <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>, Sched<[WriteLd]> { bits<4> Rt; bits<13> addr; @@ -1284,9 +1284,9 @@ multiclass T2I_st<bits<2> opcod, string opc, let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm } - def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii, - opc, "\t$Rt, $addr", - [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>, + def i8 : T2Ii8n <(outs), (ins 
target:$Rt, t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>, Sched<[WriteST]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0000; @@ -1580,8 +1580,8 @@ def t2LDR_POST_imm : t2AsmPseudo<"ldr${p}.w $Rt, $Rn, $imm", // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110). // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, - "\t$Rt, $addr", []>, Sched<[WriteLd]> { + : T2Ii8p<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []>, Sched<[WriteLd]> { bits<4> Rt; bits<13> addr; let Inst{31-27} = 0b11111; @@ -1747,8 +1747,8 @@ def t2STR_POST_imm : t2AsmPseudo<"str${p}.w $Rt, $Rn, $imm", // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc, - "\t$Rt, $addr", []>, Sched<[WriteST]> { + : T2Ii8p<(outs), (ins rGPR:$Rt, t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []>, Sched<[WriteST]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; let Inst{24} = 0; // not signed @@ -1851,8 +1851,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { let DecoderMethod = "DecodeT2LoadImm12"; } - def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, - "\t$addr", + def i8 : T2Ii8n<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, + "\t$addr", [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>, Sched<[WritePreLd]> { let Inst{31-25} = 0b1111100; @@ -2926,18 +2926,11 @@ let AddedComplexity = 1 in def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), (t2BICri rGPR:$src, t2_so_imm_not:$imm)>; -// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise -def top16Zero: PatLeaf<(i32 rGPR:$src), [{ - return !SDValue(N,0)->getValueType(0).isVector() && - CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16)); - }]>; - // so_imm_notSext is needed instead of so_imm_not, as the value of imm // will match the extended, not the original bitWidth for $src. def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm), (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>; - // FIXME: Disable this pattern on Darwin to workaround an assembler bug. 
def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm), (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>, @@ -3283,6 +3276,9 @@ def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>, Sched<[WriteALU]>; +def : T2Pat<(srl (bswap top16Zero:$Rn), (i32 16)), + (t2REV16 rGPR:$Rn)>; + def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "revsh", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>, @@ -4059,6 +4055,8 @@ def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm", bits<8> imm; let Inst{31-3} = 0b11110011101011111000000000000; let Inst{7-0} = imm; + + let DecoderMethod = "DecodeT2HintSpaceInstruction"; } def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>; @@ -4079,6 +4077,11 @@ def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> { def : t2InstAlias<"csdb$p.w", (t2HINT 20, pred:$p), 0>; def : t2InstAlias<"csdb$p", (t2HINT 20, pred:$p), 1>; +def : t2InstAlias<"pacbti$p r12,lr,sp", (t2HINT 13, pred:$p), 1>; +def : t2InstAlias<"bti$p", (t2HINT 15, pred:$p), 1>; +def : t2InstAlias<"pac$p r12,lr,sp", (t2HINT 29, pred:$p), 1>; +def : t2InstAlias<"aut$p r12,lr,sp", (t2HINT 45, pred:$p), 1>; + def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", [(int_arm_dbg imm0_15:$opt)]> { bits<4> opt; @@ -4254,6 +4257,19 @@ def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>; +let hasNoSchedulingInfo = 1 in { +def t2LDRLIT_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iLoadiALU, + [(set rGPR:$dst, + (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsThumb, HasV8MBaseline, DontUseMovtInPic]>; +} + +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (t2LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, HasV8MBaseline, DontUseMovtInPic]>; + // Pseudo instruction that combines ldr from constpool and add pc. This should // be expanded into two instructions late to allow if-conversion and // scheduling. 
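The (srl (bswap top16Zero:$Rn), (i32 16)) -> REV16 patterns added above for ARM, Thumb1 and Thumb2 rest on a simple identity: when the upper 16 bits of the source are known zero, a full byte swap followed by lsr #16 produces the same value as reversing the bytes within each halfword. A standalone check, with bswap and rev16 modelled by hand (illustrative helpers, not LLVM code):

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0x0000FF00u) | ((X << 8) & 0x00FF0000u) | (X << 24);
}

// Model of REV16: reverse the bytes within each 16-bit half.
static uint32_t rev16(uint32_t X) {
  return ((X << 8) & 0xFF00FF00u) | ((X >> 8) & 0x00FF00FFu);
}

int main() {
  // With the top 16 bits zero, "bswap then lsr #16" equals rev16.
  const uint32_t TopZero[] = {0x0000ABCDu, 0x00000001u, 0x0000FF00u, 0u};
  for (uint32_t X : TopZero)
    assert((bswap32(X) >> 16) == rev16(X));

  // Without the top16Zero guard the equivalence does not hold.
  assert((bswap32(0x1234ABCDu) >> 16) != rev16(0x1234ABCDu));
  return 0;
}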
@@ -5607,6 +5623,15 @@ let Predicates = [HasV8_1MMainline] in { defm : CSPats<ARMcsinv, t2CSINV>; defm : CSPats<ARMcsneg, t2CSNEG>; + def : T2Pat<(ARMcmov (i32 1), (i32 0), cmovpred:$imm), + (t2CSINC ZR, ZR, imm0_31:$imm)>; + def : T2Pat<(ARMcmov (i32 -1), (i32 0), cmovpred:$imm), + (t2CSINV ZR, ZR, imm0_31:$imm)>; + def : T2Pat<(ARMcmov (i32 0), (i32 1), cmovpred:$imm), + (t2CSINC ZR, ZR, (inv_cond_XFORM imm:$imm))>; + def : T2Pat<(ARMcmov (i32 0), (i32 -1), cmovpred:$imm), + (t2CSINV ZR, ZR, (inv_cond_XFORM imm:$imm))>; + multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> { def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm), (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>; @@ -5636,3 +5661,78 @@ let Predicates = [HasV8_1MMainline] in { def : InstAlias<"cneg\t$Rd, $Rn, $fcond", (t2CSNEG rGPR:$Rd, GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rn, pred_noal_inv:$fcond)>; } + + +// PACBTI +let Predicates = [IsThumb2, HasV8_1MMainline, HasPACBTI] in { +def t2PACG : V8_1MI<(outs rGPR:$Rd), + (ins pred:$p, GPRnopc:$Rn, GPRnopc:$Rm), + AddrModeNone, NoItinerary, "pacg${p}", "$Rd, $Rn, $Rm", "", []> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111110110110; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b1111; + let Inst{11-8} = Rd; + let Inst{7-4} = 0b0000; + let Inst{3-0} = Rm; +} + +let hasSideEffects = 1 in { +class PACBTIAut<dag iops, string asm, bit b> + : V8_1MI<(outs), iops, + AddrModeNone, NoItinerary, asm, "$Ra, $Rn, $Rm", "", []> { + bits<4> Ra; + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111110110101; + let Inst{19-16} = Rn; + let Inst{15-12} = Ra; + let Inst{11-5} = 0b1111000; + let Inst{4} = b; + let Inst{3-0} = Rm; +} +} + +def t2AUTG : PACBTIAut<(ins pred:$p, GPRnosp:$Ra, GPRnopc:$Rn, GPRnopc:$Rm), + "autg${p}", 0>; + +let isBranch = 1, isTerminator = 1, isIndirectBranch = 1 in { + def t2BXAUT : PACBTIAut<(ins pred:$p, GPRnosp:$Ra, rGPR:$Rn, GPRnopc:$Rm), + "bxaut${p}", 1>; +} +} + + +class PACBTIHintSpaceInst<string asm, string ops, bits<8> imm> + : V8_1MI<(outs), (ins), AddrModeNone, NoItinerary, asm, ops, "", []> { + let Inst{31-8} = 0b111100111010111110000000; + let Inst{7-0} = imm; + + let Unpredictable{19-16} = 0b1111; + let Unpredictable{13-11} = 0b101; + + let DecoderMethod = "DecodeT2HintSpaceInstruction"; +} + +class PACBTIHintSpaceNoOpsInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "", imm>; + +class PACBTIHintSpaceDefInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "r12, lr, sp", imm> { + let Defs = [R12]; + let Uses = [LR, SP]; +} + +class PACBTIHintSpaceUseInst<string asm, bits<8> imm> + : PACBTIHintSpaceInst<asm, "r12, lr, sp", imm> { + let Uses = [R12, LR, SP]; +} + +def t2PAC : PACBTIHintSpaceDefInst<"pac", 0b00011101>; +def t2PACBTI : PACBTIHintSpaceDefInst<"pacbti", 0b00001101>; +def t2BTI : PACBTIHintSpaceNoOpsInst<"bti", 0b00001111>; +def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { + let hasSideEffects = 1; +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 6e259b1baf97..3b10c60a0654 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1298,8 +1298,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Can't use an updating ld/st if the base register is also a dest // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. 
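For reference, the PACBTI additions above and below agree on the hint-space immediates: the t2InstAlias entries map pacbti, bti, pac and aut to hints 13, 15, 29 and 45, the t2PACBTI/t2BTI/t2PAC/t2AUT encodings use the matching 0b00001101, 0b00001111, 0b00011101 and 0b00101101, and the disassembler hook added later tests the same values as 0x0D, 0x0F, 0x1D and 0x2D. A tiny illustrative mapping; the helper name is made up, the mnemonics and immediates come from the patch:

#include <cassert>
#include <cstdint>
#include <string>

// Hypothetical helper: which mnemonic a given hint-space immediate selects.
static std::string hintMnemonic(uint8_t Imm) {
  switch (Imm) {
  case 0x0D: return "pacbti"; // 0b00001101 == 13
  case 0x0F: return "bti";    // 0b00001111 == 15
  case 0x1D: return "pac";    // 0b00011101 == 29
  case 0x2D: return "aut";    // 0b00101101 == 45
  default:   return "hint";   // everything else stays a generic hint
  }
}

int main() {
  assert(hintMnemonic(13) == "pacbti");
  assert(hintMnemonic(15) == "bti");
  assert(hintMnemonic(29) == "pac");
  assert(hintMnemonic(45) == "aut");
  assert(hintMnemonic(20) == "hint"); // e.g. csdb keeps the plain hint form here
  return 0;
}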
- for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) - if (MI->getOperand(i).getReg() == Base) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MO.getReg() == Base) return false; int Bytes = getLSMultipleTransferSize(MI); @@ -1326,8 +1326,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { return false; bool HighRegsUsed = false; - for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) - if (MI->getOperand(i).getReg() >= ARM::R8) { + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) + if (MO.getReg() >= ARM::R8) { HighRegsUsed = true; break; } @@ -1350,8 +1350,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { .addImm(Pred).addReg(PredReg); // Transfer the rest of operands. - for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum) - MIB.add(MI->getOperand(OpNum)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 3)) + MIB.add(MO); // Transfer memoperands. MIB.setMemRefs(MI->memoperands()); @@ -2119,9 +2119,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb1 = AFI->isThumbFunction() && !isThumb2; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); @@ -2710,13 +2708,13 @@ static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm, if (isLegalAddressImm(Opcode, Imm, TII)) return true; - // We can convert AddrModeT2_i12 to AddrModeT2_i8. + // We can convert AddrModeT2_i12 to AddrModeT2_i8neg. const MCInstrDesc &Desc = TII->get(Opcode); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); switch (AddrMode) { case ARMII::AddrModeT2_i12: CodesizeEstimate += 1; - return std::abs(Imm) < (((1 << 8) * 1) - 1); + return Imm < 0 && -Imm < ((1 << 8) * 1); } return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index 507c3e69b3a4..308d5e7889f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -13,8 +13,63 @@ using namespace llvm; void ARMFunctionInfo::anchor() {} +static bool GetBranchTargetEnforcement(MachineFunction &MF) { + const auto &Subtarget = MF.getSubtarget<ARMSubtarget>(); + if (!Subtarget.isMClass() || !Subtarget.hasV7Ops()) + return false; + + const Function &F = MF.getFunction(); + if (!F.hasFnAttribute("branch-target-enforcement")) { + if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag("branch-target-enforcement"))) + return BTE->getZExtValue(); + return false; + } + + const StringRef BTIEnable = + F.getFnAttribute("branch-target-enforcement").getValueAsString(); + assert(BTIEnable.equals_insensitive("true") || + BTIEnable.equals_insensitive("false")); + return BTIEnable.equals_insensitive("true"); +} + +// The pair returns values for the ARMFunctionInfo members +// SignReturnAddress and SignReturnAddressAll respectively. 
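A rough standalone sketch of that mapping, limited to the attribute values the assert in the helper below accepts ("none", "non-leaf", "all"); the function name is illustrative and the module-flag fallback is omitted:

#include <cassert>
#include <string>
#include <utility>

// first -> SignReturnAddress, second -> SignReturnAddressAll
static std::pair<bool, bool> signReturnAddressScope(const std::string &Scope) {
  if (Scope == "none")
    return {false, false}; // never sign the return address
  if (Scope == "all")
    return {true, true};   // sign in every function, whether or not LR is saved
  assert(Scope == "non-leaf");
  return {true, false};    // sign only where LR ends up spilled
}

int main() {
  assert(signReturnAddressScope("none") == std::make_pair(false, false));
  assert(signReturnAddressScope("non-leaf") == std::make_pair(true, false));
  assert(signReturnAddressScope("all") == std::make_pair(true, true));
  return 0;
}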
+static std::pair<bool, bool> GetSignReturnAddress(const Function &F) { + if (!F.hasFnAttribute("sign-return-address")) { + const Module &M = *F.getParent(); + if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address"))) { + if (Sign->getZExtValue()) { + if (const auto *All = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address-all"))) + return {true, All->getZExtValue()}; + return {true, false}; + } + } + return {false, false}; + } + + StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString(); + if (Scope.equals("none")) + return {false, false}; + + if (Scope.equals("all")) + return {true, true}; + + assert(Scope.equals("non-leaf")); + return {true, false}; +} + ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()), hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()), IsCmseNSEntry(MF.getFunction().hasFnAttribute("cmse_nonsecure_entry")), - IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")) {} + IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")), + BranchTargetEnforcement(GetBranchTargetEnforcement(MF)) { + + const auto &Subtarget = MF.getSubtarget<ARMSubtarget>(); + if (Subtarget.isMClass() && Subtarget.hasV7Ops()) + std::tie(SignReturnAddress, SignReturnAddressAll) = + GetSignReturnAddress(MF.getFunction()); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 851655284060..4077fc058217 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -142,6 +142,17 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// con/destructors). bool PreservesR0 = false; + /// True if the function should sign its return address. + bool SignReturnAddress = false; + + /// True if the function should sign its return address, even if LR is not + /// saved. + bool SignReturnAddressAll = false; + + /// True if BTI instructions should be placed at potential indirect jump + /// destinations.
+ bool BranchTargetEnforcement = false; + public: ARMFunctionInfo() = default; @@ -268,6 +279,20 @@ public: void setPreservesR0() { PreservesR0 = true; } bool getPreservesR0() const { return PreservesR0; } + + bool shouldSignReturnAddress() const { + return shouldSignReturnAddress(LRSpilled); + } + + bool shouldSignReturnAddress(bool SpillsLR) const { + if (!SignReturnAddress) + return false; + if (SignReturnAddressAll) + return true; + return SpillsLR; + } + + bool branchTargetEnforcement() const { return BranchTargetEnforcement; } }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td index 2dc097566d14..c0dc6a363471 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td @@ -107,6 +107,8 @@ def HasRAS : Predicate<"Subtarget->hasRAS()">, AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasLOB : Predicate<"Subtarget->hasLOB()">, AssemblerPredicate<(all_of FeatureLOB), "lob">; +def HasPACBTI : Predicate<"Subtarget->hasPACBTI()">, + AssemblerPredicate<(all_of FeaturePACBTI), "pacbti">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<(all_of FeatureFP16),"half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td index 9752b3166b45..760a5a5a20cf 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -277,6 +277,16 @@ def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV) let DiagnosticString = "operand must be a register in range [r0, r14] or apsr_nzcv"; } +// GPRs without the SP register. Used for BXAUT and AUTG +def GPRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, PC)> { + let AltOrders = [(add LR, GPRnosp), (trunc GPRnosp, 8), + (add (trunc GPRnosp, 8), R12, LR, (shl GPRnosp, 8))]; + let AltOrderSelect = [{ + return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF); + }]; + let DiagnosticString = "operand must be a register in range [r0, r12] or LR or PC"; +} + // GPRs without the PC and SP registers but with APSR. Used by CLRM instruction. def GPRwithAPSRnosp : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR)> { let isAllocatable = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h index 5e1217b6a468..d51a888c951f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h @@ -373,6 +373,8 @@ protected: /// HasLOB - if true, the processor supports the Low Overhead Branch extension bool HasLOB = false; + bool HasPACBTI = false; + /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are /// particularly effective at zeroing a VFP register.
bool HasZeroCycleZeroing = false; @@ -671,6 +673,7 @@ public: bool hasCRC() const { return HasCRC; } bool hasRAS() const { return HasRAS; } bool hasLOB() const { return HasLOB; } + bool hasPACBTI() const { return HasPACBTI; } bool hasVirtualization() const { return HasVirtualization; } bool useNEONForSinglePrecisionFP() const { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td index f21c7f0246f9..c03db15d1041 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSystemRegister.td @@ -106,6 +106,24 @@ def : MClassSysReg<0, 0, 1, 0x894, "control_ns">; def : MClassSysReg<0, 0, 1, 0x898, "sp_ns">; } +let Requires = [{ {ARM::FeaturePACBTI} }] in { +def : MClassSysReg<0, 0, 1, 0x820, "pac_key_p_0">; +def : MClassSysReg<0, 0, 1, 0x821, "pac_key_p_1">; +def : MClassSysReg<0, 0, 1, 0x822, "pac_key_p_2">; +def : MClassSysReg<0, 0, 1, 0x823, "pac_key_p_3">; +def : MClassSysReg<0, 0, 1, 0x824, "pac_key_u_0">; +def : MClassSysReg<0, 0, 1, 0x825, "pac_key_u_1">; +def : MClassSysReg<0, 0, 1, 0x826, "pac_key_u_2">; +def : MClassSysReg<0, 0, 1, 0x827, "pac_key_u_3">; +def : MClassSysReg<0, 0, 1, 0x8a0, "pac_key_p_0_ns">; +def : MClassSysReg<0, 0, 1, 0x8a1, "pac_key_p_1_ns">; +def : MClassSysReg<0, 0, 1, 0x8a2, "pac_key_p_2_ns">; +def : MClassSysReg<0, 0, 1, 0x8a3, "pac_key_p_3_ns">; +def : MClassSysReg<0, 0, 1, 0x8a4, "pac_key_u_0_ns">; +def : MClassSysReg<0, 0, 1, 0x8a5, "pac_key_u_1_ns">; +def : MClassSysReg<0, 0, 1, 0x8a6, "pac_key_u_2_ns">; +def : MClassSysReg<0, 0, 1, 0x8a7, "pac_key_u_3_ns">; +} // Banked Registers // diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 833c7effd31c..0b314ac2a41e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -92,6 +92,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); + initializeARMBranchTargetsPass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -571,6 +572,7 @@ void ARMPassConfig::addPreEmitPass() { } void ARMPassConfig::addPreEmitPass2() { + addPass(createARMBranchTargetsPass()); addPass(createARMConstantIslandPass()); addPass(createARMLowOverheadLoopsPass()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 88de84a4fd78..602c6745d310 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -334,8 +334,9 @@ InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, } // Checks whether Inst is part of a min(max()) or max(min()) pattern -// that will match to an SSAT instruction -static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { +// that will match to an SSAT instruction. Returns the instruction being +// saturated, or null if no saturation pattern was found. 
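The FP saturation check introduced below keys on the immediate APInt::getHighBitsSet(64, 33), which is just the i32 signed minimum sign-extended to 64 bits, and on the min/max clamp shape that a saturating fp-to-int conversion replaces. A quick standalone sanity check of both, in plain C++ with no LLVM types:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  // High 33 bits of a 64-bit word set == INT32_MIN widened to 64 bits.
  const uint64_t High33 = ~uint64_t(0) << 31;
  assert(High33 == 0xFFFFFFFF80000000ULL);
  assert(static_cast<int64_t>(High33) ==
         static_cast<int64_t>(std::numeric_limits<int32_t>::min()));

  // The min(max()) clamp that a saturating fp-to-i32 conversion folds away.
  const int64_t Lo = std::numeric_limits<int32_t>::min();
  const int64_t Hi = std::numeric_limits<int32_t>::max();
  auto ClampToI32 = [&](int64_t V) { return std::min(std::max(V, Lo), Hi); };
  assert(ClampToI32(int64_t(1) << 40) == Hi);
  assert(ClampToI32(-(int64_t(1) << 40)) == Lo);
  assert(ClampToI32(42) == 42);
  return 0;
}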
+static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { Value *LHS, *RHS; ConstantInt *C; SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; @@ -358,12 +359,27 @@ static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { return false; }; - if (isSSatMin(Inst->getOperand(1)) || - (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || - isSSatMin(*(++Inst->user_begin()))))) - return true; + if (isSSatMin(Inst->getOperand(1))) + return cast<Instruction>(Inst->getOperand(1))->getOperand(1); + if (Inst->hasNUses(2) && + (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin())))) + return Inst->getOperand(1); } - return false; + return nullptr; +} + +// Look for a FP Saturation pattern, where the instruction can be simplified to +// a fptosi.sat. max(min(fptosi)). The constant in this case is always free. +static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { + if (Imm.getBitWidth() != 64 || + Imm != APInt::getHighBitsSet(64, 33)) // -2147483648 + return false; + Value *FP = isSSATMinMaxPattern(Inst, Imm); + if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse()) + FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm); + if (!FP) + return false; + return isa<FPToSIInst>(FP); } InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, @@ -423,6 +439,9 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, return 0; } + if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm)) + return 0; + // We can convert <= -1 to < 0, which is generally quite cheap. if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) { ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 64d2e1bfa9b2..39f407ba7149 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6429,15 +6429,17 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" || Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" || Mnemonic == "bxns" || Mnemonic == "blxns" || - Mnemonic == "vdot" || Mnemonic == "vmmla" || + Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "vudot" || Mnemonic == "vsdot" || Mnemonic == "vcmla" || Mnemonic == "vcadd" || Mnemonic == "vfmal" || Mnemonic == "vfmsl" || - Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" || - Mnemonic == "csel" || Mnemonic == "csinc" || + Mnemonic == "wls" || Mnemonic == "le" || Mnemonic == "dls" || + Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || - Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || - Mnemonic == "csetm") + Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || + Mnemonic == "csetm" || + Mnemonic == "aut" || Mnemonic == "pac" || Mnemonic == "pacbti" || + Mnemonic == "bti") return Mnemonic; // First, split out any predication code. 
Ignore mnemonics we know aren't @@ -6581,9 +6583,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || Mnemonic == "csetm" || - Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || (hasCDE() && MS.isCDEInstr(Mnemonic) && !MS.isITPredicableCDEInstr(Mnemonic)) || + Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || + Mnemonic == "pac" || Mnemonic == "pacbti" || Mnemonic == "aut" || + Mnemonic == "bti" || (hasMVE() && (Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") || Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") || @@ -12272,6 +12276,7 @@ bool ARMAsmParser::enableArchExtFeature(StringRef Name, SMLoc &ExtLoc) { {ARM::FeatureFPARMv8, ARM::FeatureFullFP16}}, {ARM::AEK_RAS, {Feature_HasV8Bit}, {ARM::FeatureRAS}}, {ARM::AEK_LOB, {Feature_HasV8_1MMainlineBit}, {ARM::FeatureLOB}}, + {ARM::AEK_PACBTI, {Feature_HasV8_1MMainlineBit}, {ARM::FeaturePACBTI}}, // FIXME: Unsupported extensions. {ARM::AEK_OS, {}, {}}, {ARM::AEK_IWMMXT, {}, {}}, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 9caef9f09ea9..c3df7dc88d79 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -185,8 +185,11 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, +static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -287,6 +290,9 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, @@ -1172,6 +1178,19 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, return S; } +static DecodeStatus DecodeGPRnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 13) + S = MCDisassembler::SoftFail; + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + + return S; +} + static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -2441,6 +2460,31 @@ static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeT2HintSpaceInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned imm = fieldFromInstruction(Insn, 0, 8); + + 
unsigned Opcode = ARM::t2HINT; + + if (imm == 0x0D) { + Opcode = ARM::t2PACBTI; + } else if (imm == 0x1D) { + Opcode = ARM::t2PAC; + } else if (imm == 0x2D) { + Opcode = ARM::t2AUT; + } else if (imm == 0x0F) { + Opcode = ARM::t2BTI; + } + + Inst.setOpcode(Opcode); + if (Opcode == ARM::t2HINT) { + Inst.addOperand(MCOperand::createImm(imm)); + } + + return MCDisassembler::Success; +} + static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4726,6 +4770,25 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, if (!(FeatureBits[ARM::Feature8MSecExt])) return MCDisassembler::Fail; break; + case 0x20: // pac_key_p_0 + case 0x21: // pac_key_p_1 + case 0x22: // pac_key_p_2 + case 0x23: // pac_key_p_3 + case 0x24: // pac_key_u_0 + case 0x25: // pac_key_u_1 + case 0x26: // pac_key_u_2 + case 0x27: // pac_key_u_3 + case 0xa0: // pac_key_p_0_ns + case 0xa1: // pac_key_p_1_ns + case 0xa2: // pac_key_p_2_ns + case 0xa3: // pac_key_p_3_ns + case 0xa4: // pac_key_u_0_ns + case 0xa5: // pac_key_u_1_ns + case 0xa6: // pac_key_u_2_ns + case 0xa7: // pac_key_u_3_ns + if (!(FeatureBits[ARM::FeaturePACBTI])) + return MCDisassembler::Fail; + break; default: // Architecturally defined as unpredictable S = MCDisassembler::SoftFail; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 43f7575df6db..f8de0320166a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -195,16 +195,18 @@ namespace ARMII { AddrModeT1_4 = 9, AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data AddrModeT2_i12 = 11, - AddrModeT2_i8 = 12, - AddrModeT2_so = 13, - AddrModeT2_pc = 14, // +/- i12 for pc relative data - AddrModeT2_i8s4 = 15, // i8 * 4 - AddrMode_i12 = 16, - AddrMode5FP16 = 17, // i8 * 2 - AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst - AddrModeT2_i7s4 = 19, // i7 * 4 - AddrModeT2_i7s2 = 20, // i7 * 2 - AddrModeT2_i7 = 21, // i7 * 1 + AddrModeT2_i8 = 12, // +/- i8 + AddrModeT2_i8pos = 13, // + i8 + AddrModeT2_i8neg = 14, // - i8 + AddrModeT2_so = 15, + AddrModeT2_pc = 16, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 17, // i8 * 4 + AddrMode_i12 = 18, + AddrMode5FP16 = 19, // i8 * 2 + AddrModeT2_ldrex = 20, // i8 * 4, with unscaled offset in MCInst + AddrModeT2_i7s4 = 21, // i7 * 4 + AddrModeT2_i7s2 = 22, // i7 * 2 + AddrModeT2_i7 = 23, // i7 * 1 }; inline static const char *AddrModeToString(AddrMode addrmode) { @@ -223,6 +225,8 @@ namespace ARMII { case AddrModeT1_s: return "AddrModeT1_s"; case AddrModeT2_i12: return "AddrModeT2_i12"; case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_i8pos: return "AddrModeT2_i8pos"; + case AddrModeT2_i8neg: return "AddrModeT2_i8neg"; case AddrModeT2_so: return "AddrModeT2_so"; case AddrModeT2_pc: return "AddrModeT2_pc"; case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 3e4c97630af6..02a2d01176fc 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -299,4 +299,9 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) { else if 
(STI.hasFeature(ARM::FeatureVirtualization)) emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); + + if (STI.hasFeature(ARM::FeaturePACBTI)) { + emitAttribute(ARMBuildAttrs::PAC_extension, ARMBuildAttrs::AllowPAC); + emitAttribute(ARMBuildAttrs::BTI_extension, ARMBuildAttrs::AllowBTI); + } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index e4e95f63f0a6..224c61b9f065 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -205,9 +205,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, return; } - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -266,10 +266,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - int FI = I->getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); + int FI = I.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index bdb167a08e61..ebd139af2219 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -261,7 +261,7 @@ void Thumb2InstrInfo::expandLoadStackGuard( cast<GlobalValue>((*MI->memoperands_begin())->getValue()); if (MF.getSubtarget<ARMSubtarget>().isGVInGOT(GV)) - expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::t2LDRi12); + expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12); else if (MF.getTarget().isPositionIndependent()) expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12); else @@ -634,7 +634,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned NumBits = 0; unsigned Scale = 1; - if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + if (AddrMode == ARMII::AddrModeT2_i8neg || + AddrMode == ARMII::AddrModeT2_i12) { // i8 supports only negative, and i12 supports only positive, so // based on Offset sign convert Opcode to the appropriate // instruction diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index 132516694f4e..1164b6ebbac3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -502,8 +502,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // For the non-writeback version (this one), the base register must be // one of the registers being loaded. bool isOK = false; - for (unsigned i = 3; i < MI->getNumOperands(); ++i) { - if (MI->getOperand(i).getReg() == BaseReg) { + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 3)) { + if (MO.getReg() == BaseReg) { isOK = true; break; } @@ -527,8 +527,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // numbered register (i.e. 
it's in operand 4 onwards) then with writeback // the stored value is unknown, so we can't convert to tSTMIA_UPD. Register BaseReg = MI->getOperand(0).getReg(); - for (unsigned i = 4; i < MI->getNumOperands(); ++i) - if (MI->getOperand(i).getReg() == BaseReg) + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 4)) + if (MO.getReg() == BaseReg) return false; break; @@ -611,8 +611,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, } // Transfer the rest of operands. - for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) - MIB.add(MI->getOperand(OpNum)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), OpNum)) + MIB.add(MO); // Transfer memoperands. MIB.setMemRefs(MI->memoperands()); diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp index 846798a63cb7..2ce9c386f24c 100644 --- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFMCInstLower.cpp @@ -47,9 +47,7 @@ MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO, void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { default: diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp index 8bced3cec082..685bafd785df 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -214,9 +214,9 @@ bool BT::RegisterCell::meet(const RegisterCell &RC, Register SelfR) { BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC, const BitMask &M) { uint16_t B = M.first(), E = M.last(), W = width(); - // Sanity: M must be a valid mask for *this. + // M must be a valid mask for *this. assert(B < W && E < W); - // Sanity: the masked part of *this must have the same number of bits + // The masked part of *this must have the same number of bits // as the source. assert(B > E || E-B+1 == RC.width()); // B <= E => E-B+1 = |RC|. assert(B <= E || E+(W-B)+1 == RC.width()); // E < B => E+(W-B)+1 = |RC|. @@ -850,8 +850,7 @@ void BT::visitNonBranch(const MachineInstr &MI) { bool Eval = ME.evaluate(MI, Map, ResMap); if (Trace && Eval) { - for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isUse()) continue; RegisterRef RU(MO); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 0f6dedeb28c3..1938a5c259da 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -189,7 +189,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, unsigned NumDefs = 0; - // Sanity verification: there should not be any defs with subregisters. + // Basic correctness check: there should not be any defs with subregisters. 
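Several hunks above (ARMLoadStoreOptimizer, Thumb2SizeReduction) replace index-based operand loops with range-based for over llvm::drop_begin(MI->operands(), N). A standalone illustration of what such a loop visits, using plain iterators over a std::vector as a stand-in for the LLVM helper:

#include <cassert>
#include <cstddef>
#include <iterator>
#include <vector>

int main() {
  std::vector<int> Ops = {10, 11, 12, 13, 14};

  // Index-based loop, as before the change: start at operand 2.
  int SumIndexed = 0;
  for (std::size_t I = 2; I < Ops.size(); ++I)
    SumIndexed += Ops[I];

  // What drop_begin(Ops, 2) hands to the range-based for: [begin()+2, end()).
  int SumRange = 0;
  for (auto It = std::next(Ops.begin(), 2); It != Ops.end(); ++It)
    SumRange += *It;

  assert(SumIndexed == SumRange && SumRange == 12 + 13 + 14);
  return 0;
}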
for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 43f0758f6598..8c3b9572201e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -476,10 +476,10 @@ namespace { } // end anonymous namespace static const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) { - for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I) - if (I->count(N)) - return &*I; - return nullptr; + for (const NodeSet &S : Rel) + if (S.count(N)) + return &S; + return nullptr; } // Create an ordered pair of GepNode pointers. The pair will be used in @@ -589,9 +589,8 @@ void HexagonCommonGEP::common() { dbgs() << "{ " << I->first << ", " << I->second << " }\n"; dbgs() << "Gep equivalence classes:\n"; - for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) { + for (const NodeSet &S : EqRel) { dbgs() << '{'; - const NodeSet &S = *I; for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) { if (J != S.begin()) dbgs() << ','; @@ -604,8 +603,7 @@ void HexagonCommonGEP::common() { // Create a projection from a NodeSet to the minimal element in it. using ProjMap = std::map<const NodeSet *, GepNode *>; ProjMap PM; - for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) { - const NodeSet &S = *I; + for (const NodeSet &S : EqRel) { GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder); std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min)); (void)Ins; @@ -1280,8 +1278,8 @@ bool HexagonCommonGEP::runOnFunction(Function &F) { return false; // For now bail out on C++ exception handling. - for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A) - for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I) + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) if (isa<InvokeInst>(I) || isa<LandingPadInst>(I)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index a774baaa48e6..d3fcdb6ae9a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -1254,7 +1254,7 @@ void HCE::collect(MachineFunction &MF) { void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End, AssignmentMap &IMap) { - // Sanity check: make sure that all extenders in the range [Begin..End) + // Basic correctness: make sure that all extenders in the range [Begin..End) // share the same root ER. 
for (unsigned I = Begin; I != End; ++I) assert(ER == ExtRoot(Extenders[I].getOp())); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 23d0cc829e52..03b0f75b2dc1 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -237,12 +237,9 @@ static bool isEvenReg(unsigned Reg) { } static void removeKillInfo(MachineInstr &MI, unsigned RegNotKilled) { - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill()) - continue; - Op.setIsKill(false); - } + for (MachineOperand &Op : MI.operands()) + if (Op.isReg() && Op.getReg() == RegNotKilled && Op.isKill()) + Op.setIsKill(false); } /// Returns true if it is unsafe to move a copy instruction from \p UseReg to @@ -403,10 +400,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) { // Mark TFRs that feed a potential new value store as such. if (TII->mayBeNewStore(MI)) { // Look for uses of TFR instructions. - for (unsigned OpdIdx = 0, OpdE = MI.getNumOperands(); OpdIdx != OpdE; - ++OpdIdx) { - MachineOperand &Op = MI.getOperand(OpdIdx); - + for (const MachineOperand &Op : MI.operands()) { // Skip over anything except register uses. if (!Op.isReg() || !Op.isUse() || !Op.getReg()) continue; @@ -484,14 +478,13 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { IsConst64Disabled = true; // Traverse basic blocks. - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { + for (MachineBasicBlock &MBB : MF) { PotentiallyNewifiableTFR.clear(); - findPotentialNewifiableTFRs(*BI); + findPotentialNewifiableTFRs(MBB); // Traverse instructions in basic block. - for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end(); - MI != End;) { + for (MachineBasicBlock::iterator MI = MBB.begin(), End = MBB.end(); + MI != End;) { MachineInstr &I1 = *MI++; if (I1.isDebugInstr()) diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index bff596e69efd..12ceac545e9d 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1404,18 +1404,18 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // Add callee-saved registers as use. addCalleeSaveRegistersAsImpOperand(SaveRegsCall, CSI, false, true); // Add live in registers. - for (unsigned I = 0; I < CSI.size(); ++I) - MBB.addLiveIn(CSI[I].getReg()); + for (const CalleeSavedInfo &I : CSI) + MBB.addLiveIn(I.getReg()); return true; } - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); // Add live in registers. We treat eh_return callee saved register r0 - r3 // specially. They are not really callee saved registers as they are not // supposed to be killed. 
bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); if (IsKill) @@ -1478,10 +1478,10 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, return true; } - for (unsigned i = 0; i < CSI.size(); ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - int FI = CSI[i].getFrameIdx(); + int FI = I.getFrameIdx(); HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); } @@ -1619,8 +1619,8 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, // (1) For each callee-saved register, add that register and all of its // sub-registers to SRegs. LLVM_DEBUG(dbgs() << "Initial CS registers: {"); - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned R = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned R = I.getReg(); LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI)); for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR) SRegs[*SR] = true; @@ -1720,10 +1720,10 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, LLVM_DEBUG({ dbgs() << "CS information: {"; - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - int FI = CSI[i].getFrameIdx(); + for (const CalleeSavedInfo &I : CSI) { + int FI = I.getFrameIdx(); int Off = MFI.getObjectOffset(FI); - dbgs() << ' ' << printReg(CSI[i].getReg(), TRI) << ":fi#" << FI << ":sp"; + dbgs() << ' ' << printReg(I.getReg(), TRI) << ":fi#" << FI << ":sp"; if (Off >= 0) dbgs() << '+'; dbgs() << Off; @@ -2634,8 +2634,8 @@ bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, // Check if CSI only has double registers, and if the registers form // a contiguous block starting from D8. 
BitVector Regs(Hexagon::NUM_TARGET_REGS); - for (unsigned i = 0, n = CSI.size(); i < n; ++i) { - unsigned R = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned R = I.getReg(); if (!Hexagon::DoubleRegsRegClass.contains(R)) return true; Regs[R] = true; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 02da2f29591a..46c1fbc6eeb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -597,19 +597,12 @@ void HexagonGenInsert::dump_map() const { void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const { unsigned Index = 0; - using mf_iterator = MachineFunction::const_iterator; - - for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) { - const MachineBasicBlock &B = *A; + for (const MachineBasicBlock &B : *MFN) { if (!CMS->BT.reached(&B)) continue; - using mb_iterator = MachineBasicBlock::const_iterator; - - for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) { - const MachineInstr *MI = &*I; - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineInstr &MI : B) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef()) { Register R = MO.getReg(); assert(MO.getSubReg() == 0 && "Unexpected subregister in definition"); @@ -725,8 +718,7 @@ bool HexagonGenInsert::findNonSelfReference(unsigned VR) const { void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const { - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register R = MO.getReg(); @@ -738,8 +730,7 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, void HexagonGenInsert::getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const { - for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; Register R = MO.getReg(); @@ -942,12 +933,11 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, // can remove them from the list of available registers once all DT // successors have been processed. RegisterSet BlockDefs, InsDefs; - for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { - MachineInstr *MI = &*I; + for (MachineInstr &MI : *B) { InsDefs.clear(); - getInstrDefs(MI, InsDefs); + getInstrDefs(&MI, InsDefs); // Leave those alone. They are more transparent than "insert". - bool Skip = MI->isCopy() || MI->isRegSequence(); + bool Skip = MI.isCopy() || MI.isRegSequence(); if (!Skip) { // Visit all defined registers, and attempt to find the corresponding @@ -1458,8 +1448,7 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) Instrs.push_back(&*I); - for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : Instrs) { unsigned Opc = MI->getOpcode(); // Do not touch lifetime markers. This is why the target-independent DCE // cannot be used. 
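A note on the pattern in the removeDeadCode hunk just above: the function first snapshots the block's instructions into a side vector and only afterwards walks that snapshot, so the traversal presumably stays independent of the list it is shrinking while dead entries are erased; the range-based-for conversion keeps that two-phase shape intact. The sketch below is a plain-C++ analogue of the idiom with a made-up container and predicate — illustrative only, not LLVM code.

```cpp
#include <iostream>
#include <list>
#include <vector>

// Erase all even values from a list without invalidating the loop that
// finds them: snapshot the matching iterators first, erase afterwards.
// Mirrors the collect-then-erase shape used by removeDeadCode above.
static void eraseEvens(std::list<int> &Values) {
  std::vector<std::list<int>::iterator> ToErase;
  for (auto It = Values.begin(); It != Values.end(); ++It)
    if (*It % 2 == 0)
      ToErase.push_back(It); // collect only; the list is untouched here
  for (auto It : ToErase)
    Values.erase(It);        // safe: each remaining iterator is still valid
}

int main() {
  std::list<int> Values{1, 2, 3, 4, 5, 6};
  eraseEvens(Values);
  for (int V : Values)
    std::cout << V << ' ';   // prints: 1 3 5
  std::cout << '\n';
}
```

The same snapshot-then-erase shape shows up again in HexagonGenPredicate::eliminatePredCopies further down, where a collected set of instructions is erased after the scan completes.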
@@ -1501,7 +1490,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail; bool Changed = false; - // Sanity check: one, but not both. + // Verify: one, but not both. assert(!OptSelectAll0 || !OptSelectHas0); IFMap.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index cf4f13fb8c0d..55de02816fb8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -328,7 +328,7 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); if (!MxOpc) continue; - // Basic sanity check: since we are deleting instructions, validate the + // Basic correctness check: since we are deleting instructions, validate the // iterators. There is a possibility that one of Def1 or Def2 is translated // to "mux" and being considered for other "mux" instructions. if (!MX.At->getParent() || !MX.Def1->getParent() || !MX.Def2->getParent()) diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index d8d2025c5d27..1a66394e9757 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -205,16 +205,14 @@ bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) { } void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { - for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { - MachineBasicBlock &B = *A; - for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { - MachineInstr *MI = &*I; - unsigned Opc = MI->getOpcode(); + for (MachineBasicBlock &B : MF) { + for (MachineInstr &MI : B) { + unsigned Opc = MI.getOpcode(); switch (Opc) { case Hexagon::C2_tfrpr: case TargetOpcode::COPY: - if (isPredReg(MI->getOperand(1).getReg())) { - RegisterSubReg RD = MI->getOperand(0); + if (isPredReg(MI.getOperand(1).getReg())) { + RegisterSubReg RD = MI.getOperand(0); if (RD.R.isVirtual()) PredGPRs.insert(RD); } @@ -411,7 +409,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { NumOps = 2; } - // Some sanity: check that def is in operand #0. + // Check that def is in operand #0. 
MachineOperand &Op0 = MI->getOperand(0); assert(Op0.isDef()); RegisterSubReg OutR(Op0); @@ -488,8 +486,8 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { } } - for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I) - (*I)->eraseFromParent(); + for (MachineInstr *MI : Erase) + MI->eraseFromParent(); return Changed; } @@ -515,11 +513,8 @@ bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) { Again = false; VectOfInst Processed, Copy; - using iterator = VectOfInst::iterator; - Copy = PUsers; - for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : Copy) { bool Done = convertToPredForm(MI); if (Done) { Processed.insert(MI); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index a4971ad712eb..5d2e1b259449 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -1014,12 +1014,10 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L, LLVM_DEBUG(dbgs() << "\nhw_loop head, " << printMBBReference(**L->block_begin())); for (MachineBasicBlock *MBB : L->getBlocks()) { - for (MachineBasicBlock::iterator - MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { - const MachineInstr *MI = &*MII; - if (isInvalidLoopOperation(MI, IsInnerHWLoop)) { + for (const MachineInstr &MI : *MBB) { + if (isInvalidLoopOperation(&MI, IsInnerHWLoop)) { LLVM_DEBUG(dbgs() << "\nCannot convert to hw_loop due to:"; - MI->dump();); + MI.dump();); return true; } } @@ -1034,8 +1032,7 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L, bool HexagonHardwareLoops::isDead(const MachineInstr *MI, SmallVectorImpl<MachineInstr *> &DeadPhis) const { // Examine each operand. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; @@ -1089,8 +1086,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { // It is possible that some DBG_VALUE instructions refer to this // instruction. Examine each def operand for such references; // if found, mark the DBG_VALUE as undef (but don't delete it). - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); @@ -1123,7 +1119,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, bool &RecL0used, bool &RecL1used) { - // This is just for sanity. + // This is just to confirm basic correctness. 
assert(L->getHeader() && "Loop without a header?"); bool Changed = false; @@ -1877,8 +1873,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( if (TII->analyzeBranch(*ExitingBlock, TB, FB, Tmp1, false)) return nullptr; - for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { - MachineBasicBlock *PB = *I; + for (MachineBasicBlock *PB : Preds) { bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp1, false); if (NotAnalyzed) return nullptr; @@ -1960,8 +1955,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( TB = FB = nullptr; - for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { - MachineBasicBlock *PB = *I; + for (MachineBasicBlock *PB : Preds) { if (PB != Latch) { Tmp2.clear(); bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp2, false); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index b50a0e29ecae..ed4874baf7c8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1006,7 +1006,7 @@ static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap, static bool isPermutation(ArrayRef<int> Mask) { // Check by adding all numbers only works if there is no overflow. - assert(Mask.size() < 0x00007FFF && "Sanity failure"); + assert(Mask.size() < 0x00007FFF && "Overflow failure"); int Sum = 0; for (int Idx : Mask) { if (Idx == -1) @@ -1217,7 +1217,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, } else if (Seg0 == ~1u) { Seg0 = SegList[0] != Seg1 ? SegList[0] : SegList[1]; } else { - assert(Seg1 == ~1u); // Sanity + assert(Seg1 == ~1u); Seg1 = SegList[0] != Seg0 ? SegList[0] : SegList[1]; } } @@ -1265,7 +1265,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, } else { // BC or DA: this could be done via valign by SegLen. // Do nothing here, because valign (if possible) will be generated - // later on (make sure the Seg0 values are as expected, for sanity). + // later on (make sure the Seg0 values are as expected). assert(Seg0 == 1 || Seg0 == 3); } } @@ -1414,7 +1414,7 @@ OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) { return OpRef::undef(getSingleVT(MVT::i8)); unsigned HalfLen = HwLen / 2; - assert(isPowerOf2_32(HalfLen)); // Sanity. + assert(isPowerOf2_32(HalfLen)); // Handle special case where the output is the same half of the input // repeated twice, i.e. if Va = AB, then handle the output of AA or BB. 
diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 29572e3106d1..88effed9f076 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -442,8 +442,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CLI.IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, IsVarArg, IsStructRet, StructAttrFlag, Outs, OutVals, Ins, DAG); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (const CCValAssign &VA : ArgLocs) { if (VA.isMemLoc()) { CLI.IsTailCall = false; break; @@ -2549,7 +2548,8 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV, // Special case for v{8,4,2}i1 (the only boolean vectors legal in Hexagon // without any coprocessors). if (ElemWidth == 1) { - assert(VecWidth == VecTy.getVectorNumElements() && "Sanity failure"); + assert(VecWidth == VecTy.getVectorNumElements() && + "Vector elements should equal vector width size"); assert(VecWidth == 8 || VecWidth == 4 || VecWidth == 2); // Check if this is an extract of the lowest bit. if (IdxN) { @@ -2863,8 +2863,7 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, Scale /= 2; } - // Another sanity check. At this point there should only be two words - // left, and Scale should be 2. + // At this point there should only be two words left, and Scale should be 2. assert(Scale == 2 && Words[IdxW].size() == 2); SDValue WW = DAG.getNode(HexagonISD::COMBINE, dl, MVT::i64, diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 8900fca8bb78..f7237f496aee 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -9,6 +9,7 @@ #include "HexagonISelLowering.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/Support/CommandLine.h" @@ -1846,16 +1847,18 @@ HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = MemN->getChain(); SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); + unsigned MemOpc = MemN->getOpcode(); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); - MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); - MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); + uint64_t MemSize = (MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE) + ? 
(uint64_t)MemoryLocation::UnknownSize + : HwLen; + MOp0 = MF.getMachineMemOperand(MMO, 0, MemSize); + MOp1 = MF.getMachineMemOperand(MMO, HwLen, MemSize); } - unsigned MemOpc = MemN->getOpcode(); - if (MemOpc == ISD::LOAD) { assert(cast<LoadSDNode>(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 76220eff4d51..b6984d40f78e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -193,9 +193,7 @@ static inline void parseOperands(const MachineInstr &MI, Defs.clear(); Uses.clear(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); - + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -1644,8 +1642,7 @@ bool HexagonInstrInfo::ClobbersPredicate(MachineInstr &MI, bool SkipDead) const { const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); - for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) { - MachineOperand MO = MI.getOperand(oper); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (!MO.isDef()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 9507de95231f..987c4a5fa6c4 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -109,8 +109,7 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && "MCI opcode should have been set on construction"); - for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCO; bool MustExtend = MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index fc31139e13ce..1ff248200572 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -120,16 +120,12 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (DisableHexagonPeephole) return false; // Loop over all of the basic blocks. - for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); - MBBb != MBBe; ++MBBb) { - MachineBasicBlock *MBB = &*MBBb; + for (MachineBasicBlock &MBB : MF) { PeepholeMap.clear(); PeepholeDoubleRegsMap.clear(); // Traverse the basic block. 
- for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) { - NextI = std::next(I); - MachineInstr &MI = *I; + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { // Look for sign extends: // %170 = SXTW %166 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) { @@ -274,11 +270,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (NewOp) { Register PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { - BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), - QII->get(NewOp), MI.getOperand(0).getReg()) - .addReg(POrig) - .add(MI.getOperand(S2)) - .add(MI.getOperand(S1)); + BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), QII->get(NewOp), + MI.getOperand(0).getReg()) + .addReg(POrig) + .add(MI.getOperand(S2)) + .add(MI.getOperand(S1)); MRI->clearKillFlags(POrig); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp index 93ba277b0c9d..2c5c64cfcfc6 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -400,8 +400,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned Acc = 0; // Value accumulator. unsigned Shift = 0; - for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) { - MachineInstr *MI = *I; + for (MachineInstr *MI : OG) { const MachineMemOperand &MMO = getStoreTarget(MI); MachineOperand &SO = MI->getOperand(2); // Source. assert(SO.isImm() && "Expecting an immediate operand"); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index 87b1c43961d7..ecb2f88d8096 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -305,8 +305,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) { VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg(); LastVRegUse.erase(MI->getOperand(1).getReg()); } else { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; if (MO.isUse() && !MI->isCopy() && diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 897fb209a8bf..ea2798a3b44e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -749,7 +749,6 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool { WithMaxAlign.ValTy, Adjust); int Diff = Start - (OffAtMax + Adjust); AlignVal = HVC.getConstInt(Diff); - // Sanity. 
assert(Diff >= 0); assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value()); } else { diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp index b9e577d201f9..cafe93bf8f4b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp @@ -51,9 +51,8 @@ struct Filler : public MachineFunctionPass { TRI = Subtarget.getRegisterInfo(); bool Changed = false; - for (MachineFunction::iterator FI = MF.begin(), FE = MF.end(); FI != FE; - ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB); return Changed; } @@ -200,8 +199,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad, assert((!MI->isCall() && !MI->isReturn()) && "Cannot put calls or returns in delay slot."); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); + for (const MachineOperand &MO : MI->operands()) { unsigned Reg; if (!MO.isReg() || !(Reg = MO.getReg())) diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp index 3a2d5030775e..3644eafe4353 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -65,17 +65,14 @@ void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize(); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; - ++MBB) { - MachineBasicBlock::iterator MBBI = MBB->begin(); - while (MBBI != MBB->end()) { - MachineInstr &MI = *MBBI++; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (MI.getOpcode() == Lanai::ADJDYNALLOC) { DebugLoc DL = MI.getDebugLoc(); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); - BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) + BuildMI(MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst) .addReg(Src) .addImm(MaxCallFrameSize); MI.eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 21d035c7ee9c..4217b8509676 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -467,8 +467,7 @@ static MachineInstr *canFoldIntoSelect(Register Reg, return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading SR. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 1)) { // Reject frame index operands. 
if (MO.isFI() || MO.isCPI() || MO.isJTI()) return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp index 743f4f7c6e2f..479c0b1f0358 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp @@ -93,9 +93,7 @@ MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO, void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { case MachineOperand::MO_Register: diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index a83a5d2dfcc9..2a77a150f9aa 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -211,8 +211,8 @@ bool MSP430FrameLowering::restoreCalleeSavedRegisters( MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) - BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg()); + for (const CalleeSavedInfo &I : CSI) + BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), I.getReg()); return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp index 1e57f33386e6..52c037de7660 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -115,9 +115,7 @@ LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; switch (MO.getType()) { default: diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index fefa1134b021..622f2039f9e4 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -72,10 +72,9 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, if (!CSI.empty()) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); unsigned DReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DReg, Offset)); @@ -119,13 +118,13 @@ bool Mips16FrameLowering::spillCalleeSavedRegisters( // will be saved with the "save" instruction // during emitPrologue // - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { // Add the callee-saved register as 
live-in. Do not add if the register is // RA and return address is taken, because it has already been added in // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. - unsigned Reg = CSI[i].getReg(); + unsigned Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 6d3f3adb2b7a..5d026785b921 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -163,9 +163,8 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, TargetMachine &TM, MCStreamer &OutStreamer, const MipsSubtarget &Subtarget) { - for (unsigned int I = MI.getDesc().getNumOperands(), E = MI.getNumOperands(); - I < E; ++I) { - MachineOperand MO = MI.getOperand(I); + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), MI.getDesc().getNumOperands())) { if (MO.isMCSymbol() && (MO.getTargetFlags() & MipsII::MO_JALR)) { MCSymbol *Callee = MO.getMCSymbol(); if (Callee && !Callee->getName().empty()) { diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8e619549f01c..491d379bfe0b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -637,8 +637,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // has any inline assembly in it. If so, we have to be conservative about // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(&*I); + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); // Compute block offsets. adjustBBOffsetsAfter(&MF->front()); @@ -730,8 +730,8 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { continue; // Scan the instructions for constant pool operands. - for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op) - if (MI.getOperand(op).isCPI()) { + for (const MachineOperand &MO : MI.operands()) + if (MO.isCPI()) { // We found one. The addressing mode tells us the max displacement // from the PC that this instruction permits. @@ -759,7 +759,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { break; } // Remember that this is a user of a CP entry. - unsigned CPI = MI.getOperand(op).getIndex(); + unsigned CPI = MO.getIndex(); MachineInstr *CPEMI = CPEMIs[CPI]; unsigned MaxOffs = ((1 << Bits)-1) * Scale; unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale; @@ -1066,9 +1066,9 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... 
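Several of the conversions above replace an index loop that starts past operand 0 with llvm::drop_begin: LanaiInstrInfo skips the def in operand 0, MipsAsmPrinter skips the first getNumOperands() fixed operands, and MipsSEISelLowering does the same further down. drop_begin simply yields the original range minus its first N elements. A minimal stand-alone analogue, with invented names and data, looks like the sketch below; the real helper lives in llvm/ADT/STLExtras.h and works on arbitrary ranges.

```cpp
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

// Minimal stand-in for llvm::drop_begin: view a container without its
// first N elements. Assumes N <= C.size(); illustrative only.
template <typename Container> struct DropBegin {
  Container &C;
  std::size_t N;
  auto begin() { return std::next(C.begin(), N); }
  auto end() { return C.end(); }
};

template <typename Container>
DropBegin<Container> dropBegin(Container &C, std::size_t N = 1) {
  return {C, N};
}

int main() {
  // Think of Ops as an instruction's operand list: skip the leading def
  // (or the first NumOperands fixed operands) and walk only the rest.
  std::vector<std::string> Ops{"def", "use0", "use1", "use2"};
  for (const std::string &Op : dropBegin(Ops, 1))
    std::cout << Op << '\n'; // prints use0, use1, use2
}
```

Using a range adaptor keeps the loop body identical to a plain range-for and removes the off-by-one-prone manual index arithmetic the old loops carried.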
@@ -1122,9 +1122,9 @@ int MipsConstantIslands::findLongFormInRangeCPEntry // Point the CPUser node to the replacement U.CPEMI = CPEs[i].CPEMI; // Change the CPI in the instruction operand to refer to the clone. - for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) - if (UserMI->getOperand(j).isCPI()) { - UserMI->getOperand(j).setIndex(CPEs[i].CPI); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(CPEs[i].CPI); break; } // Adjust the refcount of the clone... @@ -1392,9 +1392,9 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. - for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) - if (UserMI->getOperand(i).isCPI()) { - UserMI->getOperand(i).setIndex(ID); + for (MachineOperand &MO : UserMI->operands()) + if (MO.isCPI()) { + MO.setIndex(ID); break; } @@ -1633,10 +1633,10 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { void MipsConstantIslands::prescanForConstants() { unsigned J = 0; (void)J; - for (MachineFunction::iterator B = - MF->begin(), E = MF->end(); B != E; ++B) { - for (MachineBasicBlock::instr_iterator I = - B->instr_begin(), EB = B->instr_end(); I != EB; ++I) { + for (MachineBasicBlock &B : *MF) { + for (MachineBasicBlock::instr_iterator I = B.instr_begin(), + EB = B.instr_end(); + I != EB; ++I) { switch(I->getDesc().getOpcode()) { case Mips::LwConstant32: { PrescannedForConstants = true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index c2e3d7393a6d..2d27d7553de6 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -218,9 +218,8 @@ namespace { bool runOnMachineFunction(MachineFunction &F) override { TM = &F.getTarget(); bool Changed = false; - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : F) + Changed |= runOnMachineBasicBlock(MBB); // This pass invalidates liveness information when it reorders // instructions to fill delay slot. 
Without this, -verify-machineinstrs diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp index f72dc1da4131..31180d5a23ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsExpandPseudo.cpp @@ -896,9 +896,8 @@ bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); bool Modified = false; - for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; - ++MFI) - Modified |= expandMBB(*MFI); + for (MachineBasicBlock &MBB : MF) + Modified |= expandMBB(MBB); if (Modified) MF.RenumberBlocks(); diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h index 612b2b712fa8..710a3d40c38e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFrameLowering.h @@ -34,7 +34,10 @@ public: bool hasBP(const MachineFunction &MF) const; - bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + return false; + } bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp index 66e04bda2af3..7b58cb90ab87 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -318,8 +318,7 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index bb4b9c6fa6a7..193d071447ff 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -452,10 +452,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_offset // directives. - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); // If Reg is a double precision register, emit two cfa_offsets, // one for each of the paired single precision registers. @@ -796,13 +795,13 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( MachineFunction *MF = MBB.getParent(); const TargetInstrInfo &TII = *STI.getInstrInfo(); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (const CalleeSavedInfo &I : CSI) { // Add the callee-saved register as live-in. Do not add if the register is // RA and return address is taken, because it has already been added in // method MipsTargetLowering::lowerRETURNADDR. // It's killed at the spill, unless the register is RA and return address // is taken. 
- unsigned Reg = CSI[i].getReg(); + unsigned Reg = I.getReg(); bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64) && MF->getFrameInfo().isReturnAddressTaken(); if (!IsRAAndRetAddrIsTaken) @@ -831,8 +830,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, - CSI[i].getFrameIdx(), RC, TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI); } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 1fe6ab09804b..40b215a8204c 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -3581,8 +3581,8 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt); - for (unsigned i = 1; i < MI.getNumOperands(); i++) - MIB.add(MI.getOperand(i)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) + MIB.add(MO); if(!UsingMips32) { Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index aab6d2034f11..c35e67d6726f 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -130,10 +130,8 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV, for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i) DiscoverDependentGlobals(GV->getOperand(i), Others); - for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(), - E = Others.end(); - I != E; ++I) - VisitGlobalVariableForEmission(*I, Order, Visited, Visiting); + for (const GlobalVariable *GV : Others) + VisitGlobalVariableForEmission(GV, Order, Visited, Visiting); // Now we can visit ourself Order.push_back(GV); @@ -699,35 +697,33 @@ static bool useFuncSeen(const Constant *C, void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { DenseMap<const Function *, bool> seenMap; - for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { - const Function *F = &*FI; - - if (F->getAttributes().hasFnAttr("nvptx-libcall-callee")) { - emitDeclaration(F, O); + for (const Function &F : M) { + if (F.getAttributes().hasFnAttr("nvptx-libcall-callee")) { + emitDeclaration(&F, O); continue; } - if (F->isDeclaration()) { - if (F->use_empty()) + if (F.isDeclaration()) { + if (F.use_empty()) continue; - if (F->getIntrinsicID()) + if (F.getIntrinsicID()) continue; - emitDeclaration(F, O); + emitDeclaration(&F, O); continue; } - for (const User *U : F->users()) { + for (const User *U : F.users()) { if (const Constant *C = dyn_cast<Constant>(U)) { if (usedInGlobalVarDef(C)) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration // for the original function - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } // Emit a declaration of this function if the function that // uses this constant expr has already been seen. 
if (useFuncSeen(C, seenMap)) { - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } } @@ -746,11 +742,11 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { // appearing in the module before the callee. so print out // a declaration for the callee. if (seenMap.find(caller) != seenMap.end()) { - emitDeclaration(F, O); + emitDeclaration(&F, O); break; } } - seenMap[F] = true; + seenMap[&F] = true; } } @@ -887,33 +883,11 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { GlobalsEmitted = true; } - // XXX Temproarily remove global variables so that doFinalization() will not - // emit them again (global variables are emitted at beginning). - - Module::GlobalListType &global_list = M.getGlobalList(); - int i, n = global_list.size(); - GlobalVariable **gv_array = new GlobalVariable *[n]; - - // first, back-up GlobalVariable in gv_array - i = 0; - for (Module::global_iterator I = global_list.begin(), E = global_list.end(); - I != E; ++I) - gv_array[i++] = &*I; - - // second, empty global_list - while (!global_list.empty()) - global_list.remove(global_list.begin()); - // call doFinalization bool ret = AsmPrinter::doFinalization(M); - // now we restore global variables - for (i = 0; i < n; i++) - global_list.insert(global_list.end(), gv_array[i]); - clearAnnotationCache(&M); - delete[] gv_array; // Close the last emitted section if (HasDebugInfo) { static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer()) diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 5d680e731e4a..2a3a38d7b2f1 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -306,6 +306,11 @@ public: std::string getVirtualRegisterName(unsigned) const; const MCSymbol *getFunctionFrameSymbol() const override; + + // Make emitGlobalVariable() no-op for NVPTX. + // Global variables have been already emitted by the time the base AsmPrinter + // attempts to do so in doFinalization() (see NVPTXAsmPrinter::emitGlobals()). + void emitGlobalVariable(const GlobalVariable *GV) override {} }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index a8a43cee9ab7..34b9dfe87cc2 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -72,8 +72,7 @@ bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) { std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) { std::string ValidName; raw_string_ostream ValidNameStream(ValidName); - for (unsigned I = 0, E = Name.size(); I != E; ++I) { - char C = Name[I]; + for (char C : Name) { if (C == '.' 
|| C == '@') { ValidNameStream << "_$_"; } else { diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index e404cead344b..f4934f0bc20b 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -56,23 +56,16 @@ bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; InstrsToRemove.clear(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { - for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end(); - I != E; ++I) { - MachineInstr &MI = *I; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) Changed |= processInstr(MI); - } - } // Now clean up any handle-access instructions // This is needed in debug mode when code cleanup passes are not executed, // but we need the handle access to be eliminated because they are not // valid instructions when image handles are disabled. - for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(), - E = InstrsToRemove.end(); I != E; ++I) { - (*I)->eraseFromParent(); - } + for (MachineInstr *MI : InstrsToRemove) + MI->eraseFromParent(); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td index f43ba00ec373..f3ae0010ad8e 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -626,7 +626,9 @@ def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read], // 5 Cycles Fixed-Point and BCD operations, 3 input operands def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read, P10DX_Read], (instrs + BCDADD_rec, BCDS_rec, + BCDSUB_rec, BCDTRUNC_rec, VADDECUQ, VADDEUQM, @@ -1974,7 +1976,7 @@ def : InstRW<[P10W_SX, P10W_DISP_ANY], ICBLQ, ICBTLS, ICCCI, - LA, + LA, LA8, LDMX, MFDCR, MFPMR, @@ -2073,3 +2075,4 @@ def : InstRW<[P10W_vMU_7C, P10W_DISP_ANY, P10vMU_Read, P10vMU_Read, P10vMU_Read] VMSUMUHM, VMSUMUHS )>; + diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td index c4f4a2b3d796..f7c049951c54 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -151,6 +151,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "ADD(4|8)(TLS)?(_)?$"), (instregex "NEG(8)?(O)?$"), (instregex "ADDI(S)?toc(HA|L)(8)?$"), + (instregex "LA(8)?$"), COPY, MCRF, MCRXRX, @@ -165,7 +166,6 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], SRADI_32, RLDIC, RFEBB, - LA, TBEGIN, TRECHKPT, NOP, @@ -624,7 +624,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], BCDS_rec, BCDTRUNC_rec, BCDUS_rec, - BCDUTRUNC_rec + BCDUTRUNC_rec, + BCDADD_rec, + BCDSUB_rec )>; // 12 Cycle DFU operation. 
Only one DFU unit per CPU so we use a whole diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td index a1ff20bb3612..422bd11dca52 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPC.td @@ -203,6 +203,22 @@ def FeatureLogicalFusion : SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true", "Target supports Logical Operations fusion", [FeatureFusion]>; +def FeatureSha3Fusion : + SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true", + "Target supports SHA3 assist fusion", + [FeatureFusion]>; +def FeatureCompareFusion: + SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true", + "Target supports Comparison Operations fusion", + [FeatureFusion]>; +def FeatureWideImmFusion: + SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true", + "Target supports Wide-Immediate fusion", + [FeatureFusion]>; +def FeatureZeroMoveFusion: + SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", + "Target supports move to SPR with branch fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -393,7 +409,7 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. list<SubtargetFeature> FusionFeatures = [ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion, - FeatureLogicalFusion, FeatureArithAddFusion + FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion, ]; list<SubtargetFeature> P10AdditionalFeatures = !listconcat(FusionFeatures, [ diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index a76963abb8e4..16e3b2b85c2e 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -875,18 +875,19 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } - case PPC::ADDItoc: { + case PPC::ADDItoc: + case PPC::ADDItoc8: { assert(IsAIX && TM.getCodeModel() == CodeModel::Small && - "Operand only valid in AIX 32 bit mode"); + "PseudoOp only valid for small code model AIX"); - // Transform %rN = ADDItoc @op1, %r2. + // Transform %rN = ADDItoc/8 @op1, %r2. LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); // Change the opcode to load address. - TmpInst.setOpcode(PPC::LA); + TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8)); const MachineOperand &MO = MI->getOperand(1); - assert(MO.isGlobal() && "Invalid operand for ADDItoc."); + assert(MO.isGlobal() && "Invalid operand for ADDItoc[8]."); // Map the operand to its corresponding MCSymbol. 
const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index fa6713dcca80..4cac0e3551f6 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -120,16 +120,13 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo()); unsigned FuncSize = GetInitialOffset(Fn); - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock *MBB = &*MFI; - + for (MachineBasicBlock &MBB : Fn) { // The end of the previous block may have extra nops if this block has an // alignment requirement. - if (MBB->getNumber() > 0) { - unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize); + if (MBB.getNumber() > 0) { + unsigned AlignExtra = GetAlignmentAdjustment(MBB, FuncSize); - auto &BS = BlockSizes[MBB->getNumber()-1]; + auto &BS = BlockSizes[MBB.getNumber()-1]; BS.first += AlignExtra; BS.second = AlignExtra; @@ -138,10 +135,10 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { unsigned BlockSize = 0; unsigned UnalignedBytesRemaining = 0; - for (MachineInstr &MI : *MBB) { + for (MachineInstr &MI : MBB) { unsigned MINumBytes = TII->getInstSizeInBytes(MI); if (MI.isInlineAsm() && (FirstImpreciseBlock < 0)) - FirstImpreciseBlock = MBB->getNumber(); + FirstImpreciseBlock = MBB.getNumber(); if (TII->isPrefixed(MI.getOpcode())) { NumPrefixed++; @@ -171,7 +168,7 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { BlockSize += MINumBytes; } - BlockSizes[MBB->getNumber()].first = BlockSize; + BlockSizes[MBB.getNumber()].first = BlockSize; FuncSize += BlockSize; } @@ -181,16 +178,13 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) { /// Modify the basic block align adjustment. 
void PPCBSel::modifyAdjustment(MachineFunction &Fn) { unsigned Offset = GetInitialOffset(Fn); - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { - MachineBasicBlock *MBB = &*MFI; - - if (MBB->getNumber() > 0) { - auto &BS = BlockSizes[MBB->getNumber()-1]; + for (MachineBasicBlock &MBB : Fn) { + if (MBB.getNumber() > 0) { + auto &BS = BlockSizes[MBB.getNumber()-1]; BS.first -= BS.second; Offset -= BS.second; - unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset); + unsigned AlignExtra = GetAlignmentAdjustment(MBB, Offset); BS.first += AlignExtra; BS.second = AlignExtra; @@ -198,7 +192,7 @@ void PPCBSel::modifyAdjustment(MachineFunction &Fn) { Offset += AlignExtra; } - Offset += BlockSizes[MBB->getNumber()].first; + Offset += BlockSizes[MBB.getNumber()].first; } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index b9518d6d7064..b1f5bdd885cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -81,8 +81,7 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { } static bool clobbersCTR(const MachineInstr &MI) { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) return true; @@ -167,18 +166,16 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before // any other instructions that might clobber the ctr register. - for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); - I != IE; ++I) { - MachineBasicBlock *MBB = &*I; - if (!MDT->isReachableFromEntry(MBB)) + for (MachineBasicBlock &MBB : MF) { + if (!MDT->isReachableFromEntry(&MBB)) continue; - for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(), - MIIE = MBB->end(); MII != MIIE; ++MII) { + for (MachineBasicBlock::iterator MII = MBB.getFirstTerminator(), + MIIE = MBB.end(); MII != MIIE; ++MII) { unsigned Opc = MII->getOpcode(); if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || Opc == PPC::BDZ8 || Opc == PPC::BDZ) - if (!verifyCTRBranch(MBB, MII)) + if (!verifyCTRBranch(&MBB, MII)) llvm_unreachable("Invalid PPC CTR loop!"); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index be4c9dd60b00..a9794ddd0566 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -74,8 +74,7 @@ bool PPCExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; TII = static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock &MBB = *I; + for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE;) { MachineInstr &MI = *MBBI; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index fc3c7ec35b8d..3ca563fee970 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -391,9 +391,8 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) - for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) { + for (MachineBasicBlock &MBB : MF) + for (MachineBasicBlock::iterator MBBI = MBB.end(); MBBI != MBB.begin();) { --MBBI; for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { MachineOperand &MO = MBBI->getOperand(I); @@ -1172,8 +1171,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // Describe where callee saved registers were saved, at fixed offsets from // CFA. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just @@ -1204,15 +1203,15 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, continue; } - if (CSI[I].isSpilledToReg()) { - unsigned SpilledReg = CSI[I].getDstReg(); + if (I.isSpilledToReg()) { + unsigned SpilledReg = I.getDstReg(); unsigned CFIRegister = MF.addFrameInst(MCCFIInstruction::createRegister( nullptr, MRI->getDwarfRegNum(Reg, true), MRI->getDwarfRegNum(SpilledReg, true))); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIRegister); } else { - int64_t Offset = MFI.getObjectOffset(CSI[I].getFrameIdx()); + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); // We have changed the object offset above but we do not want to change // the actual offsets in the CFI instruction so we have to undo the // offset change here. @@ -2085,15 +2084,15 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, SmallVector<CalleeSavedInfo, 18> FPRegs; SmallVector<CalleeSavedInfo, 18> VRegs; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() || (Reg != PPC::X2 && Reg != PPC::R2)) && "Not expecting to try to spill R2 in a function that must save TOC"); if (PPC::GPRCRegClass.contains(Reg)) { HasGPSaveArea = true; - GPRegs.push_back(CSI[i]); + GPRegs.push_back(I); if (Reg < MinGPR) { MinGPR = Reg; @@ -2101,7 +2100,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } else if (PPC::G8RCRegClass.contains(Reg)) { HasG8SaveArea = true; - G8Regs.push_back(CSI[i]); + G8Regs.push_back(I); if (Reg < MinG8R) { MinG8R = Reg; @@ -2109,7 +2108,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } else if (PPC::F8RCRegClass.contains(Reg)) { HasFPSaveArea = true; - FPRegs.push_back(CSI[i]); + FPRegs.push_back(I); if (Reg < MinFPR) { MinFPR = Reg; @@ -2123,7 +2122,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, // alignment requirements, so overload the save area for both cases. 
HasVRSaveArea = true; - VRegs.push_back(CSI[i]); + VRegs.push_back(I); if (Reg < MinVR) { MinVR = Reg; @@ -2395,8 +2394,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( } }); - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); // CR2 through CR4 are the nonvolatile CR fields. bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; @@ -2439,11 +2438,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) .addReg(PPC::R12, getKillRegState(true)), - CSI[i].getFrameIdx())); + I.getFrameIdx())); } } else { - if (CSI[i].isSpilledToReg()) { - unsigned Dst = CSI[i].getDstReg(); + if (I.isSpilledToReg()) { + unsigned Dst = I.getDstReg(); if (Spilled[Dst]) continue; @@ -2478,9 +2477,9 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - CSI[i].getFrameIdx(), RC, TRI); + I.getFrameIdx(), RC, TRI); else - TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(), + TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, TRI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 0abdf81d0908..a2664bcff4ab 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -510,14 +510,12 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { return false; // TODO: These asserts should be updated as more support for the toc data - // transformation is added (64 bit, struct support, etc.). + // transformation is added (struct support, etc.). - assert(PointerSize == 4 && "Only 32 Bit Codegen is currently supported by " - "the toc data transformation."); - - assert(PointerSize >= GV->getAlign().valueOrOne().value() && - "GlobalVariables with an alignment requirement stricter then 4-bytes " - "not supported by the toc data transformation."); + assert( + PointerSize >= GV->getAlign().valueOrOne().value() && + "GlobalVariables with an alignment requirement stricter than TOC entry " + "size not supported by the toc data transformation."); Type *GVType = GV->getValueType(); @@ -537,7 +535,7 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { "supported by the toc data transformation."); assert(GVType->getPrimitiveSizeInBits() <= PointerSize * 8 && - "A GlobalVariable with size larger than 32 bits is not currently " + "A GlobalVariable with size larger than a TOC entry is not currently " "supported by the toc data transformation."); if (GV->hasLocalLinkage() || GV->hasPrivateLinkage()) @@ -5049,16 +5047,94 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // value for the comparison. When selecting through a .td file, a type // error is raised. Must check this first so we never break on the // !Subtarget->isISA3_1() check. 
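The hasTocDataAttr changes above drop the 32-bit-only restriction and keep only the requirements that the global's alignment and size fit within a single TOC entry, i.e. the target pointer size. Below is a rough standalone model of those two checks; fitsInTocEntry is an illustrative helper, not the LLVM routine, and the byte/bit arithmetic simply restates the asserts in the hunk.

#include <cassert>
#include <cstdint>

// Sketch: can a global with the given alignment (bytes) and size (bits) be
// accessed directly as TOC data, given the target pointer size?
bool fitsInTocEntry(uint64_t AlignInBytes, uint64_t SizeInBits,
                    unsigned PointerSizeInBytes) {
  // Alignment must not be stricter than the TOC entry size.
  if (AlignInBytes > PointerSizeInBytes)
    return false;
  // The value itself must fit in one TOC entry (PointerSize * 8 bits).
  return SizeInBits <= PointerSizeInBytes * 8ull;
}

int main() {
  assert(fitsInTocEntry(4, 32, 4));    // 32-bit global, 32-bit TOC entry: ok
  assert(fitsInTocEntry(8, 64, 8));    // 64-bit global, 64-bit TOC entry: ok
  assert(!fitsInTocEntry(16, 64, 8));  // over-aligned global: rejected
  assert(!fitsInTocEntry(4, 64, 4));   // too wide for a 32-bit entry: rejected
  return 0;
}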
- if (N->getConstantOperandVal(0) == Intrinsic::ppc_fsels) { + auto IntID = N->getConstantOperandVal(0); + if (IntID == Intrinsic::ppc_fsels) { SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3)}; CurDAG->SelectNodeTo(N, PPC::FSELS, MVT::f32, Ops); return; } + if (IntID == Intrinsic::ppc_bcdadd_p || IntID == Intrinsic::ppc_bcdsub_p) { + auto Pred = N->getConstantOperandVal(1); + unsigned Opcode = + IntID == Intrinsic::ppc_bcdadd_p ? PPC::BCDADD_rec : PPC::BCDSUB_rec; + unsigned SubReg = 0; + unsigned ShiftVal = 0; + bool Reverse = false; + switch (Pred) { + case 0: + SubReg = PPC::sub_eq; + ShiftVal = 1; + break; + case 1: + SubReg = PPC::sub_eq; + ShiftVal = 1; + Reverse = true; + break; + case 2: + SubReg = PPC::sub_lt; + ShiftVal = 3; + break; + case 3: + SubReg = PPC::sub_lt; + ShiftVal = 3; + Reverse = true; + break; + case 4: + SubReg = PPC::sub_gt; + ShiftVal = 2; + break; + case 5: + SubReg = PPC::sub_gt; + ShiftVal = 2; + Reverse = true; + break; + case 6: + SubReg = PPC::sub_un; + break; + case 7: + SubReg = PPC::sub_un; + Reverse = true; + break; + } + + EVT VTs[] = {MVT::v16i8, MVT::Glue}; + SDValue Ops[] = {N->getOperand(2), N->getOperand(3), + CurDAG->getTargetConstant(0, dl, MVT::i32)}; + SDValue BCDOp = SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, Ops), 0); + SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32); + // On Power10, we can use SETBC[R]. On prior architectures, we have to use + // MFOCRF and shift/negate the value. + if (Subtarget->isISA3_1()) { + SDValue SubRegIdx = CurDAG->getTargetConstant(SubReg, dl, MVT::i32); + SDValue CRBit = SDValue( + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1, + CR6Reg, SubRegIdx, BCDOp.getValue(1)), + 0); + CurDAG->SelectNodeTo(N, Reverse ? PPC::SETBCR : PPC::SETBC, MVT::i32, + CRBit); + } else { + SDValue Move = + SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR6Reg, + BCDOp.getValue(1)), + 0); + SDValue Ops[] = {Move, getI32Imm((32 - (4 + ShiftVal)) & 31, dl), + getI32Imm(31, dl), getI32Imm(31, dl)}; + if (!Reverse) + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + else { + SDValue Shift = SDValue( + CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Shift, getI32Imm(1, dl)); + } + } + return; + } + if (!Subtarget->isISA3_1()) break; unsigned Opcode = 0; - switch (N->getConstantOperandVal(0)) { + switch (IntID) { default: break; case Intrinsic::ppc_altivec_vstribr_p: @@ -5713,41 +5789,57 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (isAIXABI && CModel == CodeModel::Medium) report_fatal_error("Medium code model is not supported on AIX."); - // For 64-bit small code model, we allow SelectCodeCommon to handle this, - // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. - if (isPPC64 && CModel == CodeModel::Small) + // For 64-bit ELF small code model, we allow SelectCodeCommon to handle + // this, selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. For AIX + // small code model, we need to check for a toc-data attribute. + if (isPPC64 && !isAIXABI && CModel == CodeModel::Small) break; + auto replaceWith = [this, &dl](unsigned OpCode, SDNode *TocEntry, + EVT OperandTy) { + SDValue GA = TocEntry->getOperand(0); + SDValue TocBase = TocEntry->getOperand(1); + SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); + transferMemOperands(TocEntry, MN); + ReplaceNode(TocEntry, MN); + }; + // Handle 32-bit small code model. 
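The bcdadd_p/bcdsub_p switch above maps each predicate value 0..7 to a CR6 field bit, a shift amount used on the MFOCRF path for pre-Power10 cores, and a flag saying whether the bit must be inverted. A table-driven restatement of just that mapping, outside of LLVM, is sketched below; the CR6Bit enum is an illustrative stand-in for the PPC sub-register indices (sub_lt/sub_gt/sub_eq/sub_un), and no claim is made here about the BCD predicate semantics beyond what the switch encodes.

#include <cstdio>

enum CR6Bit { LT, GT, EQ, UN };  // stand-ins for sub_lt / sub_gt / sub_eq / sub_un

struct PredInfo {
  CR6Bit Bit;        // which CR6 bit carries the answer
  unsigned Shift;    // extra shift used on the MFOCRF path (pre-Power10)
  bool Reverse;      // invert the result (the odd-numbered predicates)
};

// Index = predicate operand of bcdadd_p / bcdsub_p (0..7), as in the hunk above.
static const PredInfo Table[8] = {
    {EQ, 1, false}, {EQ, 1, true},  // predicates 0/1 test the EQ bit
    {LT, 3, false}, {LT, 3, true},  // predicates 2/3 test the LT bit
    {GT, 2, false}, {GT, 2, true},  // predicates 4/5 test the GT bit
    {UN, 0, false}, {UN, 0, true},  // predicates 6/7 test the UN bit
};

int main() {
  for (unsigned Pred = 0; Pred < 8; ++Pred)
    std::printf("pred %u -> bit %d, shift %u, reverse %d\n", Pred,
                Table[Pred].Bit, Table[Pred].Shift, Table[Pred].Reverse);
}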
- if (!isPPC64) { + if (!isPPC64 && CModel == CodeModel::Small) { // Transforms the ISD::TOC_ENTRY node to passed in Opcode, either // PPC::ADDItoc, or PPC::LWZtoc - auto replaceWith = [this, &dl](unsigned OpCode, SDNode *TocEntry) { - SDValue GA = TocEntry->getOperand(0); - SDValue TocBase = TocEntry->getOperand(1); - SDNode *MN = CurDAG->getMachineNode(OpCode, dl, MVT::i32, GA, TocBase); - transferMemOperands(TocEntry, MN); - ReplaceNode(TocEntry, MN); - }; - if (isELFABI) { assert(TM.isPositionIndependent() && "32-bit ELF can only have TOC entries in position independent" " code."); // 32-bit ELF always uses a small code model toc access. - replaceWith(PPC::LWZtoc, N); + replaceWith(PPC::LWZtoc, N, MVT::i32); return; } - if (isAIXABI && CModel == CodeModel::Small) { - if (hasTocDataAttr(N->getOperand(0), - CurDAG->getDataLayout().getPointerSize())) - replaceWith(PPC::ADDItoc, N); - else - replaceWith(PPC::LWZtoc, N); + assert(isAIXABI && "ELF ABI already handled"); + if (hasTocDataAttr(N->getOperand(0), + CurDAG->getDataLayout().getPointerSize())) { + replaceWith(PPC::ADDItoc, N, MVT::i32); return; } + + replaceWith(PPC::LWZtoc, N, MVT::i32); + return; + } + + if (isPPC64 && CModel == CodeModel::Small) { + assert(isAIXABI && "ELF ABI handled in common SelectCode"); + + if (hasTocDataAttr(N->getOperand(0), + CurDAG->getDataLayout().getPointerSize())) { + replaceWith(PPC::ADDItoc8, N, MVT::i64); + return; + } + // Break if it doesn't have toc data attribute. Proceed with common + // SelectCode. + break; } assert(CModel != CodeModel::Small && "All small code models handled."); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ac952b240a48..ec7e30d7e362 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12116,6 +12116,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); + MachineRegisterInfo &MRI = F->getRegInfo(); if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || @@ -12721,7 +12722,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Register OldFPSCRReg = MI.getOperand(0).getReg(); // Save FPSCR value. - BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); + if (MRI.use_empty(OldFPSCRReg)) + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg); + else + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg); // The floating point rounding mode is in the bits 62:63 of FPCSR, and has // the following settings: @@ -12854,7 +12858,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Result of setflm is previous FPSCR content, so we need to save it first. Register OldFPSCRReg = MI.getOperand(0).getReg(); - BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg); + if (MRI.use_empty(OldFPSCRReg)) + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg); + else + BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg); // Put bits in 32:63 to FPSCR. 
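The FPSCR hunks above only issue MFFS when the old value is actually used. As a side note on the comment that the rounding mode lives in bits 62:63 of the FPSCR (the two least-significant bits in conventional ordering), here is a tiny standalone helper that extracts that field from a 64-bit FPSCR image; the enum labels follow the usual PowerPC RN encoding and are included as background, not taken from this patch.

#include <cstdint>
#include <cstdio>

// The rounding-mode (RN) field occupies the two least-significant bits of the
// 64-bit FPSCR image, i.e. IBM bit positions 62:63.
enum RoundingMode : uint64_t {
  RoundToNearest    = 0, // 0b00
  RoundTowardZero   = 1, // 0b01
  RoundTowardPosInf = 2, // 0b10
  RoundTowardNegInf = 3  // 0b11
};

static RoundingMode getRoundingMode(uint64_t FPSCR) {
  return static_cast<RoundingMode>(FPSCR & 0x3);
}

int main() {
  // Arbitrary status bits plus RN = 0b01.
  uint64_t FPSCR = 0x82004000ull | RoundTowardZero;
  std::printf("RN field = %llu\n",
              (unsigned long long)getRoundingMode(FPSCR));
}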
Register NewFPSCRReg = MI.getOperand(1).getReg(); @@ -15966,8 +15973,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } break; case 'v': - if (Subtarget.hasAltivec()) + if (Subtarget.hasAltivec() && VT.isVector()) return std::make_pair(0U, &PPC::VRRCRegClass); + else if (Subtarget.hasVSX()) + // Scalars in Altivec registers only make sense with VSX. + return std::make_pair(0U, &PPC::VFRCRegClass); break; case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); @@ -17664,6 +17674,24 @@ PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp, return Mode; } +bool PPCTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { + EVT ValVT = Val.getValueType(); + // If we are splitting a scalar integer into f64 parts (i.e. so they + // can be placed into VFRC registers), we need to zero extend and + // bitcast the values. This will ensure the value is placed into a + // VSR using direct moves or stack operations as needed. + if (PartVT == MVT::f64 && + (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) { + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Val); + Parts[0] = Val; + return true; + } + return false; +} + // If we happen to match to an aligned D-Form, check if the Frame Index is // adequately aligned. If it is not, reset the mode to match to X-Form. static void setXFormForUnalignedFI(SDValue N, unsigned Flags, diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h index 34dce2c3172d..87b7f96112ec 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1139,6 +1139,10 @@ namespace llvm { PPC::AddrMode SelectForceXFormMode(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG) const; + bool + splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, + SDValue *Parts, unsigned NumParts, MVT PartVT, + Optional<CallingConv::ID> CC) const override; /// Structure that collects some common arguments that get passed around /// between the functions for call lowering. 
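splitValueIntoRegisterParts above widens a small integer to i64 and bitcasts it to f64 so the value can be placed in a VSR. The standalone snippet below mimics that at the value level, with memcpy standing in for the bitcast; it only demonstrates that the integer bits land unchanged in the low half of the double's representation, and is not the SelectionDAG code.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Model of "zero-extend to i64, then bitcast to f64".
static double toF64Part(uint32_t V) {
  uint64_t Wide = V;                  // ISD::ZERO_EXTEND i32 -> i64
  double D;
  static_assert(sizeof(D) == sizeof(Wide), "f64 and i64 must match in size");
  std::memcpy(&D, &Wide, sizeof(D));  // ISD::BITCAST i64 -> f64
  return D;
}

int main() {
  double Part = toF64Part(0x12345678u);
  uint64_t Bits;
  std::memcpy(&Bits, &Part, sizeof(Bits));
  std::printf("bits = 0x%016llx\n", (unsigned long long)Bits); // 0x0000000012345678
}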
struct CallFlags { diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 417a6ce7e522..58af8037f59c 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -773,6 +773,11 @@ def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm), "addis $rD, $rA, $imm", IIC_IntSimple, [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>; +def LA8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$sym), + "la $rD, $sym($rA)", IIC_IntGeneral, + [(set i64:$rD, (add i64:$rA, + (PPClo tglobaladdr:$sym, 0)))]>; + let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), "subfic $rD, $rA, $imm", IIC_IntGeneral, @@ -1435,6 +1440,13 @@ def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentr def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItocL", []>, isPPC64; } + +// Local Data Transform +def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), + "#ADDItoc8", + [(set i64:$rD, + (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; + let mayLoad = 1 in def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 1e0e2d88e54b..fe21a164dfab 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1161,6 +1161,22 @@ def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot v4i32:$vB)))), } // end HasAltivec +// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set. +class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern> + : VX_RD5_RSp5_PS1_XO9<xo, + (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS), + !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> { + let Defs = [CR6]; +} + +// [PO VRT VRA VRB 1 / XO] +class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern> + : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> { + let Defs = [CR6]; + let PS = 0; +} + def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">; def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">; let Predicates = [HasP8Altivec] in { @@ -1351,6 +1367,13 @@ def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw, v2i64, v4i32>; def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw, v2i64, v4i32>; +def BCDADD_rec : VX_VT5_VA5_VB5_PS1_XO9_o<1, "bcdadd." , []>; +def BCDSUB_rec : VX_VT5_VA5_VB5_PS1_XO9_o<65, "bcdsub." , []>; + +def : Pat<(v16i8 (int_ppc_bcdadd v16i8:$vA, v16i8:$vB, timm:$PS)), + (BCDADD_rec $vA, $vB, $PS)>; +def : Pat<(v16i8 (int_ppc_bcdsub v16i8:$vA, v16i8:$vB, timm:$PS)), + (BCDSUB_rec $vA, $vB, $PS)>; // Shuffle patterns for unary and swapped (LE) vector pack modulo. def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef), @@ -1598,22 +1621,6 @@ def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>; def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>; -// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set. 
-class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern> - : VX_RD5_RSp5_PS1_XO9<xo, - (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS), - !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> { - let Defs = [CR6]; -} - -// [PO VRT VRA VRB 1 / XO] -class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern> - : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> { - let Defs = [CR6]; - let PS = 0; -} - // Decimal Shift/Unsigned-Shift/Shift-and-Round def BCDS_rec : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>; def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o <129, "bcdus.", []>; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 649a150866b4..a0fd2111de11 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2138,9 +2138,8 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } static bool MBBDefinesCTR(MachineBasicBlock &MBB) { - for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); - I != IE; ++I) - if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8)) + for (MachineInstr &MI : MBB) + if (MI.definesRegister(PPC::CTR) || MI.definesRegister(PPC::CTR8)) return true; return false; } @@ -2331,8 +2330,7 @@ bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI, &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass }; bool Found = false; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) { const TargetRegisterClass *RC = RCs[c]; if (MO.isReg()) { diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td index d2d5ca92ca1c..d92a10c5b208 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2471,6 +2471,7 @@ def DblwdCmp { // [HasVSX, HasP8Vector, IsLittleEndian] // [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] +// [HasVSX, HasP8Altivec] // [HasVSX, HasDirectMove] // [HasVSX, HasDirectMove, IsBigEndian] // [HasVSX, HasDirectMove, IsLittleEndian] @@ -2500,6 +2501,10 @@ let Predicates = [HasVSX, IsBigEndian, HasP8Altivec] in def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a, v16i8:$b, v16i8:$c)), (v16i8 (VPERMXOR $a, $b, $c))>; +let Predicates = [HasVSX, HasP8Altivec] in + def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor_be v16i8:$a, + v16i8:$b, v16i8:$c)), + (v16i8 (VPERMXOR $a, $b, $c))>; let AddedComplexity = 400 in { // Valid for any VSX subtarget, regardless of endianness. 
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 5cc180d770b2..22c5b6c11289 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -152,9 +152,9 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP) { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP)) + if (LowerPPCMachineOperandToMCOperand(MO, MCOp, AP)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp index bdff5109c1e1..9d5206f8fd43 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -149,6 +149,79 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd, case FusionFeature::FK_SldiAdd: return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) || (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57)); + + // rldicl rx, ra, 1, 0 - xor + case FusionFeature::FK_RotateLeftXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0); + + // rldicr rx, ra, 1, 63 - xor + case FusionFeature::FK_RotateRightXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63); + + // We actually use CMPW* and CMPD*, 'l' doesn't exist as an operand in instr. + + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp1: + // { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } + // { ld,ldx } - cmpli 0,1,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp2: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + if (SecondMI.getOpcode() == PPC::CMPDI && + matchingImmOps(SecondMI, 2, -1, 16)) + return true; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1); + } + + // { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } + case FusionFeature::FK_LoadCmp3: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) || + matchingImmOps(SecondMI, 2, -1, 16); + } + + // mtctr - { bcctr,bcctrl } + case FusionFeature::FK_ZeroMoveCTR: + // ( mtctr rx ) is alias of ( mtspr 9, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 9); + + // mtlr - { bclr,bclrl } + case FusionFeature::FK_ZeroMoveLR: + // ( mtlr rx ) is alias of ( mtspr 8, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 8); + + // addis rx,ra,si - addi rt,rx,SI, SI >= 0 + case FusionFeature::FK_AddisAddi: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = SecondMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + return SignExtend64(SI.getImm(), 16) >= 0; + } + + // addi 
rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 + case FusionFeature::FK_AddiAddis: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = FirstMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + int64_t ExtendedSI = SignExtend64(SI.getImm(), 16); + return ExtendedSI >= 2; + } } llvm_unreachable("All the cases should have been handled"); diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def index 469a24800423..e4954b722fd0 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -78,5 +78,80 @@ FUSION_FEATURE(VecLogical, hasLogicalFusion, -1, FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8)) +// rldicl rx, ra, 1, 0 - xor +FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64), + FUSION_OP_SET(XOR, XOR8)) + +// rldicr rx, ra, 1, 63 - xor +FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8)) + +// There're two special cases in 'load-compare' series, so we have to split +// them into several pattern groups to fit into current framework. This can +// be clearer once we switched to a more expressive approach. + +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1, + FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_, + LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS, + LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8, + LWZXTLS, LWZXTLS_, LWZXTLS_32), + FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI)) + +// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } +// { ld,ldx } - cmpli 0,1,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1, + FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_), + FUSION_OP_SET(CMPDI, CMPLDI)) + +// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } +FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1, + FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX, + LWAX_32), + FUSION_OP_SET(CMPLDI, CMPLWI)) + +// ori - oris +FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8), + FUSION_OP_SET(ORIS, ORIS8)) + +// lis - ori +FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8), + FUSION_OP_SET(ORI, ORI8)) + +// oris - ori +FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8), + FUSION_OP_SET(ORI, ORI8)) + +// xori - xoris +FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8), + FUSION_OP_SET(XORIS, XORIS8)) + +// xoris - xori +FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), + FUSION_OP_SET(XORI, XORI8)) + +// addis rx,ra,si - addi rt,rx,SI, SI >= 0 +FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL)) + +// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 +FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, + FUSION_OP_SET(ADDI, ADDI8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + +// mtctr - { bcctr,bcctrl } +FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR), + FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn, + BCCTRL8, BCCTRL8n, gBCCTR, gBCCTRL)) + +// mtlr - { bclr,bclrl } 
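The addis/addi fusion checks above compare the 16-bit displacement only after sign-extending it, so for example 0xFFFF is treated as -1 and does not fuse; they also require that the base register is not the zero register, which is omitted here. The standalone restatement below uses a local signExtend16 helper in place of LLVM's SignExtend64.

#include <cassert>
#include <cstdint>

// Sign-extend the low 16 bits of an immediate, like SignExtend64(Imm, 16).
static int64_t signExtend16(uint64_t Imm) {
  return static_cast<int64_t>(static_cast<int16_t>(Imm & 0xFFFF));
}

// addis rx,ra,si - addi rt,rx,SI fuses only when the addi's SI >= 0.
static bool addisAddiFuses(uint64_t SecondSI) {
  return signExtend16(SecondSI) >= 0;
}

// addi rx,ra,si - addis rt,rx,SI fuses only when the addi's si >= 2.
static bool addiAddisFuses(uint64_t FirstSI) {
  return signExtend16(FirstSI) >= 2;
}

int main() {
  assert(addisAddiFuses(0x0010));   // small positive displacement: fuses
  assert(!addisAddiFuses(0xFFFF));  // sign-extends to -1: does not fuse
  assert(addiAddisFuses(2));
  assert(!addiAddisFuses(1));
  return 0;
}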
+FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), + FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index dfc29dbb10f1..1258a1281597 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -131,6 +131,10 @@ void PPCSubtarget::initializeEnvironment() { HasAddLogicalFusion = false; HasLogicalAddFusion = false; HasLogicalFusion = false; + HasSha3Fusion = false; + HasCompareFusion = false; + HasWideImmFusion = false; + HasZeroMoveFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h index 783ea121ccb8..d52833cb1465 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -151,6 +151,10 @@ protected: bool HasAddLogicalFusion; bool HasLogicalAddFusion; bool HasLogicalFusion; + bool HasSha3Fusion; + bool HasCompareFusion; + bool HasWideImmFusion; + bool HasZeroMoveFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -340,6 +344,10 @@ public: bool hasAddLogicalFusion() const { return HasAddLogicalFusion; } bool hasLogicalAddFusion() const { return HasLogicalAddFusion; } bool hasLogicalFusion() const { return HasLogicalFusion; } + bool hasCompareFusion() const { return HasCompareFusion; } + bool hasWideImmFusion() const { return HasWideImmFusion; } + bool hasSha3Fusion() const { return HasSha3Fusion; } + bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 77d5a2668b60..5d6f58a77a39 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -318,9 +318,20 @@ InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind); } +// Check if the current Type is an MMA vector type. Valid MMA types are +// v256i1 and v512i1 respectively. +static bool isMMAType(Type *Ty) { + return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) && + (Ty->getPrimitiveSizeInBits() > 128); +} + InstructionCost PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { + // Set the max cost if an MMA type is present (v256i1, v512i1). + if (isMMAType(U->getType())) + return InstructionCost::getMax(); + // We already implement getCastInstrCost and getMemoryOpCost where we perform // the vector adjustment there. if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U)) @@ -942,32 +953,39 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -// Adjust the cost of vector instructions on targets which there is overlap -// between the vector and scalar units, thereby reducing the overall throughput -// of vector code wrt. scalar code. 
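isMMAType above recognizes the MMA-related types named in the hunk (v256i1 and v512i1) as vectors of i1 wider than 128 bits, and getUserCost then returns the maximum cost so the optimizer is discouraged from forming such values in IR. A rough standalone model of that predicate follows; VecTy is an invented struct used purely to make the check concrete.

#include <cassert>

// Sketch of the MMA-type test: a vector of 1-bit elements whose total width
// exceeds 128 bits, i.e. v256i1 or v512i1.
struct VecTy {
  unsigned NumElements;
  unsigned ScalarSizeInBits;
  unsigned totalBits() const { return NumElements * ScalarSizeInBits; }
};

static bool isMMALikeType(const VecTy &Ty) {
  return Ty.ScalarSizeInBits == 1 && Ty.totalBits() > 128;
}

int main() {
  assert(isMMALikeType({512, 1}));   // v512i1: one of the MMA types above
  assert(isMMALikeType({256, 1}));   // v256i1: the other MMA type above
  assert(!isMMALikeType({128, 1}));  // v128i1: not wider than 128 bits
  assert(!isMMALikeType({4, 32}));   // v4i32: ordinary vector
  return 0;
}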
-InstructionCost PPCTTIImpl::vectorCostAdjustment(InstructionCost Cost, - unsigned Opcode, Type *Ty1, - Type *Ty2) { +// Returns a cost adjustment factor to adjust the cost of vector instructions +// on targets which there is overlap between the vector and scalar units, +// thereby reducing the overall throughput of vector code wrt. scalar code. +// An invalid instruction cost is returned if the type is an MMA vector type. +InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode, + Type *Ty1, Type *Ty2) { + // If the vector type is of an MMA type (v256i1, v512i1), an invalid + // instruction cost is returned. This is to signify to other cost computing + // functions to return the maximum instruction cost in order to prevent any + // opportunities for the optimizer to produce MMA types within the IR. + if (isMMAType(Ty1)) + return InstructionCost::getInvalid(); + if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy()) - return Cost; + return InstructionCost(1); std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1); // If type legalization involves splitting the vector, we don't want to // double the cost at every step - only the last step. if (LT1.first != 1 || !LT1.second.isVector()) - return Cost; + return InstructionCost(1); int ISD = TLI->InstructionOpcodeToISD(Opcode); if (TLI->isOperationExpand(ISD, LT1.second)) - return Cost; + return InstructionCost(1); if (Ty2) { std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2); if (LT2.first != 1 || !LT2.second.isVector()) - return Cost; + return InstructionCost(1); } - return Cost * 2; + return InstructionCost(2); } InstructionCost PPCTTIImpl::getArithmeticInstrCost( @@ -977,6 +995,11 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + // TODO: Handle more cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -986,12 +1009,18 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( // Fallback to the default implementation. InstructionCost Cost = BaseT::getArithmeticInstrCost( Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); - return vectorCostAdjustment(Cost, Opcode, Ty, nullptr); + return Cost * CostFactor; } InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef<int> Mask, int Index, Type *SubTp) { + + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -1000,8 +1029,7 @@ InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the // structured types of shuffles covered by TTI::ShuffleKind). 
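With the refactoring above, vectorCostAdjustmentFactor returns a multiplicative factor rather than an adjusted cost: 1 in the common case, 2 when the target's vector and scalar units overlap, and an invalid cost for MMA types, which the callers turn into the maximum cost. The compact standalone model below shows how the callers combine such a factor with a base cost, using std::optional as a stand-in for InstructionCost's valid/invalid states.

#include <cassert>
#include <limits>
#include <optional>

// nullopt models InstructionCost::getInvalid(); a value models a valid factor.
using CostFactor = std::optional<unsigned>;

static unsigned applyFactor(unsigned BaseCost, CostFactor Factor) {
  if (!Factor)                                   // invalid: MMA type involved
    return std::numeric_limits<unsigned>::max(); // like InstructionCost::getMax()
  return BaseCost * *Factor;                     // like "Cost * CostFactor"
}

int main() {
  assert(applyFactor(4, 1u) == 4);   // normal target: cost unchanged
  assert(applyFactor(4, 2u) == 8);   // overlapping vector/scalar units: doubled
  assert(applyFactor(4, std::nullopt) ==
         std::numeric_limits<unsigned>::max());  // MMA type: maximal cost
  return 0;
}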
- return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp, - nullptr); + return LT.first * CostFactor; } InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode, @@ -1020,9 +1048,13 @@ InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); - Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src); + Cost *= CostFactor; // TODO: Allow non-throughput costs that aren't binary. if (CostKind != TTI::TCK_RecipThroughput) return Cost == 0 ? 0 : 1; @@ -1034,12 +1066,17 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, ValTy, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return Cost; - return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr); + return Cost * CostFactor; } InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, @@ -1049,8 +1086,12 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index); - Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr); + Cost *= CostFactor; if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { // Double-precision scalars are already located in index #0 (or #1 if LE). @@ -1065,7 +1106,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (ISD == ISD::INSERT_VECTOR_ELT) // A move-to VSR and a permute/insert. Assume vector operation cost // for both (cost will be 2x on P9). - return vectorCostAdjustment(2, Opcode, Val, nullptr); + return 2 * CostFactor; // It's an extract. Maybe we can do a cheap move-from VSR. unsigned EltSize = Val->getScalarSizeInBits(); @@ -1082,7 +1123,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, // We need a vector extract (or mfvsrld). Assume vector operation cost. // The cost of the load constant for a vector extract is disregarded // (invariant, easily schedulable). - return vectorCostAdjustment(1, Opcode, Val, nullptr); + return CostFactor; } else if (ST->hasDirectMove()) // Assume permute has standard cost. 
@@ -1114,6 +1155,11 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I) { + + InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + if (TLI->getValueType(DL, Src, true) == MVT::Other) return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); @@ -1128,7 +1174,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, if (CostKind != TTI::TCK_RecipThroughput) return Cost; - Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr); + Cost *= CostFactor; bool IsAltivecType = ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || @@ -1194,6 +1240,11 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, VecTy, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + if (UseMaskForCond || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind, diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index aa84013803af..7aeb0c59d503 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -100,8 +100,8 @@ public: unsigned getCacheLineSize() const override; unsigned getPrefetchDistance() const override; unsigned getMaxInterleaveFactor(unsigned VF); - InstructionCost vectorCostAdjustment(InstructionCost Cost, unsigned Opcode, - Type *Ty1, Type *Ty2); + InstructionCost vectorCostAdjustmentFactor(unsigned Opcode, Type *Ty1, + Type *Ty2); InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index d1979b5456ce..f1c3810f4ee5 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -170,6 +170,14 @@ void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo, void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); + // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, + // or non-zero bits 8/9/10. + if (RISCVVType::getVLMUL(Imm) == RISCVII::VLMUL::LMUL_RESERVED || + RISCVVType::getSEW(Imm) > 64 || (Imm & 0x700) != 0) { + O << Imm; + return; + } + // Print the text form. 
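The printVTypeI change above falls back to printing the raw immediate whenever the vtype encoding is reserved. Here is a standalone decoder for the fields it inspects, following the bit layout implied by the hunk (vlmul in bits 2:0, vsew in bits 5:3, bits 10:8 reserved); the SEW computation 8 << vsew is the standard RVV rule and is stated here as background rather than taken from the diff.

#include <cassert>
#include <cstdint>

struct VType {
  unsigned VLMul;  // bits 2:0
  unsigned VSEW;   // bits 5:3
  unsigned SEW;    // 8 << VSEW
  bool Reserved;   // true -> print the raw immediate instead of text
};

static VType decodeVType(uint64_t Imm) {
  VType V;
  V.VLMul = Imm & 0x7;
  V.VSEW = (Imm >> 3) & 0x7;
  V.SEW = 8u << V.VSEW;
  // Reserved when vlmul == 4, vsew has its top bit set (SEW > 64), or any of
  // bits 8/9/10 is nonzero -- the three conditions checked in the hunk above.
  V.Reserved = (V.VLMul == 4) || (V.VSEW & 0x4) || ((Imm & 0x700) != 0);
  return V;
}

int main() {
  assert(!decodeVType(0x10).Reserved && decodeVType(0x10).SEW == 32); // e32, m1
  assert(decodeVType(0x04).Reserved);   // vlmul = 4: reserved encoding
  assert(decodeVType(0x20).Reserved);   // vsew = 0b100: SEW > 64
  assert(decodeVType(0x100).Reserved);  // reserved bit 8 set
  return 0;
}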
RISCVVType::printVType(Imm, O); } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 595c3cdfbb1d..f5d491938050 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -237,7 +237,13 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF); + // If we do not reserve stack space for outgoing arguments in prologue, + // we will adjust the stack pointer before call instruction. After the + // adjustment, we can not use SP to access the stack objects for the + // arguments. Instead, use BP to access these stack objects. + return (MFI.hasVarSizedObjects() || + (!hasReservedCallFrame(MF) && MFI.getMaxCallFrameSize() != 0)) && + TRI->hasStackRealignment(MF); } // Determines the size of the frame and maximum call frame size. @@ -1065,10 +1071,14 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); - // Manually restore values not restored by libcall. Insert in reverse order. + // Manually restore values not restored by libcall. + // Keep the same order as in the prologue. There is no need to reverse the + // order in the epilogue. In addition, the return address will be restored + // first in the epilogue. It increases the opportunity to avoid the + // load-to-use data hazard between loading RA and return by RA. // loadRegFromStackSlot can insert multiple instructions. const auto &NonLibcallCSI = getNonLibcallCSI(*MF, CSI); - for (auto &CS : reverse(NonLibcallCSI)) { + for (auto &CS : NonLibcallCSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0f1a6e5f9154..f3331571fc55 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -335,17 +335,29 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Expand); - for (auto Op : FPOpToExpand) - setOperationAction(Op, MVT::f16, Expand); setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + 
setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + + // We need to custom promote this. + if (Subtarget.is64Bit()) + setOperationAction(ISD::FPOWI, MVT::i32, Custom); } if (Subtarget.hasStdExtF()) { @@ -676,6 +688,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); @@ -924,6 +940,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + for (auto CC : VFPCCToExpand) setCondCodeAction(CC, VT, Expand); @@ -1165,6 +1185,10 @@ bool RISCVTargetLowering::shouldSinkOperands( case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: return Operand == 1; case Instruction::Call: if (auto *II = dyn_cast<IntrinsicInst>(I)) { @@ -1631,6 +1655,66 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) { return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); } +// Expand vector FTRUNC, FCEIL, and FFLOOR by converting to the integer domain +// and back. Taking care to avoid converting values that are nan or already +// correct. +// TODO: Floor and ceil could be shorter by changing rounding mode, but we don't +// have FRM dependencies modeled yet. +static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert(VT.isVector() && "Unexpected type"); + + SDLoc DL(Op); + + // Freeze the source since we are increasing the number of uses. + SDValue Src = DAG.getNode(ISD::FREEZE, DL, VT, Op.getOperand(0)); + + // Truncate to integer and convert back to FP. + MVT IntVT = VT.changeVectorElementTypeToInteger(); + SDValue Truncated = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, Src); + Truncated = DAG.getNode(ISD::SINT_TO_FP, DL, VT, Truncated); + + MVT SetccVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); + + if (Op.getOpcode() == ISD::FCEIL) { + // If the truncated value is the greater than or equal to the original + // value, we've computed the ceil. Otherwise, we went the wrong way and + // need to increase by 1. + // FIXME: This should use a masked operation. Handle here or in isel? 
+ SDValue Adjust = DAG.getNode(ISD::FADD, DL, VT, Truncated, + DAG.getConstantFP(1.0, DL, VT)); + SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOLT); + Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated); + } else if (Op.getOpcode() == ISD::FFLOOR) { + // If the truncated value is the less than or equal to the original value, + // we've computed the floor. Otherwise, we went the wrong way and need to + // decrease by 1. + // FIXME: This should use a masked operation. Handle here or in isel? + SDValue Adjust = DAG.getNode(ISD::FSUB, DL, VT, Truncated, + DAG.getConstantFP(1.0, DL, VT)); + SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOGT); + Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated); + } + + // Restore the original sign so that -0.0 is preserved. + Truncated = DAG.getNode(ISD::FCOPYSIGN, DL, VT, Truncated, Src); + + // Determine the largest integer that can be represented exactly. This and + // values larger than it don't have any fractional bits so don't need to + // be converted. + const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); + unsigned Precision = APFloat::semanticsPrecision(FltSem); + APFloat MaxVal = APFloat(FltSem); + MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1), + /*IsSigned*/ false, APFloat::rmNearestTiesToEven); + SDValue MaxValNode = DAG.getConstantFP(MaxVal, DL, VT); + + // If abs(Src) was larger than MaxVal or nan, keep it. + SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, Src); + SDValue Setcc = DAG.getSetCC(DL, SetccVT, Abs, MaxValNode, ISD::SETOLT); + return DAG.getSelect(DL, VT, Setcc, Truncated, Src); +} + static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -2670,6 +2754,20 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, DAG.getConstant(3, DL, VT)); return DAG.getNode(ISD::MUL, DL, VT, VScale, Op.getOperand(0)); } + case ISD::FPOWI: { + // Custom promote f16 powi with illegal i32 integer type on RV64. Once + // promoted this will be legalized into a libcall by LegalizeIntegerTypes. + if (Op.getValueType() == MVT::f16 && Subtarget.is64Bit() && + Op.getOperand(1).getValueType() == MVT::i32) { + SDLoc DL(Op); + SDValue Op0 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0)); + SDValue Powi = + DAG.getNode(ISD::FPOWI, DL, MVT::f32, Op0, Op.getOperand(1)); + return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Powi, + DAG.getIntPtrConstant(0, DL)); + } + return SDValue(); + } case ISD::FP_EXTEND: { // RVV can only do fp_extend to types double the size as the source. 
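The lowerFTRUNC_FCEIL_FFLOOR routine above rounds by converting to the integer domain and back, adjusts by 1.0 when truncation went the wrong way for ceil or floor, restores the original sign, and keeps the source value when it is NaN or too large to have any fractional bits. A scalar C++ model of the same idea for the ceil case is sketched below; the real code operates on whole vectors with setcc/select, and 2^23 is the f32 analogue of the "largest exactly representable integer" bound it computes.

#include <cassert>
#include <cmath>
#include <cstdint>

// Scalar sketch of the convert-truncate-adjust lowering for ceil.
static float ceilViaTrunc(float Src) {
  // Values with |x| >= 2^23 (and NaNs) have no fractional bits; the lowering
  // keeps them unchanged via the final select.
  const float MaxExact = 8388608.0f; // 2^23 for f32
  if (!(std::fabs(Src) < MaxExact))
    return Src;
  float Truncated = (float)(int64_t)Src;  // FP_TO_SINT then SINT_TO_FP
  if (Truncated < Src)                    // landed below the input: bump by 1.0
    Truncated += 1.0f;
  return std::copysign(Truncated, Src);   // preserve -0.0 and the sign
}

int main() {
  assert(ceilViaTrunc(1.25f) == 2.0f);
  assert(ceilViaTrunc(-1.25f) == -1.0f);
  assert(ceilViaTrunc(3.0f) == 3.0f);
  assert(std::signbit(ceilViaTrunc(-0.25f))); // result is -0.0
  return 0;
}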
We // custom-lower f16->f64 extensions to two hops of ISD::FP_EXTEND, going @@ -2858,6 +2956,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return lowerFP_TO_INT_SAT(Op, DAG); + case ISD::FTRUNC: + case ISD::FCEIL: + case ISD::FFLOOR: + return lowerFTRUNC_FCEIL_FFLOOR(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_SMAX: @@ -9834,6 +9936,23 @@ bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { return false; } +bool RISCVTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, + EVT VT) const { + if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple()) + return false; + + switch (FPVT.getSimpleVT().SimpleTy) { + case MVT::f16: + return Subtarget.hasStdExtZfh(); + case MVT::f32: + return Subtarget.hasStdExtF(); + case MVT::f64: + return Subtarget.hasStdExtD(); + default: + return false; + } +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h index 8e3d716ae919..849928eb46ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -514,6 +514,8 @@ public: bool isLegalElementTypeForRVV(Type *ScalarTy) const; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td index b653928ccea9..6f9cde966132 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -705,6 +705,7 @@ def PseudoLD : PseudoLoad<"ld">; def PseudoSD : PseudoStore<"sd">; } // Predicates = [IsRV64] +def : InstAlias<"li $rd, $imm", (ADDI GPR:$rd, X0, simm12:$imm)>; def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>; def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>; def : InstAlias<"neg $rd, $rs", (SUB GPR:$rd, X0, GPR:$rs)>; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 388cce00bdf3..798532d5bc44 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/ErrorHandling.h" #define GET_REGINFO_TARGET_DESC @@ -320,3 +321,30 @@ RISCVRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, return &RISCV::VRRegClass; return RC; } + +void RISCVRegisterInfo::getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl<uint64_t> &Ops) const { + // VLENB is the length of a vector register in bytes. We use <vscale x 8 x i8> + // to represent one vector register. The dwarf offset is + // VLENB * scalable_offset / 8. + assert(Offset.getScalable() % 8 == 0 && "Invalid frame offset"); + + // Add fixed-sized offset using existing DIExpression interface. 
+ DIExpression::appendOffset(Ops, Offset.getFixed()); + + unsigned VLENB = getDwarfRegNum(RISCV::VLENB, true); + int64_t VLENBSized = Offset.getScalable() / 8; + if (VLENBSized > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(VLENBSized); + Ops.append({dwarf::DW_OP_bregx, VLENB, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_plus); + } else if (VLENBSized < 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(-VLENBSized); + Ops.append({dwarf::DW_OP_bregx, VLENB, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_minus); + } +} diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 74a5b83ff6f3..2b2bbdfbdf32 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -63,6 +63,9 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const override; + + void getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl<uint64_t> &Ops) const override; }; } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index a915a572f3b7..a56f992d320e 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -480,6 +480,8 @@ let RegAltNameIndices = [ABIRegAltName] in { def VL : RISCVReg<0, "vl", ["vl"]>; def VXSAT : RISCVReg<0, "vxsat", ["vxsat"]>; def VXRM : RISCVReg<0, "vxrm", ["vxrm"]>; + def VLENB : RISCVReg<0, "vlenb", ["vlenb"]>, + DwarfRegNum<[!add(4096, SysRegVLENB.Encoding)]>; } foreach m = [1, 2, 4] in { diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td index 41599dd8bb3f..5a4c579dd708 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -388,4 +388,4 @@ def : SysReg<"vxrm", 0x00A>; def : SysReg<"vcsr", 0x00F>; def : SysReg<"vl", 0xC20>; def : SysReg<"vtype", 0xC21>; -def : SysReg<"vlenb", 0xC22>; +def SysRegVLENB: SysReg<"vlenb", 0xC22>; diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 7319924a24ba..259b37954183 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -53,9 +53,8 @@ namespace { // instructions to fill delay slot. 
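getOffsetOpcodes above encodes a stack offset of the form fixed + (scalable / 8) * VLENB as a DWARF expression. The standalone sketch below builds the same opcode sequence into a plain vector, using small made-up integers for the DWARF opcode values and the VLENB register number (the real values come from the DWARF opcode definitions and the RISC-V register table); the fixed-offset handling is also simplified relative to DIExpression::appendOffset.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in opcode values; only the structure of the expression matters here.
enum : uint64_t { OpConstU = 1, OpBRegX = 2, OpMul = 3, OpPlus = 4, OpMinus = 5 };

static std::vector<uint64_t> scalableOffsetExpr(int64_t FixedOffset,
                                                int64_t ScalableOffset,
                                                uint64_t VLENBRegNum) {
  assert(ScalableOffset % 8 == 0 && "scalable part is in <vscale x 8 x i8> units");
  std::vector<uint64_t> Ops;
  // Fixed part first (simplified stand-in for DIExpression::appendOffset).
  if (FixedOffset != 0) {
    Ops.push_back(OpConstU);
    Ops.push_back(
        static_cast<uint64_t>(FixedOffset > 0 ? FixedOffset : -FixedOffset));
    Ops.push_back(FixedOffset > 0 ? OpPlus : OpMinus);
  }
  int64_t NumVLENB = ScalableOffset / 8;
  if (NumVLENB == 0)
    return Ops;
  // |NumVLENB| * VLENB, then add or subtract depending on the sign.
  Ops.push_back(OpConstU);
  Ops.push_back(static_cast<uint64_t>(NumVLENB > 0 ? NumVLENB : -NumVLENB));
  Ops.push_back(OpBRegX);
  Ops.push_back(VLENBRegNum);
  Ops.push_back(0); // bregx takes a register number and an addend of 0
  Ops.push_back(OpMul);
  Ops.push_back(NumVLENB > 0 ? OpPlus : OpMinus);
  return Ops;
}

int main() {
  // A made-up VLENB DWARF register number, purely for illustration.
  auto Ops = scalableOffsetExpr(-16, 24, /*VLENBRegNum=*/96);
  assert(Ops.size() == 10); // 3 ops for the fixed part, 7 for the scalable part
  return 0;
}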
F.getRegInfo().invalidateLiveness(); - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) - Changed |= runOnMachineBasicBlock(*FI); + for (MachineBasicBlock &MBB : F) + Changed |= runOnMachineBasicBlock(MBB); return Changed; } @@ -319,8 +318,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI, SmallSet<unsigned, 32>& RegDefs, SmallSet<unsigned, 32>& RegUses) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp index fa05a41f3127..bd26710fcbab 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/LeonPasses.cpp @@ -42,8 +42,7 @@ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) { DebugLoc DL = DebugLoc(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); @@ -77,10 +76,8 @@ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; - for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { - MachineInstr &MI = *MBBI; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { unsigned Opcode = MI.getOpcode(); if (Opcode == SP::CALL && MI.getNumOperands() > 0) { MachineOperand &MO = MI.getOperand(0); @@ -129,8 +126,7 @@ bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) { DebugLoc DL = DebugLoc(); bool Modified = false; - for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index d165052ca512..a740de9123c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -343,19 +343,18 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { } // Rewrite MBB's Live-ins. 
- for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); - MBB != E; ++MBB) { + for (MachineBasicBlock &MBB : MF) { for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) { - if (!MBB->isLiveIn(reg)) + if (!MBB.isLiveIn(reg)) continue; - MBB->removeLiveIn(reg); - MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1); + MBB.removeLiveIn(reg); + MBB.addLiveIn(reg - SP::I0_I1 + SP::O0_O1); } for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MBB->isLiveIn(reg)) + if (!MBB.isLiveIn(reg)) continue; - MBB->removeLiveIn(reg); - MBB->addLiveIn(reg - SP::I0 + SP::O0); + MBB.removeLiveIn(reg); + MBB.addLiveIn(reg - SP::I0 + SP::O0); } } diff --git a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp index 8ea317fdd453..4e7e7bb5c81b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Sparc/SparcMCInstLower.cpp @@ -97,8 +97,7 @@ void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI, OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MI, MO, AP); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index ac94570e568f..631cbff303e8 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -144,8 +144,7 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) { if (MI.isDebugInstr()) return Ref; - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI.getOperand(I); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg()) { if (Register MOReg = MO.getReg()) { if (TRI->regsOverlap(MOReg, Reg)) { diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index d11d118fb8ee..2f7cdfcf7bde 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -270,8 +270,8 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( // Make sure all call-saved GPRs are included as operands and are // marked as live on entry. - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } @@ -283,16 +283,16 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( } // Save FPRs/VRs in the normal TargetInstrInfo way. 
- for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } } @@ -313,13 +313,13 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Restore FPRs/VRs in the normal TargetInstrInfo way. - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) - TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); if (SystemZ::VR128BitRegClass.contains(Reg)) - TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } @@ -345,8 +345,8 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( MIB.addImm(RestoreGPRs.GPROffset); // Do a second scan adding regs as being defined by instruction - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (Reg != RestoreGPRs.LowGPR && Reg != RestoreGPRs.HighGPR && SystemZ::GR64BitRegClass.contains(Reg)) MIB.addReg(Reg, RegState::ImplicitDefine); @@ -965,24 +965,24 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( // Make sure all call-saved GPRs are included as operands and are // marked as live on entry. auto &GRRegClass = SystemZ::GR64BitRegClass; - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (GRRegClass.contains(Reg)) addSavedGPR(MBB, MIB, Reg, true); } } // Spill FPRs to the stack in the normal TargetInstrInfo way - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - unsigned Reg = CSI[I].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::FP64BitRegClass, TRI); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); - TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(), + TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), &SystemZ::VR128BitRegClass, TRI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 6fddb4f81c41..af219da79c32 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -29,7 +29,18 @@ public: create(const SystemZSubtarget &STI); // Override TargetFrameLowering. 
- bool isFPCloseToIncomingSP() const override { return false; } + bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const override { + // SystemZ wants normal register scavenging slots, as close to the stack or + // frame pointer as possible. + // The default implementation assumes an x86-like layout, where the frame + // pointer is at the opposite end of the frame from the stack pointer. + // This meant that when frame pointer elimination was disabled, + // the slots ended up being as close as possible to the incoming + // stack pointer, which is the opposite of what we want on SystemZ. + return false; + } + bool hasReservedCallFrame(const MachineFunction &MF) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -43,7 +54,6 @@ public: SystemZELFFrameLowering(); // Override TargetFrameLowering. - bool isFPCloseToIncomingSP() const override { return false; } bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2bf80882fa61..e80496e37781 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -203,8 +203,8 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef()); // Keep the remaining operands as-is. - for (unsigned I = 2; I < MI.getNumOperands(); ++I) - MIB.add(MI.getOperand(I)); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), 2)) + MIB.add(MO); MI.eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp index ef39f80a94ef..d2932de5a6ea 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp @@ -93,10 +93,8 @@ MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const { void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { - const MachineOperand &MO = MI->getOperand(I); + for (const MachineOperand &MO : MI->operands()) // Ignore all implicit register operands. if (!MO.isReg() || !MO.isImplicit()) OutMI.addOperand(lowerOperand(MO)); - } } diff --git a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp index 1fe9423e01b8..1d8c3d514bfb 100644 --- a/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp @@ -23,14 +23,6 @@ using namespace llvm; #define DEBUG_TYPE "ve-asmprinter" -// The generated AsmMatcher VEGenAsmWriter uses "VE" as the target -// namespace. 
-namespace llvm { -namespace VE { -using namespace VE; -} -} // namespace llvm - #define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "VEGenAsmWriter.inc" @@ -62,13 +54,10 @@ void VEInstPrinter::printOperand(const MCInst *MI, int OpNum, } if (MO.isImm()) { - switch (MI->getOpcode()) { - default: - // Expects signed 32bit literals - int32_t TruncatedImm = static_cast<int32_t>(MO.getImm()); - O << TruncatedImm; - return; - } + // Expects signed 32bit literals. + int32_t TruncatedImm = static_cast<int32_t>(MO.getImm()); + O << TruncatedImm; + return; } assert(MO.isExpr() && "Unknown operand kind in printOperand"); diff --git a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp index ddcfb9da8249..46846edfeafb 100644 --- a/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -942,11 +942,11 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, get(VE::SVMmi), Dest).addReg(VMZ).addImm(Imm); MachineInstr *Inst = MIB.getInstr(); - MI.eraseFromParent(); if (KillSrc) { const TargetRegisterInfo *TRI = &getRegisterInfo(); Inst->addRegisterKilled(MI.getOperand(1).getReg(), TRI, true); } + MI.eraseFromParent(); return true; } case VE::VFMKyal: @@ -956,6 +956,7 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case VE::VFMKSyvl: case VE::VFMKSyvyl: expandPseudoVFMK(*this, MI); + return true; } return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 80abccd74782..7b70d99b5f52 100644 --- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -368,8 +368,8 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -static void checkSanityForEHAndSjLj(const TargetMachine *TM) { - // Sanity checking related to -exception-model +static void basicCheckForEHAndSjLj(const TargetMachine *TM) { + // Basic Correctness checking related to -exception-model if (TM->Options.ExceptionModel != ExceptionHandling::None && TM->Options.ExceptionModel != ExceptionHandling::Wasm) report_fatal_error("-exception-model should be either 'none' or 'wasm'"); @@ -431,7 +431,7 @@ void WebAssemblyPassConfig::addIRPasses() { if (getOptLevel() != CodeGenOpt::None) addPass(createWebAssemblyOptimizeReturned()); - checkSanityForEHAndSjLj(TM); + basicCheckForEHAndSjLj(TM); // If exception handling is not enabled and setjmp/longjmp handling is // enabled, we lower invokes into calls and delete unreachable landingpad diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index d4f39b571394..3df48b466d07 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -148,7 +148,7 @@ public: AlignBranchType.addKind(X86::AlignBranchJcc); AlignBranchType.addKind(X86::AlignBranchJmp); } - // Allow overriding defaults set by master flag + // Allow overriding defaults set by main flag if (X86AlignBranchBoundary.getNumOccurrences()) AlignBoundary = assumeAligned(X86AlignBranchBoundary); if (X86AlignBranch.getNumOccurrences()) @@ -1452,9 
+1452,7 @@ public: unsigned NumDefCFAOffsets = 0; int MinAbsOffset = std::numeric_limits<int>::max(); - for (unsigned i = 0, e = Instrs.size(); i != e; ++i) { - const MCCFIInstruction &Inst = Instrs[i]; - + for (const MCCFIInstruction &Inst : Instrs) { switch (Inst.getOperation()) { default: // Any other CFI directives indicate a frame that we aren't prepared diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp index a2ae6345c006..9826bf4bf861 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -186,8 +186,8 @@ public: TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), *MBB->getParent())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); - for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx) - Bld.add(MI->getOperand(Idx)); + for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) + Bld.add(MO); BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY)) .add(MI->getOperand(0)) diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index df8df1e3a65d..c8ceebb8b8e6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -212,6 +212,12 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { bool Is64BitAlloca = MI->getOpcode() == X86::DYN_ALLOCA_64; assert(SlotSize == 4 || SlotSize == 8); + Optional<MachineFunction::DebugInstrOperandPair> InstrNum = None; + if (unsigned Num = MI->peekDebugInstrNum()) { + // Operand 2 of DYN_ALLOCAs contains the stack def. + InstrNum = {Num, 2}; + } + switch (L) { case TouchAndSub: { assert(Amount >= SlotSize); @@ -251,7 +257,7 @@ void X86DynAllocaExpander::lower(MachineInstr *MI, Lowering L) { // Do the probe. STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL, - /*InProlog=*/false); + /*InProlog=*/false, InstrNum); } else { // Sub BuildMI(*MBB, I, DL, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 01dc509df795..93bc23006dc4 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -209,10 +209,8 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, llvm_unreachable("unexpected opcode"); OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); - unsigned OpStart = 1; bool RAXImplicitDead = false; - for (; OpStart < MI.getNumOperands(); ++OpStart) { - MachineOperand &Op = MI.getOperand(OpStart); + for (MachineOperand &Op : llvm::drop_begin(MI.operands())) { // RAX may be 'implicit dead', if there are no other users of the return // value. We introduce a new use, so change it to 'implicit def'. 
if (Op.isReg() && Op.isImplicit() && Op.isDead() && diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp index 9a63cffe0a09..4730b936ec1f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -278,10 +278,9 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; MachineInstr &MI = *I; - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - MachineOperand &opnd = MI.getOperand(i); - if (opnd.isReg() && opnd.getReg() == p.getReg()) { - if (opnd.isDef()) + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == p.getReg()) { + if (MO.isDef()) return RU_Write; RegUsage = RU_Read; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp index 60e1b37ed61c..4d9160f35226 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -446,11 +446,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Get dead variables list now because the MI pointer may be deleted as part // of processing! SmallVector<unsigned, 8> DeadRegs; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isDead()) DeadRegs.push_back(MO.getReg()); - } switch (FPInstClass) { case X86II::ZeroArgFP: handleZeroArgFP(I); break; @@ -1672,8 +1670,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { // Collect all FP registers (register operands with constraints "t", "u", // and "f") to kill afer the instruction. unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); + for (const MachineOperand &Op : MI.operands()) { if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; unsigned FPReg = getFPReg(Op); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp index bd780273509f..c29ae9f6af4c 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -465,13 +465,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.empty()) return; // Calculate offsets. 
- for (std::vector<CalleeSavedInfo>::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); - unsigned Reg = I->getReg(); + for (const CalleeSavedInfo &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); if (IsPrologue) { @@ -484,10 +482,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } } -void X86FrameLowering::emitStackProbe(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const { +void X86FrameLowering::emitStackProbe( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { @@ -497,10 +495,14 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF, emitStackProbeInline(MF, MBB, MBBI, DL, false); } } else { - emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum); } } +bool X86FrameLowering::stackProbeFunctionModifiesSP() const { + return STI.isOSWindows() && !STI.isTargetWin64(); +} + void X86FrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const { auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) { @@ -971,11 +973,10 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - bool InProlog) const { +void X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; // FIXME: Add indirect thunk support and remove this. @@ -1015,6 +1016,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + MachineInstr *ModInst = CI; if (STI.isTargetWin64() || !STI.isOSWindows()) { // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp @@ -1022,9 +1024,27 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // adjusting %rsp. // All other platforms do not specify a particular ABI for the stack probe // function, so we arbitrarily define it to not adjust %esp/%rsp itself. - BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) - .addReg(SP) - .addReg(AX); + ModInst = + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP) + .addReg(SP) + .addReg(AX); + } + + // DebugInfo variable locations -- if there's an instruction number for the + // allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that + // modifies SP. + if (InstrNum) { + if (STI.isTargetWin64() || !STI.isOSWindows()) { + // Label destination operand of the subtract. + MF.makeDebugValueSubstitution(*InstrNum, + {ModInst->getDebugInstrNum(), 0}); + } else { + // Label the call. The operand number is the penultimate operand, zero + // based. 
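// Illustrative sketch (plain C++, not LLVM code) of the debug-value
// substitution bookkeeping used above: a table maps an (instruction number,
// operand index) reference on the erased DYN_ALLOCA pseudo to the operand of
// the instruction that really modifies SP. A std::map stands in for
// MachineFunction's substitution table, and the instruction numbers are made up.
#include <cstdio>
#include <map>
#include <utility>

using InstrOperandPair = std::pair<unsigned, unsigned>; // (instr number, operand index)

int main() {
  std::map<InstrOperandPair, InstrOperandPair> DebugValueSubstitutions;

  // Say the DYN_ALLOCA pseudo carried instruction number 7 and its stack def
  // was operand 2 (as the comment in the patch states). After expansion, the
  // SUB that defines SP got instruction number 12, with the def in operand 0.
  DebugValueSubstitutions[{7, 2}] = {12, 0};

  // A later instruction-referencing debug value pointing at (7, 2) is resolved
  // through the table instead of going stale.
  auto It = DebugValueSubstitutions.find({7, 2});
  if (It != DebugValueSubstitutions.end())
    std::printf("debug ref now tracks instr %u, operand %u\n",
                It->second.first, It->second.second);
  return 0;
}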
+ unsigned SPDefOperand = ModInst->getNumOperands() - 2; + MF.makeDebugValueSubstitution( + *InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand}); + } } if (InProlog) { @@ -2652,8 +2672,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2664,13 +2684,13 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI); } // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h index 6309b8a066c4..e18be0d26321 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Support/TypeSize.h" @@ -51,9 +52,14 @@ public: /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize /// the number of bytes to probe in RAX/EAX. - void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - bool InProlog) const; + /// \p InstrNum optionally contains a debug-info instruction number for the + /// new stack pointer. + void emitStackProbe( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum = None) const; + + bool stackProbeFunctionModifiesSP() const override; /// Replace a StackProbe inline-stub with the actual probe code inline. void inlineStackProbe(MachineFunction &MF, @@ -198,9 +204,10 @@ private: uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; /// Emit target stack probe as a call to a helper function - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - bool InProlog) const; + void emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog, + Optional<MachineFunction::DebugInstrOperandPair> InstrNum) const; /// Emit target stack probe as an inline sequence. 
void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp index 17d14053d804..62b2387396be 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23190,6 +23190,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + // We don't need to replace SQRT with RSQRT for half type. + if (VT.getScalarType() == MVT::f16) + return true; + // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; @@ -23228,11 +23232,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; - return DAG.getNode(Opcode, DL, VT, Op); + SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op); + if (RefinementSteps == 0 && !Reciprocal) + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate); + return Estimate; } if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && Subtarget.hasFP16()) { + assert(Reciprocal && "Don't replace SQRT with RSQRT for half type"); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 0; @@ -45680,7 +45688,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since - // OnesOrZeroesF is all ones of all zeroes, we don't need all the + // OnesOrZeroesF is all ones or all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, @@ -46577,6 +46585,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } +static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, + SDValue And1_L, SDValue And1_R, SDLoc DL, + SelectionDAG &DAG) { + if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) + return SDValue(); + SDValue NotOp = And0_L->getOperand(0); + if (NotOp == And1_R) + std::swap(And1_R, And1_L); + if (NotOp != And1_L) + return SDValue(); + + // (~(NotOp) & And0_R) | (NotOp & And1_R) + // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R + EVT VT = And1_L->getValueType(0); + SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); + SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); + SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); + return Xor1; +} + +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y)` pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. This function is intended to be called from a +/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. 
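// Illustrative sketch (plain C++) of the scalar identity behind the fold above:
// (m & x) | (~m & y) selects x where a mask bit is set and y elsewhere, and
// ((x ^ y) & m) ^ y computes the same value without the ~m, which is what makes
// it attractive when no fused and-not instruction is available.
#include <cassert>
#include <cstdint>

static uint32_t maskedMergeOr(uint32_t m, uint32_t x, uint32_t y) {
  return (m & x) | (~m & y);
}

static uint32_t maskedMergeXor(uint32_t m, uint32_t x, uint32_t y) {
  return ((x ^ y) & m) ^ y;
}

int main() {
  // Spot-check a few values; the two forms agree bit-for-bit.
  for (uint32_t m : {0x0u, 0xFFFFFFFFu, 0x0F0F0F0Fu, 0x12345678u})
    for (uint32_t x : {0x0u, 0xDEADBEEFu, 0xAAAAAAAAu})
      for (uint32_t y : {0xFFFFFFFFu, 0x55555555u, 0xCAFEBABEu})
        assert(maskedMergeOr(m, x, y) == maskedMergeXor(m, x, y));
  return 0;
}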
+ assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + SDValue N0 = Node->getOperand(0); + if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) + return SDValue(); + SDValue N1 = Node->getOperand(1); + if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) + return SDValue(); + + SDLoc DL(Node); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) + return Result; + return SDValue(); +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -46670,6 +46731,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return Res; } + // We should fold "masked merge" patterns when `andn` is not available. + if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) + if (SDValue R = foldMaskedMerge(N, DAG)) + return R; + return SDValue(); } @@ -48504,20 +48570,50 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); - unsigned ExtOpc = LHS.getOpcode(); - if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || - RHS.getOpcode() != ExtOpc) - return SDValue(); - - // Peek through the extends. - LHS = LHS.getOperand(0); - RHS = RHS.getOperand(0); - - // Ensure the input types match. - if (LHS.getValueType() != VT || RHS.getValueType() != VT) - return SDValue(); + // Count leading sign/zero bits on both inputs - if there are enough then + // truncation back to vXi16 will be cheap - either as a pack/shuffle + // sequence or using AVX512 truncations. If the inputs are sext/zext then the + // truncations may actually be free by peeking through to the ext source. + auto IsSext = [&DAG](SDValue V) { + return DAG.ComputeMinSignedBits(V) <= 16; + }; + auto IsZext = [&DAG](SDValue V) { + return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; + }; - unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; + bool IsSigned = IsSext(LHS) && IsSext(RHS); + bool IsUnsigned = IsZext(LHS) && IsZext(RHS); + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Check if both inputs are extensions, which will be removed by truncation. + bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || + LHS.getOpcode() == ISD::ZERO_EXTEND) && + (RHS.getOpcode() == ISD::SIGN_EXTEND || + RHS.getOpcode() == ISD::ZERO_EXTEND) && + LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && + RHS.getOperand(0).getScalarValueSizeInBits() <= 16; + + // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on + // the (bitcasted) inputs directly, and then cheaply pack/truncate the result + // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU + // will have to split anyway. 
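// Illustrative sketch (plain C++) of what the MULHS/MULHU nodes formed above
// compute: the high 16 bits of the widened 16x16 product, which is all the
// combine needs once both inputs are known to fit in 16 bits.
#include <cassert>
#include <cstdint>

static uint16_t mulhu16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
}

static int16_t mulhs16(int16_t a, int16_t b) {
  // Arithmetic right shift of the signed product (as on mainstream targets).
  return static_cast<int16_t>((int32_t(a) * int32_t(b)) >> 16);
}

int main() {
  assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE); // 65535 * 65535 = 0xFFFE0001
  assert(mulhs16(-32768, 2) == -1);          // -65536 >> 16
  assert(mulhs16(300, 100) == 0);            // small product, high half is 0
  return 0;
}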
+ unsigned InSizeInBits = InVT.getSizeInBits(); + if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() && + !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) && + (InSizeInBits % 16) == 0) { + EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + InVT.getSizeInBits() / 16); + SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), + DAG.getBitcast(BCVT, RHS)); + return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + } + + // Truncate back to source type. + LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); + RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); + + unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td index 8aee96e1c504..1db83033ba35 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12937,8 +12937,8 @@ def : Pat<(v16i32 (X86vzmovl (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; -def : Pat<(v8i16 (X86vzmovl (v8i16 (scalar_to_vector (i16 (trunc GR32:$src)))))), - (VMOVW2SHrr GR32:$src)>; +def : Pat<(v8i16 (X86vzmovl (scalar_to_vector (i16 GR16:$src)))), + (VMOVW2SHrr (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit))>; // AVX 128-bit movw instruction write zeros in the high 128-bit part. def : Pat<(v8i16 (X86vzload16 addr:$src)), diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp index 639aa5199ea5..bb5637a31947 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1163,8 +1163,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : MI.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && !MO.isDead()) { return true; @@ -5676,10 +5675,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI.getOperand(i + 2); MIB.add(MO); } - for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); + for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2)) MIB.add(MO); - } updateOperandRegConstraints(MF, *NewMI, TII); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td index 74c515850ab1..91a497252595 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterBanks.td @@ -1,4 +1,4 @@ -//=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// +//=- X86RegisterBank.td - Describe the X86 Banks -------------*- tablegen -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 06dacb638d16..869762b35196 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1584,54 +1584,98 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, // Mask sign extend has an instruction. - { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, - { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, }; static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + // Mask sign extend has an instruction. 
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, + + // Mask zero extend is a sext + shift. + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, @@ -1786,40 +1830,94 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { // Mask sign extend has an instruction. - { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, - { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, // Mask zero extend is a sext + shift. 
- { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb - { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw - { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb }; static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, + + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, + + { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, @@ -3674,6 +3772,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, PromEltTyBits = 16; // promote to i16, AVX512BW. break; } + if (ST->hasDQI()) { + PromEltTyBits = 32; // promote to i32, AVX512F. + break; + } return bailout(); default: return bailout(); @@ -3969,7 +4071,9 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, // Even in the case of (loop invariant) stride whose value is not known at // compile time, the address computation will not incur more than one extra // ADD instruction. - if (Ty->isVectorTy() && SE) { + if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { + // TODO: AVX2 is the current cut-off because we don't have correct + // interleaving costs for prior ISA's. if (!BaseT::isStridedAccess(Ptr)) return NumVectorInstToHideOverhead; if (!BaseT::getConstantStrideStep(SE, Ptr)) @@ -5173,7 +5277,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), LegalVT.getVectorNumElements()); InstructionCost MemOpCost; - if (UseMaskForCond || UseMaskForGaps) + bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; + if (UseMaskedMemOp) MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); else @@ -5183,9 +5288,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( unsigned VF = VecTy->getNumElements() / Factor; MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); - // FIXME: this is the most conservative estimate for the mask cost. InstructionCost MaskCost; - if (UseMaskForCond || UseMaskForGaps) { + if (UseMaskedMemOp) { APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); @@ -5193,10 +5297,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( DemandedLoadStoreElts.setBit(Index + Elm * Factor); } - Type *I8Type = Type::getInt8Ty(VecTy->getContext()); + Type *I1Type = Type::getInt1Ty(VecTy->getContext()); MaskCost = getReplicationShuffleCost( - I8Type, Factor, VF, + I1Type, Factor, VF, UseMaskForGaps ? DemandedLoadStoreElts : APInt::getAllOnes(VecTy->getNumElements()), CostKind); @@ -5207,7 +5311,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( // memory access, we need to account for the cost of And-ing the two masks // inside the loop. 
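// Illustrative sketch (plain C++) of the two masks being combined for a masked
// interleaved access as described above: a gap mask fixed by which members of
// the interleave group are used, and a per-iteration condition mask; the final
// hardware mask is their AND. Factor, VF, and Indices are made-up values.
#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 3;                    // struct-of-3 interleaving
  const unsigned VF = 4;                        // vectorization factor
  const std::vector<unsigned> Indices = {0, 2}; // members 0 and 2 are used

  std::vector<bool> GapMask(VF * Factor, false);
  for (unsigned Index : Indices)
    for (unsigned Elm = 0; Elm < VF; ++Elm)
      GapMask[Index + Elm * Factor] = true;

  // Pretend the loop body is predicated and lane 3 is inactive this iteration.
  std::vector<bool> CondMask(VF * Factor, true);
  for (unsigned I = 0; I < Factor; ++I)
    CondMask[3 * Factor + I] = false;

  for (unsigned I = 0; I < VF * Factor; ++I)
    std::printf("%d", GapMask[I] && CondMask[I] ? 1 : 0);
  std::printf("\n"); // prints 101101101000
  return 0;
}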
if (UseMaskForGaps) { - auto *MaskVT = FixedVectorType::get(I8Type, VecTy->getNumElements()); + auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); } } @@ -5248,9 +5352,10 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( NumOfLoadsInInterleaveGrp; // About a half of the loads may be folded in shuffles when we have only - // one result. If we have more than one result, we do not fold loads at all. + // one result. If we have more than one result, or the loads are masked, + // we do not fold loads at all. unsigned NumOfUnfoldedLoads = - NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; // Get a number of shuffle operations per result. unsigned NumOfShufflesPerResult = diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 27ac6a4d1439..f2f89f4269ed 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -427,19 +427,19 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( if (MI != MBB.end() && !MI->isDebugInstr()) DL = MI->getDebugLoc(); - for (auto it = CSI.begin(); it != CSI.end(); ++it) { - unsigned Reg = it->getReg(); + for (const CalleeSavedInfo &I : CSI) { + unsigned Reg = I.getReg(); assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) && "LR & FP are always handled in emitPrologue"); // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI); if (emitFrameMoves) { auto Store = MI; --Store; - XFI->getSpillLabels().push_back(std::make_pair(Store, *it)); + XFI->getSpillLabels().push_back(std::make_pair(Store, I)); } } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp index b5dbdea98eea..71836133fae6 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp @@ -48,9 +48,7 @@ bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) { const XCoreInstrInfo &TII = *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo()); unsigned StackSize = MF.getFrameInfo().getStackSize(); - for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; - ++MFI) { - MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock &MBB : MF) { for (MachineBasicBlock::iterator MBBI = MBB.begin(), EE = MBB.end(); MBBI != EE; ++MBBI) { if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) { diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp index cd28fa5cd144..6f5dcb291e6e 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreMCInstLower.cpp @@ -103,8 +103,7 @@ MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO, void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = 
MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp = LowerOperand(MO); if (MCOp.isValid()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b2c2efed7db8..ba7589c2bf60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -275,94 +276,64 @@ CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers( - Value *V, Constant *Init, const DataLayout &DL, - function_ref<TargetLibraryInfo &(Function &)> GetTLI) { +static bool CleanupConstantGlobalUsers(GlobalVariable *GV, + const DataLayout &DL) { + Constant *Init = GV->getInitializer(); + SmallVector<User *, 8> WorkList(GV->users()); + SmallPtrSet<User *, 8> Visited; bool Changed = false; - // Note that we need to use a weak value handle for the worklist items. When - // we delete a constant array, we may also be holding pointer to one of its - // elements (or an element of one of its elements if we're dealing with an - // array of arrays) in the worklist. - SmallVector<WeakTrackingVH, 8> WorkList(V->users()); + + SmallVector<WeakTrackingVH> MaybeDeadInsts; + auto EraseFromParent = [&](Instruction *I) { + for (Value *Op : I->operands()) + if (auto *OpI = dyn_cast<Instruction>(Op)) + MaybeDeadInsts.push_back(OpI); + I->eraseFromParent(); + Changed = true; + }; while (!WorkList.empty()) { - Value *UV = WorkList.pop_back_val(); - if (!UV) + User *U = WorkList.pop_back_val(); + if (!Visited.insert(U).second) continue; - User *U = cast<User>(UV); + if (auto *BO = dyn_cast<BitCastOperator>(U)) + append_range(WorkList, BO->users()); + if (auto *ASC = dyn_cast<AddrSpaceCastOperator>(U)) + append_range(WorkList, ASC->users()); + else if (auto *GEP = dyn_cast<GEPOperator>(U)) + append_range(WorkList, GEP->users()); + else if (auto *LI = dyn_cast<LoadInst>(U)) { + // A load from zeroinitializer is always zeroinitializer, regardless of + // any applied offset. + if (Init->isNullValue()) { + LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + EraseFromParent(LI); + continue; + } - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (Init) { - if (auto *Casted = - ConstantFoldLoadThroughBitcast(Init, LI->getType(), DL)) { - // Replace the load with the initializer. - LI->replaceAllUsesWith(Casted); - LI->eraseFromParent(); - Changed = true; + Value *PtrOp = LI->getPointerOperand(); + APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); + PtrOp = PtrOp->stripAndAccumulateConstantOffsets( + DL, Offset, /* AllowNonInbounds */ true); + if (PtrOp == GV) { + if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), + Offset, DL)) { + LI->replaceAllUsesWith(Value); + EraseFromParent(LI); } } } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { // Store must be unreachable or storing Init into the global. 
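// Illustrative sketch (plain C++) of the load folding above: once a global is
// known constant, a load at a compile-time byte offset can be answered from the
// initializer bytes, and any in-bounds load from a zero initializer is simply
// zero. std::memcpy stands in for ConstantFoldLoadFromConst; the example assumes
// a little-endian host.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t foldLoad32(const uint8_t *Init, size_t InitSize, size_t Offset) {
  assert(Offset + sizeof(uint32_t) <= InitSize && "out-of-bounds fold");
  uint32_t V;
  std::memcpy(&V, Init + Offset, sizeof V);
  return V;
}

int main() {
  // A constant global resembling [4 x i32] {1, 2, 3, 4}.
  const uint32_t Words[4] = {1, 2, 3, 4};
  const uint8_t *Bytes = reinterpret_cast<const uint8_t *>(Words);
  assert(foldLoad32(Bytes, sizeof Words, 8) == 3); // byte offset 8 -> element 2

  // A zeroinitializer global: every load folds to 0, whatever the offset.
  const uint8_t Zero[16] = {};
  assert(foldLoad32(Zero, sizeof Zero, 4) == 0);
  return 0;
}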
- SI->eraseFromParent(); - Changed = true; - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { - if (CE->getOpcode() == Instruction::GetElementPtr) { - Constant *SubInit = nullptr; - if (Init) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); - } else if ((CE->getOpcode() == Instruction::BitCast && - CE->getType()->isPointerTy()) || - CE->getOpcode() == Instruction::AddrSpaceCast) { - // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); - } - - if (CE->use_empty()) { - CE->destroyConstant(); - Changed = true; - } - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // Do not transform "gepinst (gep constexpr (GV))" here, because forming - // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold - // and will invalidate our notion of what Init is. - Constant *SubInit = nullptr; - if (!isa<ConstantExpr>(GEP->getOperand(0))) { - ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( - ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); - if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) - SubInit = ConstantFoldLoadThroughGEPConstantExpr( - Init, CE, V->getType()->getPointerElementType(), DL); - - // If the initializer is an all-null value and we have an inbounds GEP, - // we already know what the result of any load from that GEP is. - // TODO: Handle splats. - if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) - SubInit = Constant::getNullValue(GEP->getResultElementType()); - } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); - - if (GEP->use_empty()) { - GEP->eraseFromParent(); - Changed = true; - } + EraseFromParent(SI); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv - if (MI->getRawDest() == V) { - MI->eraseFromParent(); - Changed = true; - } - - } else if (Constant *C = dyn_cast<Constant>(U)) { - // If we have a chain of dead constantexprs or other things dangling from - // us, and if they are all dead, nuke them without remorse. - if (isSafeToDestroyConstant(C)) { - C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, GetTLI); - return true; - } + if (getUnderlyingObject(MI->getRawDest()) == GV) + EraseFromParent(MI); } } + + Changed |= + RecursivelyDeleteTriviallyDeadInstructionsPermissive(MaybeDeadInsts); + GV->removeDeadConstantUsers(); return Changed; } @@ -889,7 +860,7 @@ static bool OptimizeAwayTrappingUsesOfLoads( Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -1557,8 +1528,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed = CleanupConstantGlobalUsers(GV, DL); } // If the global is dead now, delete it. @@ -1583,7 +1553,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, } // Clean up any obviously simplifiable users now. - Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + Changed |= CleanupConstantGlobalUsers(GV, DL); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { @@ -1628,7 +1598,7 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); + CleanupConstantGlobalUsers(GV, DL); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f342c35fa283..055ee6b50296 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1885,6 +1885,7 @@ private: OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); @@ -3727,12 +3728,37 @@ struct AAKernelInfoFunction : AAKernelInfo { CheckRWInst, *this, UsedAssumedInformationInCheckRWInst)) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + bool UsedAssumedInformationFromReachingKernels = false; if (!IsKernelEntry) { - updateReachingKernelEntries(A); updateParallelLevels(A); + bool AllReachingKernelsKnown = true; + updateReachingKernelEntries(A, AllReachingKernelsKnown); + UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown; + if (!ParallelLevels.isValidState()) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!ReachingKernelEntries.isValidState()) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else if (!SPMDCompatibilityTracker.empty()) { + // Check if all reaching kernels agree on the mode as we can otherwise + // not guard instructions. We might not be sure about the mode so we + // we cannot fix the internal spmd-zation state either. + int SPMD = 0, Generic = 0; + for (auto *Kernel : ReachingKernelEntries) { + auto &CBAA = A.getAAFor<AAKernelInfo>( + *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL); + if (CBAA.SPMDCompatibilityTracker.isValidState() && + CBAA.SPMDCompatibilityTracker.isAssumed()) + ++SPMD; + else + ++Generic; + if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint()) + UsedAssumedInformationFromReachingKernels = true; + } + if (SPMD != 0 && Generic != 0) + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + } } // Callback to check a call instruction. @@ -3779,7 +3805,8 @@ struct AAKernelInfoFunction : AAKernelInfo { // If we haven't used any assumed information for the SPMD state we can fix // it. if (!UsedAssumedInformationInCheckRWInst && - !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed) + !UsedAssumedInformationInCheckCallInst && + !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); return StateBefore == getState() ? ChangeStatus::UNCHANGED @@ -3788,7 +3815,8 @@ struct AAKernelInfoFunction : AAKernelInfo { private: /// Update info regarding reaching kernels. 
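The updated AAKernelInfoFunction logic above refuses to fix the SPMD state when the reaching kernels disagree on their execution mode, since instructions cannot be guarded consistently in that case. A toy model of the agreement check (Mode and allReachingKernelsAgree are illustrative names, not OpenMPOpt API):

#include <cassert>
#include <vector>

enum class Mode { SPMD, Generic };

// Mixed modes among reaching kernels force a pessimistic fixpoint in the pass;
// agreement (all SPMD or all Generic) leaves the optimistic state intact.
static bool allReachingKernelsAgree(const std::vector<Mode> &Kernels) {
  int SPMD = 0, Generic = 0;
  for (Mode M : Kernels)
    (M == Mode::SPMD ? SPMD : Generic)++;
  return SPMD == 0 || Generic == 0;
}

int main() {
  assert(allReachingKernelsAgree({Mode::SPMD, Mode::SPMD}));
  assert(!allReachingKernelsAgree({Mode::SPMD, Mode::Generic}));
}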
- void updateReachingKernelEntries(Attributor &A) { + void updateReachingKernelEntries(Attributor &A, + bool &AllReachingKernelsKnown) { auto PredCallSite = [&](AbstractCallSite ACS) { Function *Caller = ACS.getInstruction()->getFunction(); @@ -3808,10 +3836,9 @@ private: return true; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(PredCallSite, *this, true /* RequireAllCallSites */, - AllCallSitesKnown)) + AllReachingKernelsKnown)) ReachingKernelEntries.indicatePessimisticFixpoint(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index 7402e399a88a..2d717475ce7f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -641,8 +641,7 @@ PartialInlinerImpl::computeOutliningInfo(Function &F) const { if (!CandidateFound) return std::unique_ptr<FunctionOutliningInfo>(); - // Do sanity check of the entries: threre should not - // be any successors (not in the entry set) other than + // There should not be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} assert(OutliningInfo->Entries[0] == &F.front() && "Function Entry must be the first in Entries vector"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index a961c47a7501..b8fac9d47763 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -84,6 +84,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" #include <algorithm> @@ -173,6 +174,9 @@ static cl::opt<bool> cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on.")); +cl::opt<bool> + SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, + cl::desc("Sort profiled recursion by edge weights.")); static cl::opt<bool> ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), @@ -1648,6 +1652,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { SmallVector<uint32_t, 4> Weights; uint32_t MaxWeight = 0; Instruction *MaxDestInst; + // Since profi treats multiple edges (multiway branches) as a single edge, + // we need to distribute the computed weight among the branches. We do + // this by evenly splitting the edge weight among destinations. 
+ DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; + std::vector<uint64_t> EdgeIndex; + if (SampleProfileUseProfi) { + EdgeIndex.resize(TI->getNumSuccessors()); + for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { + const BasicBlock *Succ = TI->getSuccessor(I); + EdgeIndex[I] = EdgeMultiplicity[Succ]; + EdgeMultiplicity[Succ]++; + } + } for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); @@ -1660,9 +1677,19 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); Weight = std::numeric_limits<uint32_t>::max(); } - // Weight is added by one to avoid propagation errors introduced by - // 0 weights. - Weights.push_back(static_cast<uint32_t>(Weight + 1)); + if (!SampleProfileUseProfi) { + // Weight is added by one to avoid propagation errors introduced by + // 0 weights. + Weights.push_back(static_cast<uint32_t>(Weight + 1)); + } else { + // Profi creates proper weights that do not require "+1" adjustments but + // we evenly split the weight among branches with the same destination. + uint64_t W = Weight / EdgeMultiplicity[Succ]; + // Rounding up, if needed, so that first branches are hotter. + if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) + W++; + Weights.push_back(static_cast<uint32_t>(W)); + } if (Weight != 0) { if (Weight > MaxWeight) { MaxWeight = Weight; @@ -1853,7 +1880,13 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG); scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get()); while (!CGI.isAtEnd()) { - for (ProfiledCallGraphNode *Node : *CGI) { + auto Range = *CGI; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator<ProfiledCallGraph *> SI(*CGI); + Range = *SI; + } + for (auto *Node : Range) { Function *F = SymbolMap.lookup(Node->Name); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 06c9bf650f37..dc55b5a31596 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1727,16 +1727,18 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - Value *A, *B, *C; + Value *A, *B, *C, *X, *Y; // (~(A | B) & C) | ... --> ... // (~(A & B) | C) & ... --> ... // TODO: One use checks are conservative. We just need to check that a total // number of multiple used values does not exceed reduction // in operations. 
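The EdgeMultiplicity bookkeeping above feeds the even split applied a few lines further down: a weight W over M duplicate edges becomes W / M per edge, with the first W % M edges rounded up so the total is preserved and earlier branches stay hotter. A standalone check of that arithmetic (splitWeight is a hypothetical helper, not part of the patch):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t> splitWeight(uint64_t Weight, uint64_t Multiplicity) {
  std::vector<uint64_t> Out;
  for (uint64_t Index = 0; Index < Multiplicity; ++Index) {
    uint64_t W = Weight / Multiplicity;
    if (Index < Weight % Multiplicity)
      ++W; // round up the first branches so they end up hotter
    Out.push_back(W);
  }
  return Out;
}

int main() {
  auto W = splitWeight(10, 3);                 // -> {4, 3, 3}
  assert(W[0] == 4 && W[1] == 3 && W[2] == 3);
  assert(W[0] + W[1] + W[2] == 10);            // the total weight is preserved
}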
- if (match(Op0, m_c_BinOp(FlippedOpcode, - m_Not(m_BinOp(Opcode, m_Value(A), m_Value(B))), - m_Value(C)))) { + if (match(Op0, + m_c_BinOp(FlippedOpcode, + m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A), + m_Value(B)))), + m_Value(C)))) { // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A) if (match(Op1, @@ -1776,6 +1778,21 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C))))))) return BinaryOperator::CreateNot(Builder.CreateBinOp( Opcode, Builder.CreateBinOp(FlippedOpcode, A, C), B)); + + // (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B))) + // Note, the pattern with swapped and/or is not handled because the + // result is more undefined than a source: + // (~(A & B) | C) & ~(C & (A ^ B)) --> (A ^ B ^ C) | ~(A | C) is invalid. + if (Opcode == Instruction::Or && Op0->hasOneUse() && + match(Op1, m_OneUse(m_Not(m_CombineAnd( + m_Value(Y), + m_c_BinOp(Opcode, m_Specific(C), + m_c_Xor(m_Specific(A), m_Specific(B)))))))) { + // X = ~(A | B) + // Y = (C | (A ^ B) + Value *Or = cast<BinaryOperator>(X)->getOperand(0); + return BinaryOperator::CreateNot(Builder.CreateAnd(Or, Y)); + } } return nullptr; @@ -2061,7 +2078,14 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Instruction *CastedAnd = foldCastedBitwiseLogic(I)) return CastedAnd; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? Value *A; if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) @@ -2322,11 +2346,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) { Value *Cond; Value *NotB; if (match(A, m_SExt(m_Value(Cond))) && - Cond->getType()->isIntOrIntVectorTy(1) && - match(B, m_OneUse(m_Not(m_Value(NotB))))) { - NotB = peekThroughBitcast(NotB, true); - if (match(NotB, m_SExt(m_Specific(Cond)))) + Cond->getType()->isIntOrIntVectorTy(1)) { + // A = sext i1 Cond; B = sext (not (i1 Cond)) + if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return Cond; + + // A = sext i1 Cond; B = not ({bitcast} (sext (i1 Cond))) + // TODO: The one-use checks are unnecessary or misplaced. If the caller + // checked for uses on logic ops/casts, that should be enough to + // make this transform worthwhile. + if (match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } } // All scalar (and most vector) possibilities should be handled now. @@ -2569,7 +2602,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { return replaceInstUsesWith(I, V); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (I.getType()->isIntOrIntVectorTy(1)) { + Type *Ty = I.getType(); + if (Ty->isIntOrIntVectorTy(1)) { if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { if (auto *I = foldAndOrOfSelectUsingImpliedCond(Op1, *SI0, /* IsAnd */ false)) @@ -2602,7 +2636,16 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0 // The check for a 'not' op is for efficiency (if Y is known zero --> ~X). 
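The fold added in this hunk, (~(A | B) & C) | ~(C | (A ^ B)) --> ~((A | B) & (C | (A ^ B))), is a pure bit identity, so it can be verified exhaustively on a narrow type; a throwaway brute-force check over all i8 triples:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Ai = 0; Ai < 256; ++Ai)
    for (unsigned Bi = 0; Bi < 256; ++Bi)
      for (unsigned Ci = 0; Ci < 256; ++Ci) {
        uint8_t A = Ai, B = Bi, C = Ci;
        uint8_t LHS = (uint8_t)((~(A | B) & C) | ~(C | (A ^ B)));
        uint8_t RHS = (uint8_t)~((A | B) & (C | (A ^ B)));
        assert(LHS == RHS); // both sides agree for every bit pattern
      }
}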
Value *Or = Builder.CreateOr(X, Y); - return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV)); + return BinaryOperator::CreateXor(Or, ConstantInt::get(Ty, *CV)); + } + + // If the operands have no common bits set: + // or (mul X, Y), X --> add (mul X, Y), X --> mul X, (Y + 1) + if (match(&I, + m_c_Or(m_OneUse(m_Mul(m_Value(X), m_Value(Y))), m_Deferred(X))) && + haveNoCommonBitsSet(Op0, Op1, DL)) { + Value *IncrementY = Builder.CreateAdd(Y, ConstantInt::get(Ty, 1)); + return BinaryOperator::CreateMul(X, IncrementY); } // (A & C) | (B & D) @@ -2635,14 +2678,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // iff (C0 & C1) == 0 and (X & ~C0) == 0 if (match(A, m_c_Or(m_Value(X), m_Specific(B))) && MaskedValueIsZero(X, ~*C0, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(A, C01); } // (A & C0) | ((X | A) & C1) --> (X | A) & (C0 | C1) // iff (C0 & C1) == 0 and (X & ~C1) == 0 if (match(B, m_c_Or(m_Value(X), m_Specific(A))) && MaskedValueIsZero(X, ~*C1, 0, &I)) { - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(B, C01); } // ((X | C2) & C0) | ((X | C3) & C1) --> (X | C2 | C3) & (C0 | C1) @@ -2652,7 +2695,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { match(B, m_Or(m_Specific(X), m_APInt(C3))) && (*C2 & ~*C0).isZero() && (*C3 & ~*C1).isZero()) { Value *Or = Builder.CreateOr(X, *C2 | *C3, "bitfield"); - Constant *C01 = ConstantInt::get(I.getType(), *C0 | *C1); + Constant *C01 = ConstantInt::get(Ty, *C0 | *C1); return BinaryOperator::CreateAnd(Or, C01); } } @@ -2788,13 +2831,20 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *CastedOr = foldCastedBitwiseLogic(I)) return CastedOr; + if (Instruction *Sel = foldBinopOfSextBoolToSelect(I)) + return Sel; + // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>. + // TODO: Move this into foldBinopOfSextBoolToSelect as a more generalized fold + // with binop identity constant. But creating a select with non-constant + // arm may not be reversible due to poison semantics. Is that a good + // canonicalization? if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op1); if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) && A->getType()->isIntOrIntVectorTy(1)) - return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0); + return SelectInst::Create(A, ConstantInt::getAllOnesValue(Ty), Op0); // Note: If we've gotten to the point of visiting the outer OR, then the // inner one couldn't be simplified. If it was a constant, then it won't @@ -2826,7 +2876,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X. 
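The haveNoCommonBitsSet guard above is what lets OR act as ADD, so (mul X, Y) | X collapses to mul X, (Y + 1); an exhaustive i8 check of the precondition and conclusion:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Xi = 0; Xi < 256; ++Xi)
    for (unsigned Yi = 0; Yi < 256; ++Yi) {
      uint8_t X = Xi, Y = Yi;
      uint8_t Mul = (uint8_t)(X * Y);
      if ((Mul & X) == 0)  // the no-common-bits precondition
        assert((uint8_t)(Mul | X) == (uint8_t)(X * (Y + 1)));
    }
}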
{ Value *X, *Y; - Type *Ty = I.getType(); if (match(&I, m_c_Or(m_OneUse(m_AShr( m_NSWSub(m_Value(Y), m_Value(X)), m_SpecificInt(Ty->getScalarSizeInBits() - 1))), @@ -2876,7 +2925,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (match(&I, m_c_Or(m_Add(m_Shl(m_One(), m_Value(X)), m_AllOnes()), m_Shl(m_One(), m_Deferred(X)))) && match(&I, m_c_Or(m_OneUse(m_Value()), m_Value()))) { - Type *Ty = X->getType(); Value *Sub = Builder.CreateSub( ConstantInt::get(Ty, Ty->getScalarSizeInBits() - 1), X); return BinaryOperator::CreateLShr(Constant::getAllOnesValue(Ty), Sub); @@ -3601,6 +3649,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A)))) return BinaryOperator::CreateOr(A, B); + // (~A | B) ^ A --> ~(A & B) + if (match(Op0, m_OneUse(m_c_Or(m_Not(m_Specific(Op1)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op1, B)); + + // A ^ (~A | B) --> ~(A & B) + if (match(Op1, m_OneUse(m_c_Or(m_Not(m_Specific(Op0)), m_Value(B))))) + return BinaryOperator::CreateNot(Builder.CreateAnd(Op0, B)); + // (A | B) ^ (A | C) --> (B ^ C) & ~A -- There are 4 commuted variants. // TODO: Loosen one-use restriction if common operand is a constant. Value *D; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bfa7bfa2290a..7da2669e1d13 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2641,7 +2641,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { ArgNo++; } - assert(ArgNo == Call.arg_size() && "sanity check"); + assert(ArgNo == Call.arg_size() && "Call arguments not processed correctly."); if (!ArgNos.empty()) { AttributeList AS = Call.getAttributes(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ca87477c5d81..33f217659c01 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2771,7 +2771,7 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))), m_Value(Y), m_ConstantInt(IndexC)))) && DestTy->isIntegerTy() && X->getType() == DestTy && - isDesirableIntType(BitWidth)) { + Y->getType()->isIntegerTy() && isDesirableIntType(BitWidth)) { // Adjust for big endian - the LSBs are at the high index. 
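The two xor folds added to visitXor in this hunk reduce to one bit identity, (~A | B) ^ A == ~(A & B); the commuted A ^ (~A | B) form follows by symmetry. Brute-forced for i8:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Ai = 0; Ai < 256; ++Ai)
    for (unsigned Bi = 0; Bi < 256; ++Bi) {
      uint8_t A = Ai, B = Bi;
      assert((uint8_t)((~A | B) ^ A) == (uint8_t)~(A & B));
    }
}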
if (DL.isBigEndian()) IndexC = SrcVTy->getNumElements() - 1 - IndexC; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7a9e177f19da..ed53b88aed61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1894,23 +1895,6 @@ Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1)))); } - // (X & C2) == 0 -> (trunc X) >= 0 - // (X & C2) != 0 -> (trunc X) < 0 - // iff C2 is a power of 2 and it masks the sign bit of a legal integer type. - const APInt *C2; - if (And->hasOneUse() && C.isZero() && match(Y, m_APInt(C2))) { - int32_t ExactLogBase2 = C2->exactLogBase2(); - if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { - Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1); - if (auto *AndVTy = dyn_cast<VectorType>(And->getType())) - NTy = VectorType::get(NTy, AndVTy->getElementCount()); - Value *Trunc = Builder.CreateTrunc(X, NTy); - auto NewPred = - Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT; - return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy)); - } - } - return nullptr; } @@ -2803,7 +2787,8 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, PredB, cast<Constant>(RHS2)); if (!FlippedStrictness) return false; - assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && + "basic correctness failure"); RHS2 = FlippedStrictness->second; // And kind-of perform the result swap. std::swap(Less, Greater); @@ -4614,7 +4599,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, InstCombiner::BuilderTy &Builder) { - const ICmpInst::Predicate Pred = ICmp.getPredicate(); + ICmpInst::Predicate Pred = ICmp.getPredicate(); Value *Op0 = ICmp.getOperand(0), *Op1 = ICmp.getOperand(1); // Try to canonicalize trunc + compare-to-constant into a mask + cmp. @@ -4624,41 +4609,31 @@ static Instruction *foldICmpWithTrunc(ICmpInst &ICmp, if (!match(Op0, m_OneUse(m_Trunc(m_Value(X)))) || !match(Op1, m_APInt(C))) return nullptr; + // This matches patterns corresponding to tests of the signbit as well as: + // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) + // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) + APInt Mask; + if (decomposeBitTestICmp(Op0, Op1, Pred, X, Mask, true /* WithTrunc */)) { + Value *And = Builder.CreateAnd(X, Mask); + Constant *Zero = ConstantInt::getNullValue(X->getType()); + return new ICmpInst(Pred, And, Zero); + } + unsigned SrcBits = X->getType()->getScalarSizeInBits(); - if (Pred == ICmpInst::ICMP_ULT) { - if (C->isPowerOf2()) { - // If C is a power-of-2 (one set bit): - // (trunc X) u< C --> (X & -C) == 0 (are all masked-high-bits clear?) 
- Constant *MaskC = ConstantInt::get(X->getType(), (-*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_EQ, And, Zero); - } + if (Pred == ICmpInst::ICMP_ULT && C->isNegatedPowerOf2()) { // If C is a negative power-of-2 (high-bit mask): // (trunc X) u< C --> (X & C) != C (are any masked-high-bits clear?) - if (C->isNegatedPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), C->zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_NE, And, MaskC); } - if (Pred == ICmpInst::ICMP_UGT) { - // If C is a low-bit-mask (C+1 is a power-of-2): - // (trunc X) u> C --> (X & ~C) != 0 (are any masked-high-bits set?) - if (C->isMask()) { - Constant *MaskC = ConstantInt::get(X->getType(), (~*C).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - Constant *Zero = ConstantInt::getNullValue(X->getType()); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } + if (Pred == ICmpInst::ICMP_UGT && (~*C).isPowerOf2()) { // If C is not-of-power-of-2 (one clear bit): // (trunc X) u> C --> (X & (C+1)) == C+1 (are all masked-high-bits set?) - if ((~*C).isPowerOf2()) { - Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); - Value *And = Builder.CreateAnd(X, MaskC); - return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); - } + Constant *MaskC = ConstantInt::get(X->getType(), (*C + 1).zext(SrcBits)); + Value *And = Builder.CreateAnd(X, MaskC); + return new ICmpInst(ICmpInst::ICMP_EQ, And, MaskC); } return nullptr; diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 72e1b21e8d49..20c75188ec9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -319,6 +319,7 @@ private: Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt); Instruction *foldCastedBitwiseLogic(BinaryOperator &I); + Instruction *foldBinopOfSextBoolToSelect(BinaryOperator &I); Instruction *narrowBinOp(TruncInst &Trunc); Instruction *narrowMaskedBinOp(BinaryOperator &And); Instruction *narrowMathIfNoOverflow(BinaryOperator &I); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp index 7dc516c6fdc3..42ba4a34a5a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -403,7 +403,7 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) { NonNegatedOps.emplace_back(Op); // Just record which operand that was. } assert((NegatedOps.size() + NonNegatedOps.size()) == 2 && - "Internal consistency sanity check."); + "Internal consistency check failed."); // Did we manage to sink negation into both of the operands? if (NegatedOps.size() == 2) // Then we get to keep the `add`! 
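decomposeBitTestICmp now covers the power-of-two and low-bit-mask cases that were spelled out by hand before; the underlying equivalences are easy to brute-force for a trunc from i16 to i8 (a standalone check, not the LLVM helper itself):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    uint8_t T = (uint8_t)X;                    // trunc i16 X to i8
    for (unsigned Shift = 0; Shift < 8; ++Shift) {
      uint8_t C = (uint8_t)(1u << Shift);      // power-of-two constant
      uint16_t MaskLT = (uint16_t)(uint8_t)-C; // zext(-C) back to i16
      assert((T < C) == ((X & MaskLT) == 0));  // (trunc X) u< C <=> (X & -C) == 0

      uint8_t M = (uint8_t)(C - 1);            // low-bit mask
      uint16_t MaskGT = (uint16_t)(uint8_t)~M; // zext(~C) back to i16
      assert((T > M) == ((X & MaskGT) != 0));  // (trunc X) u> C <=> (X & ~C) != 0
    }
  }
}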
return Builder.CreateAdd(NegatedOps[0], NegatedOps[1], diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 4a1e82ae9c1d..518d3952dce5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -246,12 +246,16 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, static unsigned getSelectFoldableOperands(BinaryOperator *I) { switch (I->getOpcode()) { case Instruction::Add: + case Instruction::FAdd: case Instruction::Mul: + case Instruction::FMul: case Instruction::And: case Instruction::Or: case Instruction::Xor: return 3; // Can fold through either operand. case Instruction::Sub: // Can only fold on the amount subtracted. + case Instruction::FSub: + case Instruction::FDiv: // Can only fold on the divisor amount. case Instruction::Shl: // Can only fold on the shift amount. case Instruction::LShr: case Instruction::AShr: diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 47b6dcb67a78..1f81624f79e7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -967,6 +967,29 @@ Value *InstCombinerImpl::dyn_castNegVal(Value *V) const { return nullptr; } +/// A binop with a constant operand and a sign-extended boolean operand may be +/// converted into a select of constants by applying the binary operation to +/// the constant with the two possible values of the extended boolean (0 or -1). +Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { + // TODO: Handle non-commutative binop (constant is operand 0). + // TODO: Handle zext. + // TODO: Peek through 'not' of cast. + Value *BO0 = BO.getOperand(0); + Value *BO1 = BO.getOperand(1); + Value *X; + Constant *C; + if (!match(BO0, m_SExt(m_Value(X))) || !match(BO1, m_ImmConstant(C)) || + !X->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + // bo (sext i1 X), C --> select X, (bo -1, C), (bo 0, C) + Constant *Ones = ConstantInt::getAllOnesValue(BO.getType()); + Constant *Zero = ConstantInt::getNullValue(BO.getType()); + Constant *TVal = ConstantExpr::get(BO.getOpcode(), Ones, C); + Constant *FVal = ConstantExpr::get(BO.getOpcode(), Zero, C); + return SelectInst::Create(X, TVal, FVal); +} + static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, InstCombiner::BuilderTy &Builder) { if (auto *Cast = dyn_cast<CastInst>(&I)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b56329ad76ae..bd2dc8d639fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file is a part of AddressSanitizer, an address sanity checker. +// This file is a part of AddressSanitizer, an address basic correctness +// checker. 
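foldBinopOfSextBoolToSelect above relies on a sign-extended i1 only ever being 0 or -1, so the binop can be pre-evaluated for both constants and selected on the boolean; a minimal i32 illustration:

#include <cassert>
#include <cstdint>

static int32_t sext_i1(bool X) { return X ? -1 : 0; }

int main() {
  const int32_t C = 42;
  for (bool X : {false, true}) {
    int32_t S = sext_i1(X);
    assert((S + C) == (X ? (-1 + C) : (0 + C))); // add (sext X), C
    assert((S & C) == (X ? (-1 & C) : (0 & C))); // and (sext X), C
    assert((S ^ C) == (X ? (-1 ^ C) : (0 ^ C))); // xor (sext X), C
  }
}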
// Details of the algorithm: // https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm // diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 62c265e40dab..8d3bc1383e96 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// // /// \file -/// This file is a part of HWAddressSanitizer, an address sanity checker -/// based on tagged addressing. +/// This file is a part of HWAddressSanitizer, an address basic correctness +/// checker based on tagged addressing. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 36a66e096382..d1d3b8ffdf7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -64,10 +64,10 @@ cl::opt<bool> DoHashBasedCounterSplit( cl::desc("Rename counter variable of a comdat function based on cfg hash"), cl::init(true)); -cl::opt<bool> RuntimeCounterRelocation( - "runtime-counter-relocation", - cl::desc("Enable relocating counters at runtime."), - cl::init(false)); +cl::opt<bool> + RuntimeCounterRelocation("runtime-counter-relocation", + cl::desc("Enable relocating counters at runtime."), + cl::init(false)); cl::opt<bool> ValueProfileStaticAlloc( "vp-static-alloc", @@ -331,8 +331,9 @@ private: // Check whether the loop satisfies the basic conditions needed to perform // Counter Promotions. - bool isPromotionPossible(Loop *LP, - const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { + bool + isPromotionPossible(Loop *LP, + const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) { // We can't insert into a catchswitch. 
if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) { return isa<CatchSwitchInst>(Exit->getTerminator()); @@ -421,13 +422,13 @@ PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { } char InstrProfilingLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_BEGIN(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END( - InstrProfilingLegacyPass, "instrprof", - "Frontend instrumentation-based coverage lowering.", false, false) +INITIALIZE_PASS_END(InstrProfilingLegacyPass, "instrprof", + "Frontend instrumentation-based coverage lowering.", false, + false) ModulePass * llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options, @@ -634,13 +635,9 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { GlobalVariable *Name = Ind->getName(); uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); uint64_t Index = Ind->getIndex()->getZExtValue(); - auto It = ProfileDataMap.find(Name); - if (It == ProfileDataMap.end()) { - PerFunctionProfileData PD; - PD.NumValueSites[ValueKind] = Index + 1; - ProfileDataMap[Name] = PD; - } else if (It->second.NumValueSites[ValueKind] <= Index) - It->second.NumValueSites[ValueKind] = Index + 1; + auto &PD = ProfileDataMap[Name]; + PD.NumValueSites[ValueKind] = + std::max(PD.NumValueSites[ValueKind], (uint32_t)(Index + 1)); } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { @@ -703,14 +700,15 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { LoadInst *LI = dyn_cast<LoadInst>(&I); if (!LI) { IRBuilder<> Builder(&I); - GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName()); + GlobalVariable *Bias = + M->getGlobalVariable(getInstrProfCounterBiasVarName()); if (!Bias) { // Compiler must define this variable when runtime counter relocation // is being used. Runtime has a weak external reference that is used // to check whether that's the case or not. - Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, - Constant::getNullValue(Int64Ty), - getInstrProfCounterBiasVarName()); + Bias = new GlobalVariable( + *M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage, + Constant::getNullValue(Int64Ty), getInstrProfCounterBiasVarName()); Bias->setVisibility(GlobalVariable::HiddenVisibility); // A definition that's weak (linkonce_odr) without being in a COMDAT // section wouldn't lead to link errors, but it would lead to a dead @@ -839,8 +837,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { return false; // Use linker script magic to get data/cnts/name start/end. 
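The computeNumValueSiteCounts change above leans on DenseMap::operator[] value-initializing a missing entry, which collapses the find/insert branches into a single std::max update. The same shape with std::map and a stand-in struct (names here are hypothetical):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct PerFunctionProfileData {
  uint32_t NumValueSites[2] = {0, 0}; // zero-initialized on first access
};

int main() {
  std::map<std::string, PerFunctionProfileData> ProfileDataMap;
  auto Record = [&](const std::string &Name, unsigned Kind, uint64_t Index) {
    auto &PD = ProfileDataMap[Name];  // creates a zeroed entry if missing
    PD.NumValueSites[Kind] = std::max(PD.NumValueSites[Kind], (uint32_t)(Index + 1));
  };
  Record("f", 0, 3);
  Record("f", 0, 1);                  // a lower index never shrinks the count
  assert(ProfileDataMap["f"].NumValueSites[0] == 4);
}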
if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() || - TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || - TT.isOSWindows()) + TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() || TT.isOSWindows()) return false; return true; @@ -849,13 +846,9 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { GlobalVariable * InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalVariable *NamePtr = Inc->getName(); - auto It = ProfileDataMap.find(NamePtr); - PerFunctionProfileData PD; - if (It != ProfileDataMap.end()) { - if (It->second.RegionCounters) - return It->second.RegionCounters; - PD = It->second; - } + auto &PD = ProfileDataMap[NamePtr]; + if (PD.RegionCounters) + return PD.RegionCounters; // Match the linkage and visibility of the name global. Function *Fn = Inc->getParent()->getParent(); @@ -922,6 +915,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setAlignment(Align(8)); MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); + PD.RegionCounters = CounterPtr; auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -1000,9 +994,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(Data); Data->setLinkage(Linkage); - PD.RegionCounters = CounterPtr; PD.DataVar = Data; - ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. CompilerUsedVars.push_back(Data); @@ -1013,7 +1005,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // Collect the referenced names to be used by emitNameData. ReferencedNames.push_back(NamePtr); - return CounterPtr; + return PD.RegionCounters; } void InstrProfiling::emitVNodes() { @@ -1078,8 +1070,8 @@ void InstrProfiling::emitNameData() { } auto &Ctx = M->getContext(); - auto *NamesVal = ConstantDataArray::getString( - Ctx, StringRef(CompressedNameStr), false); + auto *NamesVal = + ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); NamesVar = new GlobalVariable(*M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index f98e39d751f4..180012198c42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -110,7 +110,7 @@ namespace { /// the module. struct ThreadSanitizer { ThreadSanitizer() { - // Sanity check options and warn user. + // Check options and warn user. if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) { errs() << "warning: Option -tsan-compound-read-before-write has no effect " diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 74e4eb07b219..4921209f041b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -94,11 +94,9 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, return false; } else if (const auto *CS = dyn_cast<CallBase>(Inst)) { // For calls, just check the arguments (and not the callee operand). 
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) { - const Value *Op = *OI; + for (const Value *Op : CS->args()) if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; - } return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // Special-case stores, because we don't care about the stored value, just diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index ca9567dc7ac8..a3fd97079b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -52,6 +52,11 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" +static cl::opt<bool> CanonicalizeICmpPredicatesToUnsigned( + "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, + cl::desc("Enables canonicalization of signed relational predicates to " + "unsigned (e.g. sgt => ugt)")); + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -64,7 +69,8 @@ STATISTIC(NumSDivSRemsNarrowed, STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivURemsNarrowed, "Number of udivs/urems whose width was decreased"); -STATISTIC(NumAShrs, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr"); +STATISTIC(NumAShrsRemoved, "Number of ashr removed"); STATISTIC(NumSRems, "Number of srem converted to urem"); STATISTIC(NumSExt, "Number of sext converted to zext"); STATISTIC(NumSICmps, "Number of signed icmp preds simplified to unsigned"); @@ -297,6 +303,9 @@ static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { + if (!CanonicalizeICmpPredicatesToUnsigned) + return false; + // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -376,13 +385,7 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, // ConstantFoldTerminator() as the underlying SwitchInst can be changed. SwitchInstProfUpdateWrapper SI(*I); - APInt Low = - APInt::getSignedMaxValue(Cond->getType()->getScalarSizeInBits()); - APInt High = - APInt::getSignedMinValue(Cond->getType()->getScalarSizeInBits()); - - SwitchInst::CaseIt CI = SI->case_begin(); - for (auto CE = SI->case_end(); CI != CE;) { + for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); LazyValueInfo::Tristate State = LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I, @@ -415,28 +418,9 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, break; } - // Get Lower/Upper bound from switch cases. - Low = APIntOps::smin(Case->getValue(), Low); - High = APIntOps::smax(Case->getValue(), High); - // Increment the case iterator since we didn't delete it. ++CI; } - - // Try to simplify default case as unreachable - if (CI == SI->case_end() && SI->getNumCases() != 0 && - !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg())) { - const ConstantRange SIRange = - LVI->getConstantRange(SI->getCondition(), SI); - - // If the numbered switch cases cover the entire range of the condition, - // then the default case is not reachable. 
- if (SIRange.getSignedMin() == Low && SIRange.getSignedMax() == High && - SI->getNumCases() == High - Low + 1) { - createUnreachableSwitchDefault(SI, &DTU); - Changed = true; - } - } } if (Changed) @@ -688,7 +672,7 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) { ArgNo++; } - assert(ArgNo == CB.arg_size() && "sanity check"); + assert(ArgNo == CB.arg_size() && "Call arguments not processed correctly."); if (ArgNos.empty()) return Changed; @@ -954,10 +938,22 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { if (SDI->getType()->isVectorTy()) return false; + ConstantRange LRange = LVI->getConstantRange(SDI->getOperand(0), SDI); + unsigned OrigWidth = SDI->getType()->getIntegerBitWidth(); + ConstantRange NegOneOrZero = + ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1)); + if (NegOneOrZero.contains(LRange)) { + // ashr of -1 or 0 never changes the value, so drop the whole instruction + ++NumAShrsRemoved; + SDI->replaceAllUsesWith(SDI->getOperand(0)); + SDI->eraseFromParent(); + return true; + } + if (!isNonNegative(SDI->getOperand(0), LVI, SDI)) return false; - ++NumAShrs; + ++NumAShrsConverted; auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1), SDI->getName(), SDI); BO->setDebugLoc(SDI->getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a8ec8bb97970..e0d3a6accadd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -159,52 +159,22 @@ static cl::opt<unsigned> MemorySSAPathCheckLimit( cl::desc("The maximum number of blocks to check when trying to prove that " "all paths to an exit go through a killing block (default = 50)")); +// This flags allows or disallows DSE to optimize MemorySSA during its +// traversal. Note that DSE optimizing MemorySSA may impact other passes +// downstream of the DSE invocation and can lead to issues not being +// reproducible in isolation (i.e. when MemorySSA is built from scratch). In +// those cases, the flag can be used to check if DSE's MemorySSA optimizations +// impact follow-up passes. +static cl::opt<bool> + OptimizeMemorySSA("dse-optimize-memoryssa", cl::init(true), cl::Hidden, + cl::desc("Allow DSE to optimize memory accesses.")); + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// Does this instruction write some memory? This only returns true for things -/// that we can analyze with other helpers below. 
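The new early-out in processAShr rests on a one-line arithmetic fact: an arithmetic shift of -1 or 0 by any in-range amount returns the operand unchanged. Checked here for i32 (signed right shift is arithmetic on the usual targets and guaranteed to be since C++20):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t V : {-1, 0})
    for (int Sh = 0; Sh < 32; ++Sh)
      assert((V >> Sh) == V); // ashr of -1 or 0 never changes the value
}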
-static bool hasAnalyzableMemoryWrite(Instruction *I, - const TargetLibraryInfo &TLI) { - if (isa<StoreInst>(I)) - return true; - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::init_trampoline: - case Intrinsic::lifetime_end: - case Intrinsic::masked_store: - return true; - } - } - if (auto *CB = dyn_cast<CallBase>(I)) { - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strcpy: - case LibFunc_strncpy: - case LibFunc_strcat: - case LibFunc_strncat: - return true; - default: - return false; - } - } - } - return false; -} - /// If the value of this instruction and the memory it writes to is unused, may /// we delete this instruction? static bool isRemovable(Instruction *I) { @@ -214,7 +184,7 @@ static bool isRemovable(Instruction *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate"); + default: llvm_unreachable("Does not have LocForWrite"); case Intrinsic::lifetime_end: // Never remove dead lifetime_end's, e.g. because it is followed by a // free. @@ -296,6 +266,7 @@ enum OverwriteResult { OW_End, OW_PartialEarlierWithFullLater, OW_MaybePartial, + OW_None, OW_Unknown }; @@ -841,7 +812,7 @@ struct DSEState { /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. - DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs; + MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs; // Class contains self-reference, make sure it's not copied/moved. DSEState(const DSEState &) = delete; @@ -889,6 +860,7 @@ struct DSEState { /// Return OW_MaybePartial if \p KillingI does not completely overwrite /// \p DeadI, but they both write to the same underlying object. In that /// case, use isPartialOverwrite to check if \p KillingI partially overwrites + /// \p DeadI. Returns 'OR_None' if \p KillingI is known to not overwrite the /// \p DeadI. Returns 'OW_Unknown' if nothing can be determined. OverwriteResult isOverwrite(const Instruction *KillingI, const Instruction *DeadI, @@ -951,8 +923,16 @@ struct DSEState { // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. - if (DeadUndObj != KillingUndObj) + if (DeadUndObj != KillingUndObj) { + // Non aliasing stores to different objects don't overlap. Note that + // if the killing store is known to overwrite whole object (out of + // bounds access overwrites whole object as well) then it is assumed to + // completely overwrite any store to the same object even if they don't + // actually alias (see next check). + if (AAR == AliasResult::NoAlias) + return OW_None; return OW_Unknown; + } // If the KillingI store is to a recognizable object, get its size. uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F); @@ -1006,9 +986,8 @@ struct DSEState { return OW_MaybePartial; } - // Can reach here only if accesses are known not to overlap. There is no - // dedicated code to indicate no overlap so signal "unknown". - return OW_Unknown; + // Can reach here only if accesses are known not to overlap. 
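The OW_None result threaded through isOverwrite above separates "provably no overlap" from "unknown". A reduced interval model of the three-way classification (the real query also consults alias analysis and handles unknown sizes; classify is only an illustrative stand-in):

#include <cassert>
#include <cstdint>

enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_None };

static OverwriteResult classify(int64_t KillOff, uint64_t KillSize,
                                int64_t DeadOff, uint64_t DeadSize) {
  int64_t KillEnd = KillOff + (int64_t)KillSize;
  int64_t DeadEnd = DeadOff + (int64_t)DeadSize;
  if (KillOff <= DeadOff && KillEnd >= DeadEnd)
    return OW_Complete;     // the killing write covers the dead write entirely
  if (KillEnd <= DeadOff || DeadEnd <= KillOff)
    return OW_None;         // disjoint ranges never overlap
  return OW_MaybePartial;   // overlapping but not covering
}

int main() {
  assert(classify(0, 8, 0, 4) == OW_Complete);
  assert(classify(8, 4, 0, 4) == OW_None);
  assert(classify(2, 4, 0, 4) == OW_MaybePartial);
}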
+ return OW_None; } bool isInvisibleToCallerAfterRet(const Value *V) { @@ -1304,6 +1283,15 @@ struct DSEState { Instruction *KillingI = KillingDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " trying to get dominating access\n"); + // Only optimize defining access of KillingDef when directly starting at its + // defining access. The defining access also must only access KillingLoc. At + // the moment we only support instructions with a single write location, so + // it should be sufficient to disable optimizations for instructions that + // also read from memory. + bool CanOptimize = OptimizeMemorySSA && + KillingDef->getDefiningAccess() == StartAccess && + !KillingI->mayReadFromMemory(); + // Find the next clobbering Mod access for DefLoc, starting at StartAccess. Optional<MemoryLocation> CurrentLoc; for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) { @@ -1345,8 +1333,10 @@ struct DSEState { Instruction *CurrentI = CurrentDef->getMemoryInst(); if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj), - TLI)) + TLI)) { + CanOptimize = false; continue; + } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing @@ -1380,15 +1370,13 @@ struct DSEState { return None; } - // If Current cannot be analyzed or is not removable, check the next - // candidate. - if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) - continue; - - // If Current does not have an analyzable write location, skip it + // If Current does not have an analyzable write location or is not + // removable, skip it. CurrentLoc = getLocForWriteEx(CurrentI); - if (!CurrentLoc) + if (!CurrentLoc || !isRemovable(CurrentI)) { + CanOptimize = false; continue; + } // AliasAnalysis does not account for loops. Limit elimination to // candidates for which we can guarantee they always store to the same @@ -1396,6 +1384,7 @@ struct DSEState { if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) { LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n"); WalkerStepLimit -= 1; + CanOptimize = false; continue; } @@ -1403,16 +1392,32 @@ struct DSEState { // If the killing def is a memory terminator (e.g. lifetime.end), check // the next candidate if the current Current does not write the same // underlying object as the terminator. - if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) + if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI)) { + CanOptimize = false; continue; + } } else { int64_t KillingOffset = 0; int64_t DeadOffset = 0; auto OR = isOverwrite(KillingI, CurrentI, KillingLoc, *CurrentLoc, KillingOffset, DeadOffset); + if (CanOptimize) { + // CurrentDef is the earliest write clobber of KillingDef. Use it as + // optimized access. Do not optimize if CurrentDef is already the + // defining access of KillingDef. + if (CurrentDef != KillingDef->getDefiningAccess() && + (OR == OW_Complete || OR == OW_MaybePartial)) + KillingDef->setOptimized(CurrentDef); + + // Once a may-aliasing def is encountered do not set an optimized + // access. + if (OR != OW_None) + CanOptimize = false; + } + // If Current does not write to the same object as KillingDef, check // the next candidate. - if (OR == OW_Unknown) + if (OR == OW_Unknown || OR == OW_None) continue; else if (OR == OW_MaybePartial) { // If KillingDef only partially overwrites Current, check the next @@ -1421,6 +1426,7 @@ struct DSEState { // which are less likely to be removable in the end. if (PartialLimit <= 1) { WalkerStepLimit -= 1; + LLVM_DEBUG(dbgs() << " ... 
reached partial limit ... continue with next access\n"); continue; } PartialLimit -= 1; @@ -1922,7 +1928,14 @@ struct DSEState { if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || !isRemovable(Def->getMemoryInst())) continue; - auto *UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); + MemoryDef *UpperDef; + // To conserve compile-time, we avoid walking to the next clobbering def. + // Instead, we just try to get the optimized access, if it exists. DSE + // will try to optimize defs during the earlier traversal. + if (Def->isOptimized()) + UpperDef = dyn_cast<MemoryDef>(Def->getOptimized()); + else + UpperDef = dyn_cast<MemoryDef>(Def->getDefiningAccess()); if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index ae2fe2767074..7001d330fce0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1951,7 +1951,6 @@ bool IndVarSimplify::run(Loop *L) { // using it. if (!DisableLFTR) { BasicBlock *PreHeader = L->getLoopPreheader(); - BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -1987,7 +1986,7 @@ bool IndVarSimplify::run(Loop *L) { // Avoid high cost expansions. Note: This heuristic is questionable in // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, - TTI, PreHeaderBR)) + TTI, PreHeader->getTerminator())) continue; // Check preconditions for proper SCEVExpander operation. SCEV does not diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index bf714d167670..6f97f3e93123 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -486,7 +486,7 @@ bool LoopInvariantCodeMotion::runOnLoop( // Check that neither this loop nor its parent have had LCSSA broken. LICM is // specifically moving instructions across the loop boundary and so it is - // especially in need of sanity checking here. + // especially in need of basic functional correctness checking here. assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!"); assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) && "Parent loop not left in LCSSA form after LICM!"); @@ -1860,6 +1860,7 @@ class LoopPromoter : public LoadAndStorePromoter { bool UnorderedAtomic; AAMDNodes AATags; ICFLoopSafetyInfo &SafetyInfo; + bool CanInsertStoresInExitBlocks; // We're about to add a use of V in a loop exit block. 
Insert an LCSSA phi // (if legal) if doing so would add an out-of-loop use to an instruction @@ -1886,12 +1887,13 @@ public: SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC, MemorySSAUpdater *MSSAU, LoopInfo &li, DebugLoc dl, Align Alignment, bool UnorderedAtomic, const AAMDNodes &AATags, - ICFLoopSafetyInfo &SafetyInfo) + ICFLoopSafetyInfo &SafetyInfo, bool CanInsertStoresInExitBlocks) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP), PredCache(PIC), MSSAU(MSSAU), LI(li), DL(std::move(dl)), Alignment(Alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags), - SafetyInfo(SafetyInfo) {} + SafetyInfo(SafetyInfo), + CanInsertStoresInExitBlocks(CanInsertStoresInExitBlocks) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1903,7 +1905,7 @@ public: return PointerMustAliases.count(Ptr); } - void doExtraRewritesBeforeFinalDeletion() override { + void insertStoresInLoopExitBlocks() { // Insert stores after in the loop exit blocks. Each exit block gets a // store of the live-out values that feed them. Since we've already told // the SSA updater about the defs in the loop and the preheader @@ -1937,10 +1939,21 @@ public: } } + void doExtraRewritesBeforeFinalDeletion() override { + if (CanInsertStoresInExitBlocks) + insertStoresInLoopExitBlocks(); + } + void instructionDeleted(Instruction *I) const override { SafetyInfo.removeInstruction(I); MSSAU->removeMemoryAccess(I); } + + bool shouldDelete(Instruction *I) const override { + if (isa<StoreInst>(I)) + return CanInsertStoresInExitBlocks; + return true; + } }; bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L, @@ -2039,6 +2052,7 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + bool FoundLoadToPromote = false; SmallVector<Instruction *, 64> LoopUses; @@ -2067,16 +2081,11 @@ bool llvm::promoteLoopAccessesToScalars( IsKnownThreadLocalObject = !isa<AllocaInst>(Object); } - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in + // Check that all accesses to pointers in the aliass set use the same type. + // We cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. + Type *AccessTy = nullptr; for (Value *ASIV : PointerMustAliases) { - // Check that all of the pointers in the alias set have the same type. We - // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. - if (SomePtr->getType() != ASIV->getType()) - return false; - for (User *U : ASIV->users()) { // Ignore instructions that are outside the loop. Instruction *UI = dyn_cast<Instruction>(U); @@ -2091,6 +2100,7 @@ bool llvm::promoteLoopAccessesToScalars( SawUnorderedAtomic |= Load->isAtomic(); SawNotAtomic |= !Load->isAtomic(); + FoundLoadToPromote = true; Align InstAlignment = Load->getAlign(); @@ -2153,6 +2163,11 @@ bool llvm::promoteLoopAccessesToScalars( } else return false; // Not a load or store. + if (!AccessTy) + AccessTy = getLoadStoreType(UI); + else if (AccessTy != getLoadStoreType(UI)) + return false; + // Merge the AA tags. if (LoopUses.empty()) { // On the first load/store, just take its AA tags. @@ -2175,9 +2190,7 @@ bool llvm::promoteLoopAccessesToScalars( // If we're inserting an atomic load in the preheader, we must be able to // lower it. 
We're only guaranteed to be able to lower naturally aligned // atomics. - auto *SomePtrElemType = SomePtr->getType()->getPointerElementType(); - if (SawUnorderedAtomic && - Alignment < MDL.getTypeStoreSize(SomePtrElemType)) + if (SawUnorderedAtomic && Alignment < MDL.getTypeStoreSize(AccessTy)) return false; // If we couldn't prove we can hoist the load, bail. @@ -2199,13 +2212,20 @@ bool llvm::promoteLoopAccessesToScalars( } } - // If we've still failed to prove we can sink the store, give up. - if (!SafeToInsertStore) + // If we've still failed to prove we can sink the store, hoist the load + // only, if possible. + if (!SafeToInsertStore && !FoundLoadToPromote) + // If we cannot hoist the load either, give up. return false; - // Otherwise, this is safe to promote, lets do it! - LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr - << '\n'); + // Lets do the promotion! + if (SafeToInsertStore) + LLVM_DEBUG(dbgs() << "LICM: Promoting load/store of the value: " << *SomePtr + << '\n'); + else + LLVM_DEBUG(dbgs() << "LICM: Promoting load of the value: " << *SomePtr + << '\n'); + ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) @@ -2224,13 +2244,14 @@ bool llvm::promoteLoopAccessesToScalars( SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, MSSAU, *LI, DL, - Alignment, SawUnorderedAtomic, AATags, *SafetyInfo); + Alignment, SawUnorderedAtomic, AATags, *SafetyInfo, + SafeToInsertStore); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. LoadInst *PreheaderLoad = new LoadInst( - SomePtr->getType()->getPointerElementType(), SomePtr, - SomePtr->getName() + ".promoted", Preheader->getTerminator()); + AccessTy, SomePtr, SomePtr->getName() + ".promoted", + Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 3df4cfe8e4c1..6c783848432b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -49,9 +49,17 @@ void PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, LPMUpdater &>::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { - for (unsigned Idx = 0, Size = LoopPasses.size(); Idx != Size; ++Idx) { - auto *P = LoopPasses[Idx].get(); - P->printPipeline(OS, MapClassName2PassName); + assert(LoopPasses.size() + LoopNestPasses.size() == IsLoopNestPass.size()); + + unsigned IdxLP = 0, IdxLNP = 0; + for (unsigned Idx = 0, Size = IsLoopNestPass.size(); Idx != Size; ++Idx) { + if (IsLoopNestPass[Idx]) { + auto *P = LoopNestPasses[IdxLNP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } else { + auto *P = LoopPasses[IdxLP++].get(); + P->printPipeline(OS, MapClassName2PassName); + } if (Idx + 1 < Size) OS << ","; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index a87843d658a9..728d63fe2847 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ 
-256,8 +256,8 @@ private: } } - // Sanity check: amount of dead and live loop blocks should match the total - // number of blocks in loop. + // Amount of dead and live loop blocks should match the total number of + // blocks in loop. assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() && "Malformed block sets?"); @@ -305,7 +305,6 @@ private: BlocksInLoopAfterFolding.insert(BB); } - // Sanity check: header must be in loop. assert(BlocksInLoopAfterFolding.count(L.getHeader()) && "Header not in loop?"); assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() && diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 67702520511b..39c8b65968aa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -806,28 +806,27 @@ static Optional<unsigned> shouldFullUnroll( ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + assert(FullUnrollTripCount && "should be non-zero!"); - if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { - // When computing the unrolled size, note that BEInsns are not replicated - // like the rest of the loop body. - if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { - return FullUnrollTripCount; + if (FullUnrollTripCount > UP.FullUnrollMaxCount) + return None; - } else { - // The loop isn't that small, but we still can fully unroll it if that - // helps to remove a significant number of instructions. - // To check that, run additional analysis on the loop. - if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, EphValues, TTI, - UP.Threshold * UP.MaxPercentThresholdBoost / 100, - UP.MaxIterationsCountToAnalyze)) { - unsigned Boost = - getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); - if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - return FullUnrollTripCount; - } - } - } + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. + if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) + return FullUnrollTripCount; + + // The loop isn't that small, but we still can fully unroll it if that + // helps to remove a significant number of instructions. + // To check that, run additional analysis on the loop. 
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( + L, FullUnrollTripCount, DT, SE, EphValues, TTI, + UP.Threshold * UP.MaxPercentThresholdBoost / 100, + UP.MaxIterationsCountToAnalyze)) { + unsigned Boost = + getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); + if (Cost->UnrolledCost < UP.Threshold * Boost / 100) + return FullUnrollTripCount; } return None; } @@ -837,51 +836,48 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { + if (!TripCount) + return None; + + if (!UP.Partial) { + LLVM_DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + return 0; + } unsigned count = UP.Count; - if (TripCount) { - if (!UP.Partial) { - LLVM_DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - count = 0; - return count; - } - if (count == 0) - count = TripCount; - if (UP.PartialThreshold != NoThreshold) { - // Reduce unroll count to be modulo of TripCount for partial unrolling. - if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / - (LoopSize - UP.BEInsns); - if (count > UP.MaxCount) - count = UP.MaxCount; - while (count != 0 && TripCount % count != 0) - count--; - if (UP.AllowRemainder && count <= 1) { - // If there is no Count that is modulo of TripCount, set Count to - // largest power-of-two factor that satisfies the threshold limit. - // As we'll create fixup loop, do the type of unrolling only if - // remainder loop is allowed. - count = UP.DefaultUnrollRuntimeCount; - while (count != 0 && - UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) - count >>= 1; - } - if (count < 2) { - count = 0; - } - } else { - count = TripCount; - } + if (count == 0) + count = TripCount; + if (UP.PartialThreshold != NoThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. + if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / + (LoopSize - UP.BEInsns); if (count > UP.MaxCount) count = UP.MaxCount; - - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); - - return count; + while (count != 0 && TripCount % count != 0) + count--; + if (UP.AllowRemainder && count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. + // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + count = UP.DefaultUnrollRuntimeCount; + while (count != 0 && + UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) + count >>= 1; + } + if (count < 2) { + count = 0; + } + } else { + count = TripCount; } + if (count > UP.MaxCount) + count = UP.MaxCount; - // if didn't return until here, should continue to other priorties - return None; + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n"); + + return count; } // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. 
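A minimal standalone sketch of the count refinement that the reworked shouldPartialUnroll performs after its threshold check: prefer an unroll count that evenly divides the trip count, otherwise (when a remainder loop is allowed) fall back to the largest power-of-two count whose unrolled size still fits the partial threshold. The function and parameter names below are illustrative stand-ins, and the size callback replaces UnrollCostEstimator::getUnrolledLoopSize; this is not the patch's own code.

#include <cstdint>
#include <functional>

// Refine a tentative partial-unroll count. Returns 0 when partial unrolling
// is not worthwhile (count would be < 2).
unsigned refinePartialCount(unsigned Count, unsigned TripCount,
                            unsigned MaxCount, unsigned DefaultRuntimeCount,
                            uint64_t PartialThreshold, bool AllowRemainder,
                            const std::function<uint64_t(unsigned)> &UnrolledSize) {
  if (Count > MaxCount)
    Count = MaxCount;
  // Prefer a count that divides the trip count, so no remainder loop is
  // needed.
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  if (AllowRemainder && Count <= 1) {
    // No divisor worked; take the largest power-of-two count that still
    // satisfies the threshold and accept the fixup (remainder) loop.
    Count = DefaultRuntimeCount;
    while (Count != 0 && UnrolledSize(Count) > PartialThreshold)
      Count >>= 1;
  }
  if (Count < 2)
    Count = 0;
  if (Count > MaxCount)
    Count = MaxCount;
  return Count;
}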
@@ -900,7 +896,6 @@ bool llvm::computeUnrollCount( TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { UnrollCostEstimator UCE(*L, LoopSize); - Optional<unsigned> UnrollFactor; const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; const bool PragmaFullUnroll = hasUnrollFullPragma(L); @@ -926,9 +921,8 @@ bool llvm::computeUnrollCount( // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. // 2nd priority is unroll count set by pragma. - UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, + UCE, UP)) { UP.Count = *UnrollFactor; if (UserUnrollCount || (PragmaCount > 0)) { @@ -948,11 +942,20 @@ bool llvm::computeUnrollCount( } } - // 3rd priority is full unroll count. - // Full unroll makes sense only when TripCount or its upper bound could be - // statically calculated. - // Also we need to check if we exceed FullUnrollMaxCount. + // 3rd priority is exact full unrolling. This will eliminate all copies + // of some exit test. + UP.Count = 0; + if (TripCount) { + UP.Count = TripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + TripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = false; + return ExplicitUnroll; + } + } + // 4th priority is bounded unrolling. // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -961,37 +964,21 @@ bool llvm::computeUnrollCount( // number of loop tests goes up which may end up being worse on targets with // constrained branch predictor resources so is controlled by an option.) // In addition we only unroll small upper bounds. - unsigned FullUnrollMaxTripCount = MaxTripCount; - if (!(UP.UpperBound || MaxOrZero) || - FullUnrollMaxTripCount > UnrollMaxUpperBound) - FullUnrollMaxTripCount = 0; - - // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only - // compute the former when the latter is zero. - unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) && - "ExtractTripCount and UnrollByMaxCount cannot both be non zero."); - - unsigned FullUnrollTripCount = - ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; - UP.Count = FullUnrollTripCount; - - UnrollFactor = - shouldFullUnroll(L, TTI, DT, SE, EphValues, FullUnrollTripCount, UCE, UP); - - // if shouldFullUnroll can do the unrolling, some side parameteres should be - // set - if (UnrollFactor) { - UP.Count = *UnrollFactor; - UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); - TripCount = FullUnrollTripCount; - TripMultiple = UP.UpperBound ? 1 : TripMultiple; - return ExplicitUnroll; - } else { - UP.Count = FullUnrollTripCount; + // Note that the cost of bounded unrolling is always strictly greater than + // cost of exact full unrolling. As such, if we have an exact count and + // found it unprofitable, we'll never chose to bounded unroll. + if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && + MaxTripCount <= UnrollMaxUpperBound) { + UP.Count = MaxTripCount; + if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, + MaxTripCount, UCE, UP)) { + UP.Count = *UnrollFactor; + UseUpperBound = true; + return ExplicitUnroll; + } } - // 4th priority is loop peeling. + // 5th priority is loop peeling. 
computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; @@ -1004,11 +991,9 @@ bool llvm::computeUnrollCount( if (TripCount) UP.Partial |= ExplicitUnroll; - // 5th priority is partial unrolling. + // 6th priority is partial unrolling. // Try partial unroll only when TripCount could be statically calculated. - UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP); - - if (UnrollFactor) { + if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) { UP.Count = *UnrollFactor; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && @@ -1049,7 +1034,7 @@ bool llvm::computeUnrollCount( "because loop has a runtime trip count."; }); - // 6th priority is runtime unrolling. + // 7th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. if (hasRuntimeUnrollDisablePragma(L)) { UP.Count = 0; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index b0fb8daaba8f..c354fa177a60 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -494,7 +494,7 @@ static bool LinearizeExprTree(Instruction *I, SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG - SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme. + SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair<Instruction*, APInt> P = Worklist.pop_back_val(); @@ -2313,11 +2313,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; Ops.reserve(Tree.size()); - for (unsigned i = 0, e = Tree.size(); i != e; ++i) { - RepeatedValue E = Tree[i]; - Ops.append(E.second.getZExtValue(), - ValueEntry(getRank(E.first), E.first)); - } + for (const RepeatedValue &E : Tree) + Ops.append(E.second.getZExtValue(), ValueEntry(getRank(E.first), E.first)); LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 86d3620c312e..3799d2dd1cf2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -227,8 +227,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, unsigned IterCnt = 0; (void)IterCnt; while (LocalChange) { - assert(IterCnt++ < 1000 && - "Sanity: iterative simplification didn't converge!"); + assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!"); LocalChange = false; // Loop over all of the basic blocks and remove them if they are unneeded. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 6469c899feea..d6d6b1a7fa09 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -235,22 +235,26 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // These dominator edges will be redirected from Pred. 
std::vector<DominatorTree::UpdateType> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB)); + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenSuccs; SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB), succ_end(PredBB)); - Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1); + Updates.reserve(Updates.size() + 2 * succ_size(BB) + 1); // Add insert edges first. Experimentally, for the particular case of two // blocks that can be merged, with a single successor and single predecessor // respectively, it is beneficial to have all insert updates first. Deleting // edges first may lead to unreachable blocks, followed by inserting edges // making the blocks reachable again. Such DT updates lead to high compile // times. We add inserts before deletes here to reduce compile time. - for (BasicBlock *SuccOfBB : SuccsOfBB) + for (BasicBlock *SuccOfBB : successors(BB)) // This successor of BB may already be a PredBB's successor. if (!SuccsOfPredBB.contains(SuccOfBB)) - Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); - for (BasicBlock *SuccOfBB : SuccsOfBB) - Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB}); + SeenSuccs.clear(); + for (BasicBlock *SuccOfBB : successors(BB)) + if (SeenSuccs.insert(SuccOfBB).second) + Updates.push_back({DominatorTree::Delete, BB, SuccOfBB}); Updates.push_back({DominatorTree::Delete, PredBB, BB}); } @@ -804,14 +808,14 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt, if (DTU) { SmallVector<DominatorTree::UpdateType, 8> Updates; // Old dominates New. New node dominates all other nodes dominated by Old. - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New), - succ_end(New)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld; Updates.push_back({DominatorTree::Insert, Old, New}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfOld.size()); - for (BasicBlock *UniqueSuccessorOfOld : UniqueSuccessorsOfOld) { - Updates.push_back({DominatorTree::Insert, New, UniqueSuccessorOfOld}); - Updates.push_back({DominatorTree::Delete, Old, UniqueSuccessorOfOld}); - } + Updates.reserve(Updates.size() + 2 * succ_size(New)); + for (BasicBlock *SuccessorOfOld : successors(New)) + if (UniqueSuccessorsOfOld.insert(SuccessorOfOld).second) { + Updates.push_back({DominatorTree::Insert, New, SuccessorOfOld}); + Updates.push_back({DominatorTree::Delete, Old, SuccessorOfOld}); + } DTU->applyUpdates(Updates); } else if (DT) @@ -870,14 +874,14 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt, SmallVector<DominatorTree::UpdateType, 8> DTUpdates; // New dominates Old. The predecessor nodes of the Old node dominate // New node. 
- SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld(pred_begin(New), - pred_end(New)); + SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld; DTUpdates.push_back({DominatorTree::Insert, New, Old}); - DTUpdates.reserve(DTUpdates.size() + 2 * UniquePredecessorsOfOld.size()); - for (BasicBlock *UniquePredecessorOfOld : UniquePredecessorsOfOld) { - DTUpdates.push_back({DominatorTree::Insert, UniquePredecessorOfOld, New}); - DTUpdates.push_back({DominatorTree::Delete, UniquePredecessorOfOld, Old}); - } + DTUpdates.reserve(DTUpdates.size() + 2 * pred_size(New)); + for (BasicBlock *PredecessorOfOld : predecessors(New)) + if (UniquePredecessorsOfOld.insert(PredecessorOfOld).second) { + DTUpdates.push_back({DominatorTree::Insert, PredecessorOfOld, New}); + DTUpdates.push_back({DominatorTree::Delete, PredecessorOfOld, Old}); + } DTU->applyUpdates(DTUpdates); @@ -910,13 +914,14 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, } else { // Split block expects NewBB to have a non-empty set of predecessors. SmallVector<DominatorTree::UpdateType, 8> Updates; - SmallPtrSet<BasicBlock *, 8> UniquePreds(Preds.begin(), Preds.end()); + SmallPtrSet<BasicBlock *, 8> UniquePreds; Updates.push_back({DominatorTree::Insert, NewBB, OldBB}); - Updates.reserve(Updates.size() + 2 * UniquePreds.size()); - for (auto *UniquePred : UniquePreds) { - Updates.push_back({DominatorTree::Insert, UniquePred, NewBB}); - Updates.push_back({DominatorTree::Delete, UniquePred, OldBB}); - } + Updates.reserve(Updates.size() + 2 * Preds.size()); + for (auto *Pred : Preds) + if (UniquePreds.insert(Pred).second) { + Updates.push_back({DominatorTree::Insert, Pred, NewBB}); + Updates.push_back({DominatorTree::Delete, Pred, OldBB}); + } DTU->applyUpdates(Updates); } } else if (DT) { @@ -1376,14 +1381,14 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore, BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); if (DTU) { - SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail), - succ_end(Tail)); + SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead; Updates.push_back({DominatorTree::Insert, Head, Tail}); - Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfHead.size()); - for (BasicBlock *UniqueSuccessorOfHead : UniqueSuccessorsOfHead) { - Updates.push_back({DominatorTree::Insert, Tail, UniqueSuccessorOfHead}); - Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead}); - } + Updates.reserve(Updates.size() + 2 * succ_size(Tail)); + for (BasicBlock *SuccessorOfHead : successors(Tail)) + if (UniqueSuccessorsOfHead.insert(SuccessorOfHead).second) { + Updates.push_back({DominatorTree::Insert, Tail, SuccessorOfHead}); + Updates.push_back({DominatorTree::Delete, Head, SuccessorOfHead}); + } } Instruction *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 957935398972..580cfd80141e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -452,18 +452,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setWillReturn(F); + LLVM_FALLTHROUGH; + case LibFunc_memcpy_chk: Changed |= setDoesNotThrow(F); Changed |= setOnlyAccessesArgMemory(F); - 
Changed |= setWillReturn(F); Changed |= setDoesNotAlias(F, 0); Changed |= setOnlyWritesMemory(F, 0); Changed |= setDoesNotAlias(F, 1); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; - case LibFunc_memcpy_chk: - Changed |= setDoesNotThrow(F); - return Changed; case LibFunc_memalign: Changed |= setOnlyAccessesInaccessibleMemory(F); Changed |= setRetNoUndef(F); @@ -1018,9 +1017,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 0); Changed |= setDoesNotCapture(F, 1); return Changed; - // TODO: add LibFunc entries for: - // case LibFunc_memset_pattern4: - // case LibFunc_memset_pattern8: + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: case LibFunc_memset_pattern16: Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); @@ -1029,10 +1027,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_memset: - Changed |= setOnlyAccessesArgMemory(F); Changed |= setWillReturn(F); - Changed |= setDoesNotThrow(F); + LLVM_FALLTHROUGH; + case LibFunc_memset_chk: + Changed |= setOnlyAccessesArgMemory(F); Changed |= setOnlyWritesMemory(F, 0); + Changed |= setDoesNotThrow(F); return Changed; // int __nvvm_reflect(const char *) case LibFunc_nvvm_reflect: diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp index 200deca4b317..57c273a0e3c5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -135,10 +135,18 @@ std::unique_ptr<Module> llvm::CloneModule( // Similarly, copy over function bodies now... // for (const Function &I : M) { - if (I.isDeclaration()) + Function *F = cast<Function>(VMap[&I]); + + if (I.isDeclaration()) { + // Copy over metadata for declarations since we're not doing it below in + // CloneFunctionInto(). + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + I.getAllMetadata(MDs); + for (auto MD : MDs) + F->addMetadata(MD.first, *MapMetadata(MD.second, VMap)); continue; + } - Function *F = cast<Function>(VMap[&I]); if (!ShouldCloneDefinition(&I)) { // Skip after setting the correct linkage for an external reference. 
F->setLinkage(GlobalValue::ExternalLinkage); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 4dbcbf80d3da..7c310f16d46e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -74,7 +74,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, {}, {}, nullptr, "widenable_cond"); CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, "exiplicit_guard_cond")); - assert(isWidenableBranch(CheckBI) && "sanity check"); + assert(isWidenableBranch(CheckBI) && "Branch must be widenable."); } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index f4776589910f..997667810580 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1218,10 +1218,9 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) { if (!RI || !isa<CallBase>(RI->getOperand(0))) continue; auto *RetVal = cast<CallBase>(RI->getOperand(0)); - // Sanity check that the cloned RetVal exists and is a call, otherwise we - // cannot add the attributes on the cloned RetVal. - // Simplification during inlining could have transformed the cloned - // instruction. + // Check that the cloned RetVal exists and is a call, otherwise we cannot + // add the attributes on the cloned RetVal. Simplification during inlining + // could have transformed the cloned instruction. auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal)); if (!NewRetVal) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 74ab37fadf36..ec926b1f5a94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -529,8 +529,8 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive( std::function<void(Value *)> AboutToDeleteCallback) { unsigned S = 0, E = DeadInsts.size(), Alive = 0; for (; S != E; ++S) { - auto *I = cast<Instruction>(DeadInsts[S]); - if (!isInstructionTriviallyDead(I)) { + auto *I = dyn_cast<Instruction>(DeadInsts[S]); + if (!I || !isInstructionTriviallyDead(I)) { DeadInsts[S] = nullptr; ++Alive; } @@ -760,15 +760,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { - SmallPtrSet<BasicBlock *, 2> PredsOfPredBB(pred_begin(PredBB), - pred_end(PredBB)); - Updates.reserve(Updates.size() + 2 * PredsOfPredBB.size() + 1); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 2> SeenPreds; + Updates.reserve(Updates.size() + 2 * pred_size(PredBB) + 1); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) // This predecessor of PredBB may already have DestBB as a successor. 
if (PredOfPredBB != PredBB) - Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); - for (BasicBlock *PredOfPredBB : PredsOfPredBB) - Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB}); + SeenPreds.clear(); + for (BasicBlock *PredOfPredBB : predecessors(PredBB)) + if (SeenPreds.insert(PredOfPredBB).second) + Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB}); Updates.push_back({DominatorTree::Delete, PredBB, DestBB}); } @@ -1096,16 +1099,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, SmallVector<DominatorTree::UpdateType, 32> Updates; if (DTU) { + // To avoid processing the same predecessor more than once. + SmallPtrSet<BasicBlock *, 8> SeenPreds; // All predecessors of BB will be moved to Succ. - SmallPtrSet<BasicBlock *, 8> PredsOfBB(pred_begin(BB), pred_end(BB)); SmallPtrSet<BasicBlock *, 8> PredsOfSucc(pred_begin(Succ), pred_end(Succ)); - Updates.reserve(Updates.size() + 2 * PredsOfBB.size() + 1); - for (auto *PredOfBB : PredsOfBB) + Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1); + for (auto *PredOfBB : predecessors(BB)) // This predecessor of BB may already have Succ as a successor. if (!PredsOfSucc.contains(PredOfBB)) - Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); - for (auto *PredOfBB : PredsOfBB) - Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Insert, PredOfBB, Succ}); + SeenPreds.clear(); + for (auto *PredOfBB : predecessors(BB)) + if (SeenPreds.insert(PredOfBB).second) + Updates.push_back({DominatorTree::Delete, PredOfBB, BB}); Updates.push_back({DominatorTree::Delete, BB, Succ}); } @@ -2190,26 +2197,6 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}}); } -void llvm::createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - auto *BB = Switch->getParent(); - auto *OrigDefaultBlock = Switch->getDefaultDest(); - OrigDefaultBlock->removePredecessor(BB); - BasicBlock *NewDefaultBlock = BasicBlock::Create( - BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), - OrigDefaultBlock); - new UnreachableInst(Switch->getContext(), NewDefaultBlock); - Switch->setDefaultDest(&*NewDefaultBlock); - if (DTU) { - SmallVector<DominatorTree::UpdateType, 2> Updates; - Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); - if (!is_contained(successors(BB), OrigDefaultBlock)) - Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); - DTU->applyUpdates(Updates); - } -} - BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, BasicBlock *UnwindEdge, DomTreeUpdater *DTU) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a92cb6a313d3..bb719a499a4c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -623,15 +623,13 @@ bool llvm::UnrollRuntimeLoopRemainder( if (!SE) return false; - // Only unroll loops with a computable trip count, and the trip count needs - // to be an int value (allowing a pointer type is a TODO item). + // Only unroll loops with a computable trip count. 
// We calculate the backedge count by using getExitCount on the Latch block, // which is proven to be the only exiting block in this loop. This is same as // calculating getBackedgeTakenCount on the loop (which computes SCEV for all // exiting blocks). const SCEV *BECountSC = SE->getExitCount(L, Latch); - if (isa<SCEVCouldNotCompute>(BECountSC) || - !BECountSC->getType()->isIntegerTy()) { + if (isa<SCEVCouldNotCompute>(BECountSC)) { LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index 68572d479742..c8e42acdffb3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, return Builder.CreateOrReduce(Src); case RecurKind::Xor: return Builder.CreateXorReduce(Src); + case RecurKind::FMulAdd: case RecurKind::FAdd: return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), Src); @@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B, Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { - assert(Desc.getRecurrenceKind() == RecurKind::FAdd && + assert((Desc.getRecurrenceKind() == RecurKind::FAdd || + Desc.getRecurrenceKind() == RecurKind::FMulAdd) && "Unexpected reduction kind"); assert(Src->getType()->isVectorTy() && "Expected a vector type"); assert(!Start->getType()->isVectorTy() && "Expected a scalar type"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 5893ce15b129..7d9992176658 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -446,6 +446,9 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (!shouldDelete(User)) + continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp new file mode 100644 index 000000000000..9495e442e0bf --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -0,0 +1,462 @@ +//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a profile inference algorithm. Given an incomplete and +// possibly imprecise block counts, the algorithm reconstructs realistic block +// and edge counts that satisfy flow conservation rules, while minimally modify +// input block counts. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SampleProfileInference.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> + +using namespace llvm; +#define DEBUG_TYPE "sample-profile-inference" + +namespace { + +/// A value indicating an infinite flow/capacity/weight of a block/edge. +/// Not using numeric_limits<int64_t>::max(), as the values can be summed up +/// during the execution. +static constexpr int64_t INF = ((int64_t)1) << 50; + +/// The minimum-cost maximum flow algorithm. +/// +/// The algorithm finds the maximum flow of minimum cost on a given (directed) +/// network using a modified version of the classical Moore-Bellman-Ford +/// approach. The algorithm applies a number of augmentation iterations in which +/// flow is sent along paths of positive capacity from the source to the sink. +/// The worst-case time complexity of the implementation is O(v(f)*m*n), where +/// where m is the number of edges, n is the number of vertices, and v(f) is the +/// value of the maximum flow. However, the observed running time on typical +/// instances is sub-quadratic, that is, o(n^2). +/// +/// The input is a set of edges with specified costs and capacities, and a pair +/// of nodes (source and sink). The output is the flow along each edge of the +/// minimum total cost respecting the given edge capacities. +class MinCostMaxFlow { +public: + // Initialize algorithm's data structures for a network of a given size. + void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) { + Source = SourceNode; + Target = SinkNode; + + Nodes = std::vector<Node>(NodeCount); + Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>()); + } + + // Run the algorithm. + int64_t run() { + // Find an augmenting path and update the flow along the path + size_t AugmentationIters = 0; + while (findAugmentingPath()) { + augmentFlowAlongPath(); + AugmentationIters++; + } + + // Compute the total flow and its cost + int64_t TotalCost = 0; + int64_t TotalFlow = 0; + for (uint64_t Src = 0; Src < Nodes.size(); Src++) { + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) { + TotalCost += Edge.Cost * Edge.Flow; + if (Src == Source) + TotalFlow += Edge.Flow; + } + } + } + LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters + << " iterations with " << TotalFlow << " total flow" + << " of " << TotalCost << " cost\n"); + (void)TotalFlow; + return TotalCost; + } + + /// Adding an edge to the network with a specified capacity and a cost. + /// Multiple edges between a pair of nodes are allowed but self-edges + /// are not supported. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) { + assert(Capacity > 0 && "adding an edge of zero capacity"); + assert(Src != Dst && "loop edge are not supported"); + + Edge SrcEdge; + SrcEdge.Dst = Dst; + SrcEdge.Cost = Cost; + SrcEdge.Capacity = Capacity; + SrcEdge.Flow = 0; + SrcEdge.RevEdgeIndex = Edges[Dst].size(); + + Edge DstEdge; + DstEdge.Dst = Src; + DstEdge.Cost = -Cost; + DstEdge.Capacity = 0; + DstEdge.Flow = 0; + DstEdge.RevEdgeIndex = Edges[Src].size(); + + Edges[Src].push_back(SrcEdge); + Edges[Dst].push_back(DstEdge); + } + + /// Adding an edge to the network of infinite capacity and a given cost. + void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) { + addEdge(Src, Dst, INF, Cost); + } + + /// Get the total flow from a given source node. + /// Returns a list of pairs (target node, amount of flow to the target). 
+ const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const { + std::vector<std::pair<uint64_t, int64_t>> Flow; + for (auto &Edge : Edges[Src]) { + if (Edge.Flow > 0) + Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow)); + } + return Flow; + } + + /// Get the total flow between a pair of nodes. + int64_t getFlow(uint64_t Src, uint64_t Dst) const { + int64_t Flow = 0; + for (auto &Edge : Edges[Src]) { + if (Edge.Dst == Dst) { + Flow += Edge.Flow; + } + } + return Flow; + } + + /// A cost of increasing a block's count by one. + static constexpr int64_t AuxCostInc = 10; + /// A cost of decreasing a block's count by one. + static constexpr int64_t AuxCostDec = 20; + /// A cost of increasing a count of zero-weight block by one. + static constexpr int64_t AuxCostIncZero = 11; + /// A cost of increasing the entry block's count by one. + static constexpr int64_t AuxCostIncEntry = 40; + /// A cost of decreasing the entry block's count by one. + static constexpr int64_t AuxCostDecEntry = 10; + /// A cost of taking an unlikely jump. + static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20; + +private: + /// Check for existence of an augmenting path with a positive capacity. + bool findAugmentingPath() { + // Initialize data structures + for (auto &Node : Nodes) { + Node.Distance = INF; + Node.ParentNode = uint64_t(-1); + Node.ParentEdgeIndex = uint64_t(-1); + Node.Taken = false; + } + + std::queue<uint64_t> Queue; + Queue.push(Source); + Nodes[Source].Distance = 0; + Nodes[Source].Taken = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + Nodes[Src].Taken = false; + // Although the residual network contains edges with negative costs + // (in particular, backward edges), it can be shown that there are no + // negative-weight cycles and the following two invariants are maintained: + // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V, + // where Dist is the length of the shortest path between two nodes. This + // allows to prune the search-space of the path-finding algorithm using + // the following early-stop criteria: + // -- If we find a path with zero-distance from Source to Target, stop the + // search, as the path is the shortest since Dist[Source, Target] >= 0; + // -- If we have Dist[Source, V] > Dist[Source, Target], then do not + // process node V, as it is guaranteed _not_ to be on a shortest path + // from Source to Target; it follows from inequalities + // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target] + // >= Dist[Source, V] + if (Nodes[Target].Distance == 0) + break; + if (Nodes[Src].Distance > Nodes[Target].Distance) + continue; + + // Process adjacent edges + for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) { + auto &Edge = Edges[Src][EdgeIdx]; + if (Edge.Flow < Edge.Capacity) { + uint64_t Dst = Edge.Dst; + int64_t NewDistance = Nodes[Src].Distance + Edge.Cost; + if (Nodes[Dst].Distance > NewDistance) { + // Update the distance and the parent node/edge + Nodes[Dst].Distance = NewDistance; + Nodes[Dst].ParentNode = Src; + Nodes[Dst].ParentEdgeIndex = EdgeIdx; + // Add the node to the queue, if it is not there yet + if (!Nodes[Dst].Taken) { + Queue.push(Dst); + Nodes[Dst].Taken = true; + } + } + } + } + } + + return Nodes[Target].Distance != INF; + } + + /// Update the current flow along the augmenting path. 
+ void augmentFlowAlongPath() { + // Find path capacity + int64_t PathCapacity = INF; + uint64_t Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow); + Now = Pred; + } + + assert(PathCapacity > 0 && "found incorrect augmenting path"); + + // Update the flow along the path + Now = Target; + while (Now != Source) { + uint64_t Pred = Nodes[Now].ParentNode; + auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex]; + auto &RevEdge = Edges[Now][Edge.RevEdgeIndex]; + + Edge.Flow += PathCapacity; + RevEdge.Flow -= PathCapacity; + + Now = Pred; + } + } + + /// An node in a flow network. + struct Node { + /// The cost of the cheapest path from the source to the current node. + int64_t Distance; + /// The node preceding the current one in the path. + uint64_t ParentNode; + /// The index of the edge between ParentNode and the current node. + uint64_t ParentEdgeIndex; + /// An indicator of whether the current node is in a queue. + bool Taken; + }; + /// An edge in a flow network. + struct Edge { + /// The cost of the edge. + int64_t Cost; + /// The capacity of the edge. + int64_t Capacity; + /// The current flow on the edge. + int64_t Flow; + /// The destination node of the edge. + uint64_t Dst; + /// The index of the reverse edge between Dst and the current node. + uint64_t RevEdgeIndex; + }; + + /// The set of network nodes. + std::vector<Node> Nodes; + /// The set of network edges. + std::vector<std::vector<Edge>> Edges; + /// Source node of the flow. + uint64_t Source; + /// Target (sink) node of the flow. + uint64_t Target; +}; + +/// Initializing flow network for a given function. +/// +/// Every block is split into three nodes that are responsible for (i) an +/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or +/// reduction of the block weight. +void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + assert(NumBlocks > 1 && "Too few blocks in a function"); + LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n"); + + // Pre-process data: make sure the entry weight is at least 1 + if (Func.Blocks[Func.Entry].Weight == 0) { + Func.Blocks[Func.Entry].Weight = 1; + } + // Introducing dummy source/sink pairs to allow flow circulation. + // The nodes corresponding to blocks of Func have indicies in the range + // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values. 
+ uint64_t S = 3 * NumBlocks; + uint64_t T = S + 1; + uint64_t S1 = S + 2; + uint64_t T1 = S + 3; + + Network.initialize(3 * NumBlocks + 4, S1, T1); + + // Create three nodes for every block of the function + for (uint64_t B = 0; B < NumBlocks; B++) { + auto &Block = Func.Blocks[B]; + assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) && + "non-zero weight of a block w/o weight except for an entry"); + + // Split every block into two nodes + uint64_t Bin = 3 * B; + uint64_t Bout = 3 * B + 1; + uint64_t Baux = 3 * B + 2; + if (Block.Weight > 0) { + Network.addEdge(S1, Bout, Block.Weight, 0); + Network.addEdge(Bin, T1, Block.Weight, 0); + } + + // Edges from S and to T + assert((!Block.isEntry() || !Block.isExit()) && + "a block cannot be an entry and an exit"); + if (Block.isEntry()) { + Network.addEdge(S, Bin, 0); + } else if (Block.isExit()) { + Network.addEdge(Bout, T, 0); + } + + // An auxiliary node to allow increase/reduction of block counts: + // We assume that decreasing block counts is more expensive than increasing, + // and thus, setting separate costs here. In the future we may want to tune + // the relative costs so as to maximize the quality of generated profiles. + int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc; + int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec; + if (Block.UnknownWeight) { + // Do not penalize changing weights of blocks w/o known profile count + AuxCostInc = 0; + AuxCostDec = 0; + } else { + // Increasing the count for "cold" blocks with zero initial count is more + // expensive than for "hot" ones + if (Block.Weight == 0) { + AuxCostInc = MinCostMaxFlow::AuxCostIncZero; + } + // Modifying the count of the entry block is expensive + if (Block.isEntry()) { + AuxCostInc = MinCostMaxFlow::AuxCostIncEntry; + AuxCostDec = MinCostMaxFlow::AuxCostDecEntry; + } + } + // For blocks with self-edges, do not penalize a reduction of the count, + // as all of the increase can be attributed to the self-edge + if (Block.HasSelfEdge) { + AuxCostDec = 0; + } + + Network.addEdge(Bin, Baux, AuxCostInc); + Network.addEdge(Baux, Bout, AuxCostInc); + if (Block.Weight > 0) { + Network.addEdge(Bout, Baux, AuxCostDec); + Network.addEdge(Baux, Bin, AuxCostDec); + } + } + + // Creating edges for every jump + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0; + Network.addEdge(SrcOut, DstIn, Cost); + } + } + + // Make sure we have a valid flow circulation + Network.addEdge(T, S, 0); +} + +/// Extract resulting block and edge counts from the flow network. 
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) { + uint64_t NumBlocks = Func.Blocks.size(); + + // Extract resulting block counts + for (uint64_t Src = 0; Src < NumBlocks; Src++) { + auto &Block = Func.Blocks[Src]; + uint64_t SrcOut = 3 * Src + 1; + int64_t Flow = 0; + for (auto &Adj : Network.getFlow(SrcOut)) { + uint64_t DstIn = Adj.first; + int64_t DstFlow = Adj.second; + bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2); + if (!IsAuxNode || Block.HasSelfEdge) { + Flow += DstFlow; + } + } + Block.Flow = Flow; + assert(Flow >= 0 && "negative block flow"); + } + + // Extract resulting jump counts + for (auto &Jump : Func.Jumps) { + uint64_t Src = Jump.Source; + uint64_t Dst = Jump.Target; + int64_t Flow = 0; + if (Src != Dst) { + uint64_t SrcOut = 3 * Src + 1; + uint64_t DstIn = 3 * Dst; + Flow = Network.getFlow(SrcOut, DstIn); + } else { + uint64_t SrcOut = 3 * Src + 1; + uint64_t SrcAux = 3 * Src + 2; + int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux); + if (AuxFlow > 0) + Flow = AuxFlow; + } + Jump.Flow = Flow; + assert(Flow >= 0 && "negative jump flow"); + } +} + +#ifndef NDEBUG +/// Verify that the computed flow values satisfy flow conservation rules +void verifyWeights(const FlowFunction &Func) { + const uint64_t NumBlocks = Func.Blocks.size(); + auto InFlow = std::vector<uint64_t>(NumBlocks, 0); + auto OutFlow = std::vector<uint64_t>(NumBlocks, 0); + for (auto &Jump : Func.Jumps) { + InFlow[Jump.Target] += Jump.Flow; + OutFlow[Jump.Source] += Jump.Flow; + } + + uint64_t TotalInFlow = 0; + uint64_t TotalOutFlow = 0; + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + if (Block.isEntry()) { + TotalInFlow += Block.Flow; + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + } else if (Block.isExit()) { + TotalOutFlow += Block.Flow; + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } else { + assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow"); + assert(Block.Flow == InFlow[I] && "incorrectly computed control flow"); + } + } + assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); +} +#endif + +} // end of anonymous namespace + +/// Apply the profile inference algorithm for a given flow function +void llvm::applyFlowInference(FlowFunction &Func) { + // Create and apply an inference network model + auto InferenceNetwork = MinCostMaxFlow(); + initializeNetwork(InferenceNetwork, Func); + InferenceNetwork.run(); + + // Extract flow values for every block and every edge + extractWeights(InferenceNetwork, Func); + +#ifndef NDEBUG + // Verify the result + verifyWeights(Func); +#endif +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp index 6d995cf4c048..ea0e8343eb88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp @@ -34,6 +34,10 @@ cl::opt<bool> NoWarnSampleUnused( cl::desc("Use this option to turn off/on warnings about function with " "samples but without debug information to use those samples. ")); +cl::opt<bool> SampleProfileUseProfi( + "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Use profi to infer block and edge counts.")); + namespace sampleprofutil { /// Return true if the given callsite is hot wrt to hot cutoff threshold. 
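The augmenting-path search in the new MinCostMaxFlow class is a queue-based Moore-Bellman-Ford relaxation over the residual network, with the two early-stop checks spelled out in its comments. A compact standalone sketch over a plain adjacency list, reusing the same Edge field names but none of the surrounding LLVM machinery, could look roughly like this:

#include <cstddef>
#include <cstdint>
#include <queue>
#include <vector>

static constexpr int64_t INF = int64_t(1) << 50; // same sentinel as above

struct Edge { int64_t Cost, Capacity, Flow; size_t Dst; };
struct NodeState { int64_t Distance; size_t ParentNode, ParentEdge; bool InQueue; };

// Returns true if a path of positive residual capacity from Source to Target
// exists; Nodes is left holding the parent links needed to augment along it.
bool findAugmentingPath(const std::vector<std::vector<Edge>> &Edges,
                        std::vector<NodeState> &Nodes, size_t Source,
                        size_t Target) {
  for (NodeState &N : Nodes)
    N = {INF, size_t(-1), size_t(-1), false};

  std::queue<size_t> Queue;
  Nodes[Source].Distance = 0;
  Nodes[Source].InQueue = true;
  Queue.push(Source);

  while (!Queue.empty()) {
    size_t Src = Queue.front();
    Queue.pop();
    Nodes[Src].InQueue = false;

    // Early-stop pruning: a zero-length path to Target is already shortest,
    // and nodes strictly farther than Target cannot lie on a shortest path.
    if (Nodes[Target].Distance == 0)
      break;
    if (Nodes[Src].Distance > Nodes[Target].Distance)
      continue;

    for (size_t I = 0; I < Edges[Src].size(); ++I) {
      const Edge &E = Edges[Src][I];
      if (E.Flow >= E.Capacity)
        continue; // no residual capacity on this edge
      int64_t NewDist = Nodes[Src].Distance + E.Cost;
      if (NewDist < Nodes[E.Dst].Distance) {
        Nodes[E.Dst].Distance = NewDist;
        Nodes[E.Dst].ParentNode = Src;
        Nodes[E.Dst].ParentEdge = I;
        if (!Nodes[E.Dst].InQueue) {
          Nodes[E.Dst].InQueue = true;
          Queue.push(E.Dst);
        }
      }
    }
  }
  return Nodes[Target].Distance != INF;
}

Each call costs O(n*m) in the worst case, which with one call per unit of augmented flow matches the O(v(f)*m*n) bound quoted in the patch, although typical profile-inference instances converge much faster.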
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index a042146d7ace..71c15d5c51fc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -1833,22 +1834,6 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { return V; } -/// Check whether value has nuw/nsw/exact set but SCEV does not. -/// TODO: In reality it is better to check the poison recursively -/// but this is better than nothing. -static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) { - if (isa<OverflowingBinaryOperator>(I)) { - if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) { - if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap()) - return true; - if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap()) - return true; - } - } else if (isa<PossiblyExactOperator>(I) && I->isExact()) - return true; - return false; -} - ScalarEvolution::ValueOffsetPair SCEVExpander::FindValueInExprValueMap(const SCEV *S, const Instruction *InsertPt) { @@ -1872,8 +1857,7 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S, if (S->getType() == V->getType() && SE.DT.dominates(EntInst, InsertPt) && (SE.LI.getLoopFor(EntInst->getParent()) == nullptr || - SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) && - !SCEVLostPoisonFlags(S, EntInst)) + SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt))) return {V, Offset}; } } @@ -1952,26 +1936,36 @@ Value *SCEVExpander::expand(const SCEV *S) { if (!V) V = visit(S); - else if (VO.second) { - if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { - Type *Ety = Vty->getPointerElementType(); - int64_t Offset = VO.second->getSExtValue(); - int64_t ESize = SE.getTypeSizeInBits(Ety); - if ((Offset * 8) % ESize == 0) { - ConstantInt *Idx = + else { + // If we're reusing an existing instruction, we are effectively CSEing two + // copies of the instruction (with potentially different flags). As such, + // we need to drop any poison generating flags unless we can prove that + // said flags must be valid for all new users. 
+ if (auto *I = dyn_cast<Instruction>(V)) + if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)) + I->dropPoisonGeneratingFlags(); + + if (VO.second) { + if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) { + Type *Ety = Vty->getPointerElementType(); + int64_t Offset = VO.second->getSExtValue(); + int64_t ESize = SE.getTypeSizeInBits(Ety); + if ((Offset * 8) % ESize == 0) { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize); - V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); - } else { - ConstantInt *Idx = + V = Builder.CreateGEP(Ety, V, Idx, "scevgep"); + } else { + ConstantInt *Idx = ConstantInt::getSigned(VO.second->getType(), -Offset); - unsigned AS = Vty->getAddressSpace(); - V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); - V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, - "uglygep"); - V = Builder.CreateBitCast(V, Vty); + unsigned AS = Vty->getAddressSpace(); + V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS)); + V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx, + "uglygep"); + V = Builder.CreateBitCast(V, Vty); + } + } else { + V = Builder.CreateSub(V, VO.second); } - } else { - V = Builder.CreateSub(V, VO.second); } } // Remember the expanded value for this SCEV at this location. @@ -2180,7 +2174,9 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At, } // Use expand's logic which is used for reusing a previous Value in - // ExprValueMap. + // ExprValueMap. Note that we don't currently model the cost of + // needing to drop poison generating flags on the instruction if we + // want to reuse it. We effectively assume that has zero cost. ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At); if (VO.first) return VO; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f467de5f924e..afa3ecde77f9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3936,7 +3936,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, BasicBlock *KeepEdge1 = TrueBB; BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; - SmallPtrSet<BasicBlock *, 2> RemovedSuccessors; + SmallSetVector<BasicBlock *, 2> RemovedSuccessors; // Then remove the rest. 
for (BasicBlock *Succ : successors(OldTerm)) { @@ -4782,6 +4782,26 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch, + DomTreeUpdater *DTU) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + auto *BB = Switch->getParent(); + auto *OrigDefaultBlock = Switch->getDefaultDest(); + OrigDefaultBlock->removePredecessor(BB); + BasicBlock *NewDefaultBlock = BasicBlock::Create( + BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), + OrigDefaultBlock); + new UnreachableInst(Switch->getContext(), NewDefaultBlock); + Switch->setDefaultDest(&*NewDefaultBlock); + if (DTU) { + SmallVector<DominatorTree::UpdateType, 2> Updates; + Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock}); + if (!is_contained(successors(BB), OrigDefaultBlock)) + Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock}); + DTU->applyUpdates(Updates); + } +} + /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, @@ -4927,10 +4947,14 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, // Gather dead cases. SmallVector<ConstantInt *, 8> DeadCases; SmallDenseMap<BasicBlock *, int, 8> NumPerSuccessorCases; + SmallVector<BasicBlock *, 8> UniqueSuccessors; for (auto &Case : SI->cases()) { auto *Successor = Case.getCaseSuccessor(); - if (DTU) + if (DTU) { + if (!NumPerSuccessorCases.count(Successor)) + UniqueSuccessors.push_back(Successor); ++NumPerSuccessorCases[Successor]; + } const APInt &CaseVal = Case.getCaseValue()->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) { @@ -4973,9 +4997,9 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, if (DTU) { std::vector<DominatorTree::UpdateType> Updates; - for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases) - if (I.second == 0) - Updates.push_back({DominatorTree::Delete, SI->getParent(), I.first}); + for (auto *Successor : UniqueSuccessors) + if (NumPerSuccessorCases[Successor] == 0) + Updates.push_back({DominatorTree::Delete, SI->getParent(), Successor}); DTU->applyUpdates(Updates); } @@ -6040,15 +6064,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, if (Succ == SI->getDefaultDest()) continue; Succ->removePredecessor(BB); - RemovedSuccessors.insert(Succ); + if (DTU && RemovedSuccessors.insert(Succ).second) + Updates.push_back({DominatorTree::Delete, BB, Succ}); } SI->eraseFromParent(); - if (DTU) { - for (BasicBlock *RemovedSuccessor : RemovedSuccessors) - Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor}); + if (DTU) DTU->applyUpdates(Updates); - } ++NumLookupTables; if (NeedMask) @@ -6215,7 +6237,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) { // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; - SmallPtrSet<BasicBlock *, 8> RemovedSuccs; + SmallSetVector<BasicBlock *, 8> RemovedSuccs; for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { BasicBlock *Dest = IBI->getDestination(i); if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { @@ -6305,8 +6327,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, // We've found an identical block. Update our predecessors to take that // path instead and make ourselves dead. 
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); - for (BasicBlock *Pred : Preds) { + SmallSetVector<BasicBlock *, 16> UniquePreds(pred_begin(BB), pred_end(BB)); + for (BasicBlock *Pred : UniquePreds) { InvokeInst *II = cast<InvokeInst>(Pred->getTerminator()); assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && "unexpected successor"); @@ -6323,8 +6345,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, if (isa<DbgInfoIntrinsic>(Inst)) Inst.eraseFromParent(); - SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB)); - for (BasicBlock *Succ : Succs) { + SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB)); + for (BasicBlock *Succ : UniqueSuccs) { Succ->removePredecessor(BB); if (DTU) Updates.push_back({DominatorTree::Delete, BB, Succ}); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23bb6f0860c9..5ca0adb4242c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -473,18 +473,10 @@ public: /// handle the more complex control flow around the loops. virtual BasicBlock *createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, - VPTransformState &State); - /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); - /// Widen a single select instruction within the innermost loop. - void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, - bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); @@ -496,12 +488,6 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; - /// Vectorize a single GetElementPtrInst based on information gathered and - /// decisions taken during planning. - void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, - unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); - /// Vectorize a single first-order recurrence or pointer induction PHINode in /// a block. This method handles the induction variable canonicalization. It /// supports both VF = 1 for unrolled loops and arbitrary length vectors. @@ -511,9 +497,9 @@ public: /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive. Uses the VPValue operands from \p Operands instead of \p + /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. 
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, + void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -538,15 +524,6 @@ public: ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask = nullptr); - /// Vectorize Load and Store instructions with the base address given in \p - /// Addr, optionally masking the vector operations if \p BlockInMask is - /// non-null. Use \p State to translate given VPValues to IR values in the - /// vectorized loop. - void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, - VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, - bool ConsecutiveStride, bool Reverse); - /// Set the debug location in the builder \p Ptr using the debug location in /// \p V. If \p Ptr is None then it uses the class member's Builder. void setDebugLocFromInst(const Value *V, @@ -566,6 +543,17 @@ public: /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, Instruction *From); + + /// Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(ArrayRef<Value *> To, Instruction *From); + protected: friend class LoopVectorizationPlanner; @@ -741,16 +729,16 @@ protected: /// vector loop. void addNewMetadata(Instruction *To, const Instruction *Orig); - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Instruction *To, Instruction *From); - - /// Similar to the previous function but it adds the metadata to a - /// vector of instructions. - void addMetadata(ArrayRef<Value *> To, Instruction *From); + /// Collect poison-generating recipes that may generate a poison value that is + /// used after vectorization, even when their operands are not poison. Those + /// recipes meet the following conditions: + /// * Contribute to the address computation of a recipe generating a widen + /// memory load/store (VPWidenMemoryInstructionRecipe or + /// VPInterleaveRecipe). + /// * Such a widen memory load/store has at least one underlying Instruction + /// that is in a basic block that needs predication and after vectorization + /// the generated instruction won't be predicated. + void collectPoisonGeneratingRecipes(VPTransformState &State); /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. @@ -1173,6 +1161,84 @@ void InnerLoopVectorizer::addNewMetadata(Instruction *To, LVer->annotateInstWithNoAlias(To, Orig); } +void InnerLoopVectorizer::collectPoisonGeneratingRecipes( + VPTransformState &State) { + + // Collect recipes in the backward slice of `Root` that may generate a poison + // value that is used after vectorization. + SmallPtrSet<VPRecipeBase *, 16> Visited; + auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { + SmallVector<VPRecipeBase *, 16> Worklist; + Worklist.push_back(Root); + + // Traverse the backward slice of Root through its use-def chain. 
+ while (!Worklist.empty()) { + VPRecipeBase *CurRec = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(CurRec).second) + continue; + + // Prune search if we find another recipe generating a widen memory + // instruction. Widen memory instructions involved in address computation + // will lead to gather/scatter instructions, which don't need to be + // handled. + if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || + isa<VPInterleaveRecipe>(CurRec)) + continue; + + // This recipe contributes to the address computation of a widen + // load/store. Collect recipe if its underlying instruction has + // poison-generating flags. + Instruction *Instr = CurRec->getUnderlyingInstr(); + if (Instr && Instr->hasPoisonGeneratingFlags()) + State.MayGeneratePoisonRecipes.insert(CurRec); + + // Add new definitions to the worklist. + for (VPValue *operand : CurRec->operands()) + if (VPDef *OpDef = operand->getDef()) + Worklist.push_back(cast<VPRecipeBase>(OpDef)); + } + }); + + // Traverse all the recipes in the VPlan and collect the poison-generating + // recipes in the backward slice starting at the address of a VPWidenRecipe or + // VPInterleaveRecipe. + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { + for (VPRecipeBase &Recipe : *VPBB) { + if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { + Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); + VPDef *AddrDef = WidenRec->getAddr()->getDef(); + if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && + Legal->blockNeedsPredication(UnderlyingInstr->getParent())) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { + VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + if (AddrDef) { + // Check if any member of the interleave group needs predication. + const InterleaveGroup<Instruction> *InterGroup = + InterleaveRec->getInterleaveGroup(); + bool NeedPredication = false; + for (int I = 0, NumMembers = InterGroup->getNumMembers(); + I < NumMembers; ++I) { + Instruction *Member = InterGroup->getMember(I); + if (Member) + NeedPredication |= + Legal->blockNeedsPredication(Member->getParent()); + } + + if (NeedPredication) + collectPoisonGeneratingInstrsInBackwardSlice( + cast<VPRecipeBase>(AddrDef)); + } + } + } + } +} + void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) { propagateMetadata(To, From); @@ -1541,7 +1607,16 @@ public: // Returns true if \p I is an instruction that will be predicated either // through scalar predication or masked load/store or masked gather/scatter. // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { + bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { + // When we know the load is uniform and the original scalar loop was not + // predicated we don't need to mark it as a predicated instruction. Any + // vectorised blocks created when tail-folding are something artificial we + // have introduced and we know there is always at least one active lane. + // That's why we call Legal->blockNeedsPredication here because it doesn't + // query tail-folding. 
+ if (IsKnownUniform && isa<LoadInst>(I) && + !Legal->blockNeedsPredication(I->getParent())) + return false; if (!blockNeedsPredicationForAnyReason(I->getParent())) return false; // Loads and stores that need some form of masked operation are predicated @@ -1816,9 +1891,11 @@ private: /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized - /// during vectorization. Non-uniform scalarized instructions will be - /// represented by VF values in the vectorized loop, each corresponding to an - /// iteration of the original scalar loop. + /// during vectorization. collectLoopScalars should only add non-uniform nodes + /// to the list if they are used by a load/store instruction that is marked as + /// CM_Scalarize. Non-uniform scalarized instructions will be represented by + /// VF values in the vectorized loop, each corresponding to an iteration of + /// the original scalar loop. void collectLoopScalars(ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. @@ -2918,132 +2995,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::vectorizeMemoryInstruction( - Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, - bool Reverse) { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Instr); - StoreInst *SI = dyn_cast<StoreInst>(Instr); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); - - Type *ScalarDataTy = getLoadStoreType(Instr); - - auto *DataTy = VectorType::get(ScalarDataTy, VF); - const Align Alignment = getLoadStoreAlignment(Instr); - bool CreateGatherScatter = !ConsecutiveStride; - - VectorParts BlockInMaskParts(UF); - bool isMaskRequired = BlockInMask; - if (isMaskRequired) - for (unsigned Part = 0; Part < UF; ++Part) - BlockInMaskParts[Part] = State.get(BlockInMask, Part); - - const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { - // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - // RunTimeVF = VScale * VF.getKnownMinValue() - // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); - // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); - PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); - } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); - } - - unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); - return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - }; - - // Handle Stores: - if (SI) { - setDebugLocFromInst(SI); - - for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = reverseVector(StoredVal); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). - } - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - setDebugLocFromInst(LI); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(Addr, Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - addMetadata(NewLI, LI); - } else { - auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); - - // Add metadata to the load, but setVectorValue to the reverse shuffle. - addMetadata(NewLI, LI); - if (Reverse) - NewLI = reverseVector(NewLI); - } - - State.set(Def, NewLI, Part); - } -} - -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, - VPUser &User, +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State) { @@ -3064,17 +3017,26 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); + // If the scalarized instruction contributes to the address computation of a + // widen masked load/store which was in a basic block that needed predication + // and is not predicated after vectorization, we can't propagate + // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized + // instruction could feed a poison value to the base address of the widen + // load/store. 
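The comment above, and the MayGeneratePoisonRecipes check that follows on the cloned instruction, are the codegen half of collectPoisonGeneratingRecipes. A small illustration (made-up data, plain C++) of why the flags cannot survive linearization: under the original control flow the guarded lane never computed its address, but after vectorization every lane's index is materialized, even for masked-out lanes.

#include <cstdio>

int main() {
  int a[4] = {10, 20, 30, 40};
  bool mask[4] = {true, true, true, false}; // lane 3 was guarded in the scalar loop
  long idx[4];
  for (int lane = 0; lane < 4; ++lane)
    idx[lane] = lane + 1;                   // lane 3 computes index 4: out of bounds
  for (int lane = 0; lane < 4; ++lane)
    if (mask[lane])                         // the masked load touches active lanes only
      std::printf("lane %d -> %d\n", lane, a[idx[lane]]);
  // The index for lane 3 now exists even though it is never dereferenced;
  // keeping an 'inbounds'/'nsw'-style flag on that computation would assert a
  // fact that no longer holds, which is exactly what the dropped flags avoid.
}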
+ if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + Cloned->dropPoisonGeneratingFlags(); + State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { + for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); auto InputInstance = Instance; if (!Operand || !OrigLoop->contains(Operand) || (Cost->isUniformAfterVectorization(Operand, State.VF))) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(User.getOperand(op), InputInstance); + auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3082,7 +3044,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, // Place the cloned scalar in the new loop. Builder.Insert(Cloned); - State.set(Def, Cloned, Instance); + State.set(RepRecipe, Cloned, Instance); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<AssumeInst>(Cloned)) @@ -4615,77 +4577,6 @@ bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, - VPUser &Operands, unsigned UF, - ElementCount VF, bool IsPtrLoopInvariant, - SmallBitVector &IsIndexLoopInvariant, - VPTransformState &State) { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - State.set(VPDef, EntryPart, Part); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. 
- auto *Ptr = IsPtrLoopInvariant - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : State.get(Operands.getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { - VPValue *Operand = Operands.getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(VPDef, NewGEP, Part); - addMetadata(NewGEP, GEP); - } - } -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, VPTransformState &State) { @@ -4745,38 +4636,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); - unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); - - bool NeedsVectorIndex = !IsUniform && VF.isScalable(); - Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; - if (NeedsVectorIndex) { - Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); - UnitStepVec = Builder.CreateStepVector(VecIVTy); - PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); - } + assert((IsUniform || !State.VF.isScalable()) && + "Cannot scalarize a scalable VF"); + unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); for (unsigned Part = 0; Part < UF; ++Part) { Value *PartStart = createStepForVF(Builder, PtrInd->getType(), VF, Part); - if (NeedsVectorIndex) { - // Here we cache the whole vector, which means we can support the - // extraction of any lane. However, in some cases the extractelement - // instruction that is generated for scalar uses of this vector (e.g. - // a load instruction) is not folded away. Therefore we still - // calculate values for the first n lanes to avoid redundant moves - // (when extracting the 0th element) and to produce scalar code (i.e. - // additional add/gep instructions instead of expensive extractelement - // instructions) when extracting higher-order elements. 
- Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); - Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); - Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); - SclrGep->setName("next.gep"); - State.set(PhiR, SclrGep, Part); - } - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); @@ -4858,114 +4725,6 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, - VPUser &User, - VPTransformState &State) { - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : User.operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) - VecOp->copyIRFlags(&I); - - // Use this vector value for all users of the original instruction. - State.set(Def, V, Part); - addMetadata(V, &I); - } - - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *B = State.get(User.getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(Def, C, Part); - addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = - (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *A = State.get(User.getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(Def, Cast, Part); - addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. 
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State) { @@ -5039,31 +4798,6 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, } } -void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, - VPUser &Operands, - bool InvariantCond, - VPTransformState &State) { - setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = InvariantCond - ? State.get(Operands.getOperand(0), VPIteration(0, 0)) - : nullptr; - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *Cond = - InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); - Value *Op0 = State.get(Operands.getOperand(1), Part); - Value *Op1 = State.get(Operands.getOperand(2), Part); - Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); - State.set(VPDef, Sel, Part); - addMetadata(Sel, &I); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5103,38 +4837,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { !TheLoop->isLoopInvariant(V); }; - auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { - if (!isa<PHINode>(Ptr) || - !Legal->getInductionVars().count(cast<PHINode>(Ptr))) - return false; - auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; - if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - return isScalarUse(MemAccess, Ptr); - }; - - // A helper that evaluates a memory access's use of a pointer. If the - // pointer is actually the pointer induction of a loop, it is being - // inserted into Worklist. If the use will be a scalar use, and the - // pointer is only used by memory accesses, we place the pointer in - // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. + // A helper that evaluates a memory access's use of a pointer. If the use will + // be a scalar use and the pointer is only used by memory accesses, we place + // the pointer in ScalarPtrs. Otherwise, the pointer is placed in + // PossibleNonScalarPtrs. auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { - if (isScalarPtrInduction(MemAccess, Ptr)) { - Worklist.insert(cast<Instruction>(Ptr)); - LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr - << "\n"); - - Instruction *Update = cast<Instruction>( - cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); - - // If there is more than one user of Update (Ptr), we shouldn't assume it - // will be scalar after vectorisation as other users of the instruction - // may require widening. Otherwise, add it to ScalarPtrs. - if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { - ScalarPtrs.insert(Update); - return; - } - } // We only care about bitcast and getelementptr instructions contained in // the loop. 
if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5226,11 +4933,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; + // Returns true if \p Indvar is a pointer induction that is used directly by + // load/store instruction \p I. + auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, + Instruction *I) { + return Induction.second.getKind() == + InductionDescriptor::IK_PtrInduction && + (isa<LoadInst>(I) || isa<StoreInst>(I)) && + Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + }; + // Determine if all users of the induction variable are scalar after // vectorization. auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); + return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(Ind, I); }); if (!ScalarInd) continue; @@ -5240,7 +4958,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { auto ScalarIndUpdate = llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return I == Ind || !TheLoop->contains(I) || Worklist.count(I); + return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || + IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); }); if (!ScalarIndUpdate) continue; @@ -7079,6 +6798,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); + // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` + // that it is being called from this specific place. // Figure out whether the access is strided and get the stride value // if it's known in compile time @@ -7286,6 +7007,12 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For a call to the llvm.fmuladd intrinsic we need to add the cost of a + // normal fmul instruction to the cost of the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += + TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); + // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. 
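The cost-model change above, together with the adjustRecipesForReductions hunk further down that splices in an FMul VPInstruction, treats an in-loop fmuladd reduction as a widened fmul feeding an fadd reduction. A hedged scalar-level sketch of that decomposition in plain C++ (no LLVM types; VF and the data are made up, and reassociation of the adds is assumed to be allowed, as in the unordered-reduction case):

#include <cstdio>

int main() {
  const int N = 8, VF = 4;
  float a[N] = {1, 2, 3, 4, 5, 6, 7, 8};
  float b[N] = {8, 7, 6, 5, 4, 3, 2, 1};

  // Scalar loop: one fmuladd per iteration.
  float scalarAcc = 0.0f;
  for (int i = 0; i < N; ++i)
    scalarAcc = a[i] * b[i] + scalarAcc;

  // Vectorized form: widen the multiply, then reduce the products into the
  // running accumulator (the FMul recipe + VPReductionRecipe split).
  float vecAcc = 0.0f;
  for (int i = 0; i < N; i += VF) {
    float prod[VF];
    for (int l = 0; l < VF; ++l)
      prod[l] = a[i + l] * b[i + l];   // widened fmul
    for (int l = 0; l < VF; ++l)
      vecAcc += prod[l];               // fadd reduction of this part
  }

  std::printf("scalar = %f, vectorized = %f\n", scalarAcc, vecAcc);
}

This is also why the cost model above adds one vector FMul to the fadd-reduction base cost: the fused scalar operation is split into two vector operations.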
@@ -7962,6 +7689,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { + if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) + if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) + return *RedCost; bool NeedToScalarize; CallInst *CI = cast<CallInst>(I); InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); @@ -8260,6 +7990,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + ILV.collectPoisonGeneratingRecipes(State); ILV.printDebugTracesAtStart(); @@ -8468,7 +8199,8 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "intermediate fn:\n" + << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -8666,7 +8398,7 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { DEBUG_WITH_TYPE(VerboseDebug, { - dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; + dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; }); } @@ -9052,7 +8784,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9354,7 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( if (VPBB) VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); else { - Plan->setEntry(FirstVPBBForBB); + auto *TopRegion = new VPRegionBlock("vector loop"); + TopRegion->setEntry(FirstVPBBForBB); + Plan->setEntry(TopRegion); HeaderVPBB = FirstVPBBForBB; } VPBB = FirstVPBBForBB; @@ -9426,9 +9161,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - assert(isa<VPBasicBlock>(Plan->getEntry()) && + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && - "entry block must be set to a non-empty VPBasicBlock"); + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9653,12 +9390,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( unsigned FirstOpId; assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && "Only min/max recurrences allowed for inloop reductions"); + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd); + assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && + "Expected instruction to be a call to the llvm.fmuladd intrinsic"); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { assert(isa<VPWidenSelectRecipe>(WidenRecipe) && "Expected to replace a VPWidenSelectSC"); FirstOpId = 1; } else { - assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && + assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || + (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && "Expected to replace a VPWidenSC"); FirstOpId = 0; } @@ -9669,8 +9411,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *CondOp = CM.foldTailByMasking() ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + + if (IsFMulAdd) { + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe to use as the vector operand for the + // fadd reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); + FMulRecipe->setFastMathFlags(R->getFastMathFlags()); + WidenRecipe->getParent()->insert(FMulRecipe, + WidenRecipe->getIterator()); + VecOp = FMulRecipe; + } + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9744,18 +9498,218 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { } void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), - this, *this, InvariantCond, State); + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.ILV->setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.ILV->addMetadata(Sel, &I); + } } void VPWidenRecipe::execute(VPTransformState &State) { - State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.ILV->setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.count(this) > 0) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.ILV->addMetadata(V, &I); + } + + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.ILV->setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.ILV->addMetadata(C, &I); + } + + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.ILV->setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.ILV->addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. } void VPWidenGEPRecipe::execute(VPTransformState &State) { - State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, - *this, State.UF, State.VF, IsPtrLoopInvariant, - IsIndexLoopInvariant, State); + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.ILV->addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. 
The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = IsInBounds + ? State.Builder.CreateInBoundsGEP( + GEP->getSourceElementType(), Ptr, Indices) + : State.Builder.CreateGEP(GEP->getSourceElementType(), + Ptr, Indices); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.ILV->addMetadata(NewGEP, GEP); + } + } } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { @@ -9867,8 +9821,8 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, + IsPredicated, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9891,7 +9845,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, + State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, VPIteration(Part, Lane), IsPredicated, State); } @@ -9970,9 +9924,129 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - State.ILV->vectorizeMemoryInstruction( - &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), - StoredValue, getMask(), Consecutive, Reverse); + + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); + StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); + + assert((LI || SI) && "Invalid Load/Store instruction"); + assert((!SI || StoredValue) && "No stored value provided for widened store"); + assert((!LI || !StoredValue) && "Stored value provided for widened load"); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGatherScatter = !Consecutive; + + auto &Builder = State.Builder; + InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); + bool isMaskRequired = getMask(); + if (isMaskRequired) + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = State.get(getMask(), Part); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + PartPtr = + cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + BlockInMaskParts[Part] = + Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); + } else { + Value *Increment = + createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); + PartPtr->setIsInBounds(InBounds); + } + + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + + // Handle Stores: + if (SI) { + State.ILV->setDebugLocFromInst(SI); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, Part); + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } else { + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, + BlockInMaskParts[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.ILV->addMetadata(NewSI, SI); + } + return; + } + + // Handle loads. + assert(LI && "Must have a load instruction"); + State.ILV->setDebugLocFromInst(LI); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + if (CreateGatherScatter) { + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(getAddr(), Part); + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + State.ILV->addMetadata(NewLI, LI); + } else { + auto *VecPtr = + CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); + if (isMaskRequired) + NewLI = Builder.CreateMaskedLoad( + DataTy, VecPtr, Alignment, BlockInMaskParts[Part], + PoisonValue::get(DataTy), "wide.masked.load"); + else + NewLI = + Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + + // Add metadata to the load, but setVectorValue to the reverse shuffle. 
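The CreateVecPtr lambda above computes, for a consecutive-but-reversed access, the part pointer as Ptr + (-Part * RunTimeVF) + (1 - RunTimeVF), performs one contiguous wide access of VF elements from there, and then reverses the vector. A worked example with concrete numbers (fixed-width VF = 4, so RunTimeVF = 4, and UF = 2; the offsets are relative to the scalar pointer for the current iteration):

#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int NumElt = -Part * VF; // -Part * RunTimeVF
    int LastLane = 1 - VF;   // 1 - RunTimeVF
    int Start = NumElt + LastLane;
    std::printf("part %d: wide access covers Ptr[%d..%d], reversed to lanes "
                "Ptr[%d], Ptr[%d], Ptr[%d], Ptr[%d]\n",
                Part, Start, Start + VF - 1,
                Start + VF - 1, Start + VF - 2, Start + VF - 3, Start);
  }
}

For part 0 this covers Ptr[-3..0] and the reverse yields lanes Ptr[0], Ptr[-1], Ptr[-2], Ptr[-3], matching a scalar loop that walks the array backwards; part 1 continues at Ptr[-7..-4].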
+ State.ILV->addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + } + + State.set(getVPSingleValue(), NewLI, Part); + } } // Determine how to lower the scalar epilogue, which depends on 1) optimising diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3ef0b794f68..95061e9053fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -283,6 +283,26 @@ static bool isCommutative(Instruction *I) { return false; } +/// Checks if the given value is actually an undefined constant vector. +static bool isUndefVector(const Value *V) { + if (isa<UndefValue>(V)) + return true; + auto *C = dyn_cast<Constant>(V); + if (!C) + return false; + if (!C->containsUndefOrPoisonElement()) + return false; + auto *VecTy = dyn_cast<FixedVectorType>(C->getType()); + if (!VecTy) + return false; + for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { + if (Constant *Elem = C->getAggregateElement(I)) + if (!isa<UndefValue>(Elem)) + return false; + } + return true; +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -327,7 +347,11 @@ static bool isCommutative(Instruction *I) { /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { - auto *EI0 = cast<ExtractElementInst>(VL[0]); + const auto *It = + find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); }); + if (It == VL.end()) + return None; + auto *EI0 = cast<ExtractElementInst>(*It); if (isa<ScalableVectorType>(EI0->getVectorOperandType())) return None; unsigned Size = @@ -336,33 +360,41 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { Value *Vec2 = nullptr; enum ShuffleMode { Unknown, Select, Permute }; ShuffleMode CommonShuffleMode = Unknown; + Mask.assign(VL.size(), UndefMaskElem); for (unsigned I = 0, E = VL.size(); I < E; ++I) { + // Undef can be represented as an undef element in a vector. + if (isa<UndefValue>(VL[I])) + continue; auto *EI = cast<ExtractElementInst>(VL[I]); + if (isa<ScalableVectorType>(EI->getVectorOperandType())) + return None; auto *Vec = EI->getVectorOperand(); + // We can extractelement from undef or poison vector. + if (isUndefVector(Vec)) + continue; // All vector operands must have the same number of vector elements. if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) return None; + if (isa<UndefValue>(EI->getIndexOperand())) + continue; auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) { - Mask.push_back(UndefMaskElem); + if (Idx->getValue().uge(Size)) continue; - } unsigned IntIdx = Idx->getValue().getZExtValue(); - Mask.push_back(IntIdx); - // We can extractelement from undef or poison vector. - if (isa<UndefValue>(Vec)) - continue; + Mask[I] = IntIdx; // For correct shuffling we have to have at most 2 different vector operands // in all extractelement instructions. 
- if (!Vec1 || Vec1 == Vec) + if (!Vec1 || Vec1 == Vec) { Vec1 = Vec; - else if (!Vec2 || Vec2 == Vec) + } else if (!Vec2 || Vec2 == Vec) { Vec2 = Vec; - else + Mask[I] += Size; + } else { return None; + } if (CommonShuffleMode == Permute) continue; // If the extract index is not the same as the operation number, it is a @@ -1680,6 +1712,28 @@ private: return IsSame(Scalars, ReuseShuffleIndices); } + /// \returns true if current entry has same operands as \p TE. + bool hasEqualOperands(const TreeEntry &TE) const { + if (TE.getNumOperands() != getNumOperands()) + return false; + SmallBitVector Used(getNumOperands()); + for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { + unsigned PrevCount = Used.count(); + for (unsigned K = 0; K < E; ++K) { + if (Used.test(K)) + continue; + if (getOperand(K) == TE.getOperand(I)) { + Used.set(K); + break; + } + } + // Check if we actually found the matching operand. + if (PrevCount == Used.count()) + return false; + } + return true; + } + /// \return Final vectorization factor for the node. Defined by the total /// number of vectorized scalars, including those, used several times in the /// entry and counted in the \a ReuseShuffleIndices, if any. @@ -1773,6 +1827,12 @@ private: return Operands[OpIdx]; } + /// \returns the \p OpIdx operand of this TreeEntry. + ArrayRef<Value *> getOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + /// \returns the number of operands. unsigned getNumOperands() const { return Operands.size(); } @@ -2078,7 +2138,7 @@ private: SmallPtrSet<const Value *, 32> EphValues; /// Holds all of the instructions that we gathered. - SetVector<Instruction *> GatherSeq; + SetVector<Instruction *> GatherShuffleSeq; /// A list of blocks that we are going to CSE. SetVector<BasicBlock *> CSEBlocks; @@ -4386,15 +4446,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, bool IsGather) { DenseMap<Value *, int> ExtractVectorsTys; for (auto *V : VL) { + if (isa<UndefValue>(V)) + continue; // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || - (IsGather && ScalarToTreeEntry.count(V))) + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) continue; auto *EE = cast<ExtractElementInst>(V); - unsigned Idx = *getExtractIndex(EE); + Optional<unsigned> EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; if (TTIRef.getNumberOfParts(VecTy) != TTIRef.getNumberOfParts(EE->getVectorOperandType())) { auto It = @@ -4426,6 +4490,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, for (const auto &Data : ExtractVectorsTys) { auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); unsigned NumElts = VecTy->getNumElements(); + if (Data.second % NumElts == 0) + continue; if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { unsigned Idx = (Data.second / NumElts) * NumElts; unsigned EENumElts = EEVTy->getNumElements(); @@ -4488,10 +4554,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. 
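hasEqualOperands in the hunk above compares two tree entries' operand lists as multisets: every operand of the other entry must be paired with a distinct, not-yet-used operand of this entry, tracked with a bit vector. A hedged standalone sketch of that matching, with std::vector<std::string> standing in for the operand lists and a std::vector<bool> for the SmallBitVector:

#include <cstdio>
#include <string>
#include <vector>

static bool hasEqualOperands(const std::vector<std::string> &A,
                             const std::vector<std::string> &B) {
  if (A.size() != B.size())
    return false;
  std::vector<bool> Used(A.size(), false);
  for (const std::string &Op : B) {
    bool Matched = false;
    for (size_t K = 0; K < A.size(); ++K) {
      if (Used[K] || A[K] != Op)
        continue;
      Used[K] = true; // consume this operand exactly once
      Matched = true;
      break;
    }
    if (!Matched)
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", hasEqualOperands({"x", "y", "x"}, {"y", "x", "x"})); // 1
  std::printf("%d\n", hasEqualOperands({"x", "y", "y"}, {"y", "x", "x"})); // 0
}

Order does not matter but multiplicity does, which is what lets the "diamond match for alternate node" check further down reuse the previously built main/alternate vectors and charge only the shuffle.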
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } - if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && - allSameBlock(VL) && - !isa<ScalableVectorType>( - cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + if ((E->getOpcode() == Instruction::ExtractElement || + all_of(E->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && + allSameType(VL)) { // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. SmallVector<int> Mask; @@ -4738,7 +4806,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0)); })); - if (isa<UndefValue>(FirstInsert->getOperand(0))) { + if (isUndefVector(FirstInsert->getOperand(0))) { Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); } else { SmallVector<int> InsertMask(NumElts); @@ -5016,7 +5084,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. InstructionCost VecCost = 0; - if (Instruction::isBinaryOp(E->getOpcode())) { + // Try to find the previous shuffle node with the same operands and same + // main/alternate ops. + auto &&TryFindNodeWithEqualOperands = [this, E]() { + for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { + if (TE.get() == E) + break; + if (TE->isAltShuffle() && + ((TE->getOpcode() == E->getOpcode() && + TE->getAltOpcode() == E->getAltOpcode()) || + (TE->getOpcode() == E->getAltOpcode() && + TE->getAltOpcode() == E->getOpcode())) && + TE->hasEqualOperands(*E)) + return true; + } + return false; + }; + if (TryFindNodeWithEqualOperands()) { + LLVM_DEBUG({ + dbgs() << "SLP: diamond match for alternate node found.\n"; + E->dump(); + }); + // No need to add new vector costs here since we're going to reuse + // same main/alternate vector ops, just do different shuffling. + } else if (Instruction::isBinaryOp(E->getOpcode())) { VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); @@ -5060,7 +5151,11 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { [this](Value *V) { return EphValues.contains(V); }) && (allConstant(TE->Scalars) || isSplat(TE->Scalars) || TE->Scalars.size() < Limit || - (TE->getOpcode() == Instruction::ExtractElement && + ((TE->getOpcode() == Instruction::ExtractElement || + all_of(TE->Scalars, + [](Value *V) { + return isa<ExtractElementInst, UndefValue>(V); + })) && isFixedVectorShuffle(TE->Scalars, Mask)) || (TE->State == TreeEntry::NeedToGather && TE->getOpcode() == Instruction::Load && !TE->isAltShuffle())); @@ -5280,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } +/// Check if two insertelement instructions are from the same buildvector. +static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, + InsertElementInst *V) { + // Instructions must be from the same basic blocks. + if (VU->getParent() != V->getParent()) + return false; + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + // Multiple used inserts are separate nodes. 
+ if (!VU->hasOneUse() && !V->hasOneUse()) + return false; + auto *IE1 = VU; + auto *IE2 = V; + // Go through the vector operand of insertelement instructions trying to find + // either VU as the original vector for IE2 or V as the original vector for + // IE1. + do { + if (IE2 == VU || IE1 == V) + return true; + if (IE1) { + if (IE1 != VU && !IE1->hasOneUse()) + IE1 = nullptr; + else + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + } + if (IE2) { + if (IE2 != V && !IE2->hasOneUse()) + IE2 = nullptr; + else + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } + } while (IE1 || IE2); + return false; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -5306,7 +5437,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) + if (!isa_and_nonnull<InsertElementInst>(EU.User) && + !ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be @@ -5326,35 +5458,35 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { // If found user is an insertelement, do not calculate extract cost but try // to detect it as a final shuffled/identity match. - if (isa_and_nonnull<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) { + Optional<int> InsertIdx = getInsertIndex(VU, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - Value *VU = EU.User; auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go through of insertelement instructions trying to find either VU - // as the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; + return areTwoInsertFromSameBuildVector(VU, + cast<InsertElementInst>(V)); }); int VecId = -1; if (It == FirstUsers.end()) { VF.push_back(FTy->getNumElements()); ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa<InsertElementInst>(Base)) { + // Build the mask for the vectorized insertelement instructions. 
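// Illustrative aside, not taken from the patch: the ShuffleMask rows built
// here map each element of the insertelement destination vector to the lane
// of the vectorized node that provides it, with UndefMaskElem (-1) marking
// elements the tree does not touch. For instance, scalars vectorized in
// lanes 0 and 1 that feed inserts at indices 3 and 1 of a 4-wide vector give
// the mask {-1, 1, -1, 0}. A standalone sketch of that construction (the
// pair list stands in for the insert-index/lane data collected above):
#include <utility>
#include <vector>

std::vector<int> buildInsertShuffleMask(
    unsigned NumElts,
    const std::vector<std::pair<unsigned, int>> &InsertIdxAndLane) {
  std::vector<int> Mask(NumElts, /*UndefMaskElem*/ -1);
  for (const auto &IdxAndLane : InsertIdxAndLane)
    Mask[IdxAndLane.first] = IdxAndLane.second; // Element comes from this lane.
  return Mask;
}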
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast<InsertElementInst>(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast<InsertElementInst>(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast<InsertElementInst>(Base)->getOperand(0); + } + FirstUsers.push_back(VU); DemandedElts.push_back(APInt::getZero(VF.back())); VecId = FirstUsers.size() - 1; } else { @@ -5363,6 +5495,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; DemandedElts[VecId].setBit(Idx); + continue; } } @@ -5386,47 +5519,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + if (FirstUsers.size() == 1) { + int Limit = ShuffleMask.front().size() * 2; + if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + cast<FixedVectorType>(FirstUsers.front()->getType()), + ShuffleMask.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement external users " << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; - continue; } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); + cast<FixedVectorType>(FirstUsers.front()->getType()), + DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); Cost -= InsertCost; + } else if (FirstUsers.size() >= 2) { + unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); + // Combined masks of the first 2 vectors. 
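// Illustrative aside, not taken from the patch: a two-source permute mask
// addresses the first source with 0..VF-1 and the second source with
// VF..2*VF-1, so combining the per-user masks below amounts to keeping the
// first mask as-is and biasing every defined element of the second mask by
// VF. For example, with VF = 4, masks {0, -1, 2, -1} and {-1, 1, -1, 3}
// combine to {0, 5, 2, 7}. A standalone sketch of that combination:
#include <vector>

std::vector<int> combineTwoSourceMasks(const std::vector<int> &First,
                                       const std::vector<int> &Second,
                                       unsigned VF) {
  std::vector<int> Combined(First);
  Combined.resize(VF, /*UndefMaskElem*/ -1);
  for (unsigned I = 0, E = static_cast<unsigned>(Second.size());
       I < E && I < VF; ++I)
    if (Second[I] != -1)
      Combined[I] = Second[I] + static_cast<int>(VF); // Second-source lane.
  return Combined;
}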
+ SmallVector<int> CombinedMask(MaxVF, UndefMaskElem); + copy(ShuffleMask.front(), CombinedMask.begin()); + APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); + auto *VecTy = FixedVectorType::get( + cast<VectorType>(FirstUsers.front()->getType())->getElementType(), + MaxVF); + for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { + if (ShuffleMask[1][I] != UndefMaskElem) { + CombinedMask[I] = ShuffleMask[1][I] + MaxVF; + CombinedDemandedElts.setBit(I); + } + } + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost << " for insertelements gather.\n" << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + for (int I = 2, E = FirstUsers.size(); I < E; ++I) { + // Other elements - permutation of 2 vectors (the initial one and the + // next Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int Mask = ShuffleMask[I][Idx]; + if (Mask != UndefMaskElem) + CombinedMask[Idx] = MaxVF + Mask; + else if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + } + for (unsigned Idx = VF; Idx < MaxVF; ++Idx) + if (CombinedMask[Idx] != UndefMaskElem) + CombinedMask[Idx] = Idx; + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost -= InsertCost; + } } #ifndef NDEBUG @@ -5728,7 +5900,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) { auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) return Vec; - GatherSeq.insert(InsElt); + GatherShuffleSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (TreeEntry *Entry = getTreeEntry(V)) { @@ -5771,10 +5943,17 @@ class ShuffleInstructionBuilder { const unsigned VF = 0; bool IsFinalized = false; SmallVector<int, 4> Mask; + /// Holds all of the instructions that we gathered. + SetVector<Instruction *> &GatherShuffleSeq; + /// A list of blocks that we are going to CSE. + SetVector<BasicBlock *> &CSEBlocks; public: - ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) - : Builder(Builder), VF(VF) {} + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF, + SetVector<Instruction *> &GatherShuffleSeq, + SetVector<BasicBlock *> &CSEBlocks) + : Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq), + CSEBlocks(CSEBlocks) {} /// Adds a mask, inverting it before applying. 
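// Illustrative aside, not taken from the patch: "inverting" a mask here means
// turning the mapping "output lane I is taken from input lane SubMask[I]"
// into the permutation that sends each input lane back to its original
// position, i.e. NewMask[SubMask[I]] = I. For SubMask = {2, 0, 1} the
// inverted mask is {1, 2, 0}. A standalone sketch of that transposition
// (assuming SubMask is a full permutation with no undef entries):
#include <vector>

std::vector<int> invertPermutationMask(const std::vector<unsigned> &SubMask) {
  std::vector<int> NewMask(SubMask.size(), /*UndefMaskElem*/ -1);
  for (unsigned I = 0, E = SubMask.size(); I < E; ++I)
    NewMask[SubMask[I]] = static_cast<int>(I);
  return NewMask;
}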
void addInversedMask(ArrayRef<unsigned> SubMask) { @@ -5804,7 +5983,12 @@ public: if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); + Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle"); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + return Vec; } ~ShuffleInstructionBuilder() { @@ -5862,6 +6046,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { std::iota(UniformMask.begin(), UniformMask.end(), 0); V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } return V; } @@ -5909,15 +6097,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { VL = UniqueValues; } - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { ShuffleBuilder.addMask(ReuseShuffleIndicies); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } return Vec; } @@ -5932,7 +6117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); unsigned VF = E->getVectorFactor(); - ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq, + CSEBlocks); if (E->State == TreeEntry::NeedToGather) { if (E->getMainOp()) setInsertPointAfterBundle(E); @@ -5946,16 +6132,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { "Expected shuffle of 1 or 2 entries."); Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + if (auto *I = dyn_cast<Instruction>(Vec)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } else { Vec = gather(E->Scalars); } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); - if (auto *I = dyn_cast<Instruction>(Vec)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = Vec; return Vec; @@ -6072,11 +6258,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IsIdentity &= *InsertIdx - Offset == I; Mask[*InsertIdx - Offset] = I; } - if (!IsIdentity || NumElts != NumScalars) + if (!IsIdentity || NumElts != NumScalars) { V = Builder.CreateShuffleVector(V, Mask); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } if ((!IsIdentity || Offset != 0 || - !isa<UndefValue>(FirstInsert->getOperand(0))) && + !isUndefVector(FirstInsert->getOperand(0))) && NumElts != NumScalars) { SmallVector<int> InsertMask(NumElts); std::iota(InsertMask.begin(), InsertMask.end(), 0); @@ -6088,6 +6279,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V = Builder.CreateShuffleVector( FirstInsert->getOperand(0), V, InsertMask, cast<Instruction>(E->Scalars.back())->getName()); + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } ++NumVectorInstructions; @@ -6444,6 +6639,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V1 = Builder.CreateCast( static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } + // Add V0 and V1 to later analysis to try to find and remove matching + // instruction, if any. 
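// Illustrative aside, not taken from the patch: for an alternate-opcode node
// the vectorizer emits both whole-vector operations (V0 with the main
// opcode, V1 with the alternate one) and then blends them with a shuffle
// whose mask takes lane I from V0 when scalar I used the main opcode and
// from V1 (indices offset by VF) otherwise. A reduced sketch of that
// blend-mask construction:
#include <vector>

std::vector<int> buildAltOpcodeBlendMask(const std::vector<bool> &UsesAltOp) {
  const int VF = static_cast<int>(UsesAltOp.size());
  std::vector<int> Mask(UsesAltOp.size());
  for (int I = 0; I < VF; ++I)
    Mask[I] = UsesAltOp[I] ? VF + I : I; // Pick the lane from V1 or V0.
  return Mask;
}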
+ for (Value *V : {V0, V1}) { + if (auto *I = dyn_cast<Instruction>(V)) { + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } + } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to @@ -6462,8 +6665,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { propagateIRFlags(V1, AltScalars); Value *V = Builder.CreateShuffleVector(V0, V1, Mask); - if (Instruction *I = dyn_cast<Instruction>(V)) + if (auto *I = dyn_cast<Instruction>(V)) { V = propagateMetadata(I, E->Scalars); + GatherShuffleSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6657,10 +6863,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } void BoUpSLP::optimizeGatherSequence() { - LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. - for (Instruction *I : GatherSeq) { + for (Instruction *I : GatherShuffleSeq) { if (isDeleted(I)) continue; @@ -6677,11 +6883,10 @@ void BoUpSLP::optimizeGatherSequence() { // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't // hoist this instruction. - auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); - if (Op0 && L->contains(Op0)) - continue; - if (Op1 && L->contains(Op1)) + if (any_of(I->operands(), [L](Value *V) { + auto *OpI = dyn_cast<Instruction>(V); + return OpI && L->contains(OpI); + })) continue; // We can hoist this instruction. Move it to the pre-header. @@ -6705,7 +6910,50 @@ void BoUpSLP::optimizeGatherSequence() { return A->getDFSNumIn() < B->getDFSNumIn(); }); - // Perform O(N^2) search over the gather sequences and merge identical + // Less defined shuffles can be replaced by the more defined copies. + // Between two shuffles one is less defined if it has the same vector operands + // and its mask indeces are the same as in the first one or undefs. E.g. + // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0, + // poison, <0, 0, 0, 0>. + auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2, + SmallVectorImpl<int> &NewMask) { + if (I1->getType() != I2->getType()) + return false; + auto *SI1 = dyn_cast<ShuffleVectorInst>(I1); + auto *SI2 = dyn_cast<ShuffleVectorInst>(I2); + if (!SI1 || !SI2) + return I1->isIdenticalTo(I2); + if (SI1->isIdenticalTo(SI2)) + return true; + for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) + if (SI1->getOperand(I) != SI2->getOperand(I)) + return false; + // Check if the second instruction is more defined than the first one. + NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end()); + ArrayRef<int> SM1 = SI1->getShuffleMask(); + // Count trailing undefs in the mask to check the final number of used + // registers. + unsigned LastUndefsCnt = 0; + for (int I = 0, E = NewMask.size(); I < E; ++I) { + if (SM1[I] == UndefMaskElem) + ++LastUndefsCnt; + else + LastUndefsCnt = 0; + if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem && + NewMask[I] != SM1[I]) + return false; + if (NewMask[I] == UndefMaskElem) + NewMask[I] = SM1[I]; + } + // Check if the last undefs actually change the final number of used vector + // registers. 
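// Illustrative aside, not taken from the patch: the lambda above treats one
// shuffle as replaceable by another over the same operands when every lane on
// which both masks are defined agrees; the surviving shuffle then inherits the
// defined lanes of the one that gets removed. For example, {0, -1, 2, -1} and
// {0, 1, 2, -1} are compatible and merge to {0, 1, 2, -1}. A standalone
// sketch of that compare-and-merge step on plain masks:
#include <cstddef>
#include <vector>

bool mergeCompatibleMasks(const std::vector<int> &From, std::vector<int> &Into) {
  if (From.size() != Into.size())
    return false;
  for (std::size_t I = 0; I < From.size(); ++I) {
    if (From[I] != -1 && Into[I] != -1 && From[I] != Into[I])
      return false; // Defined lanes disagree: the shuffles are not mergeable.
    if (Into[I] == -1)
      Into[I] = From[I]; // Fill an undef lane from the other mask.
  }
  return true;
}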
+ return SM1.size() - LastUndefsCnt > 1 && + TTI->getNumberOfParts(SI1->getType()) == + TTI->getNumberOfParts( + FixedVectorType::get(SI1->getType()->getElementType(), + SM1.size() - LastUndefsCnt)); + }; + // Perform O(N^2) search over the gather/shuffle sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector<Instruction *, 16> Visited; @@ -6719,17 +6967,35 @@ void BoUpSLP::optimizeGatherSequence() { if (isDeleted(&In)) continue; if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) && - !isa<ShuffleVectorInst>(&In)) + !isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the // visited instructions. bool Replaced = false; - for (Instruction *v : Visited) { - if (In.isIdenticalTo(v) && - DT->dominates(v->getParent(), In.getParent())) { - In.replaceAllUsesWith(v); + for (Instruction *&V : Visited) { + SmallVector<int> NewMask; + if (IsIdenticalOrLessDefined(&In, V, NewMask) && + DT->dominates(V->getParent(), In.getParent())) { + In.replaceAllUsesWith(V); eraseInstruction(&In); + if (auto *SI = dyn_cast<ShuffleVectorInst>(V)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + Replaced = true; + break; + } + if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) && + GatherShuffleSeq.contains(V) && + IsIdenticalOrLessDefined(V, &In, NewMask) && + DT->dominates(In.getParent(), V->getParent())) { + In.moveAfter(V); + V->replaceAllUsesWith(&In); + eraseInstruction(V); + if (auto *SI = dyn_cast<ShuffleVectorInst>(&In)) + if (!NewMask.empty()) + SI->setShuffleMask(NewMask); + V = &In; Replaced = true; break; } @@ -6741,7 +7007,7 @@ void BoUpSLP::optimizeGatherSequence() { } } CSEBlocks.clear(); - GatherSeq.clear(); + GatherShuffleSeq.clear(); } // Groups the instructions to a bundle (which is then a single scheduling entity) @@ -8791,6 +9057,8 @@ private: assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + assert(RdxKind != RecurKind::FMulAdd && + "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, @@ -9123,8 +9391,9 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, SmallVector<Value *, 16> BuildVectorOpds; SmallVector<int> Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of(BuildVectorOpds, - [](Value *V) { return isa<ExtractElementInst>(V); }) && + (llvm::all_of( + BuildVectorOpds, + [](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) && isFixedVectorShuffle(BuildVectorOpds, Mask))) return false; @@ -9132,44 +9401,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return tryToVectorizeList(BuildVectorInsts, R); } -bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, - bool AtTerminator) { - bool OpsChanged = false; - SmallVector<Instruction *, 4> PostponedCmps; - for (auto *I : reverse(Instructions)) { - if (R.isDeleted(I)) - continue; - if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, 
BB, R); - else if (isa<CmpInst>(I)) - PostponedCmps.push_back(I); - } - if (AtTerminator) { - // Try to find reductions first. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - for (Value *Op : I->operands()) - OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); - } - // Try to vectorize operands as vector bundles. - for (Instruction *I : PostponedCmps) { - if (R.isDeleted(I)) - continue; - OpsChanged |= tryToVectorize(I, R); - } - Instructions.clear(); - } else { - // Insert in reverse order since the PostponedCmps vector was filled in - // reverse order. - Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); - } - return OpsChanged; -} - template <typename T> static bool tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, @@ -9242,6 +9473,101 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +bool SLPVectorizerPass::vectorizeSimpleInstructions( + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, + bool AtTerminator) { + bool OpsChanged = false; + SmallVector<Instruction *, 4> PostponedCmps; + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) + continue; + if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) + OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); + else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) + OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); + else if (isa<CmpInst>(I)) + PostponedCmps.push_back(I); + } + if (AtTerminator) { + // Try to find reductions first. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + for (Value *Op : I->operands()) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); + } + // Try to vectorize operands as vector bundles. + for (Instruction *I : PostponedCmps) { + if (R.isDeleted(I)) + continue; + OpsChanged |= tryToVectorize(I, R); + } + // Try to vectorize list of compares. + // Sort by type, compare predicate, etc. + // TODO: Add analysis on the operand opcodes (profitable to vectorize + // instructions with same/alternate opcodes/const values). 
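// Illustrative aside, not taken from the patch: the two lambdas defined below
// are handed to tryToVectorizeSequence, which sorts the postponed compares so
// that instructions over the same operand type with the same (or swapped)
// predicate become neighbours and then passes maximal runs of compatible
// neighbours to the list vectorizer. The generic sort-then-group pattern,
// with the SLP-specific retry and register-size handling omitted:
#include <algorithm>
#include <functional>
#include <vector>

template <typename T>
void forEachCompatibleRun(
    std::vector<T> &Items,
    const std::function<bool(const T &, const T &)> &Less,
    const std::function<bool(const T &, const T &)> &Compatible,
    const std::function<void(const std::vector<T> &)> &Visit) {
  std::stable_sort(Items.begin(), Items.end(), Less);
  std::vector<T> Run;
  for (const T &Item : Items) {
    if (!Run.empty() && !Compatible(Run.back(), Item)) {
      Visit(Run); // A maximal run of mutually compatible neighbours.
      Run.clear();
    }
    Run.push_back(Item);
  }
  if (!Run.empty())
    Visit(Run);
}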
+ auto &&CompareSorter = [&R](Value *V, Value *V2) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return true; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + return CI1->getPredicate() < CI2->getPredicate() || + (CI1->getPredicate() > CI2->getPredicate() && + CI1->getPredicate() < + CmpInst::getSwappedPredicate(CI2->getPredicate())); + }; + + auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + if (V1 == V2) + return true; + auto *CI1 = cast<CmpInst>(V1); + auto *CI2 = cast<CmpInst>(V2); + if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) + return false; + return CI1->getPredicate() == CI2->getPredicate() || + CI1->getPredicate() == + CmpInst::getSwappedPredicate(CI2->getPredicate()); + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; + + SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end()); + OpsChanged |= tryToVectorizeSequence<Value>( + Vals, Limit, CompareSorter, AreCompatibleCompares, + [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) { + // Exclude possible reductions from other blocks. + bool ArePossiblyReducedInOtherBlock = + any_of(Candidates, [](Value *V) { + return any_of(V->users(), [V](User *U) { + return isa<SelectInst>(U) && + cast<SelectInst>(U)->getParent() != + cast<Instruction>(V)->getParent(); + }); + }); + if (ArePossiblyReducedInOtherBlock) + return false; + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Instructions.clear(); + } else { + // Insert in reverse order since the PostponedCmps vector was filled in + // reverse order. + Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); + } + return OpsChanged; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 638467f94e1c..44b5e1df0839 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -718,6 +718,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, void VPInstruction::execute(VPTransformState &State) { assert(!State.Instance && "VPInstruction executing an Instance"); + IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); + State.Builder.setFastMathFlags(FMF); for (unsigned Part = 0; Part < State.UF; ++Part) generateInstruction(State, Part); } @@ -760,6 +762,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << Instruction::getOpcodeName(getOpcode()); } + O << FMF; + for (const VPValue *Operand : operands()) { O << " "; Operand->printAsOperand(O, SlotTracker); @@ -767,6 +771,16 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { + // Make sure the VPInstruction is a floating-point operation. 
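// Illustrative aside, not taken from the patch: VPInstruction::execute above
// scopes the flags with IRBuilderBase::FastMathFlagGuard so that the
// builder's fast-math flags are set only while this recipe generates code and
// are restored afterwards. The same save-and-restore pattern, reduced to a
// standalone RAII guard over a plain flags value:
struct FastMathState {
  bool AllowReassoc = false;
  bool NoNaNs = false;
  bool NoInfs = false;
};

class FastMathStateGuard {
  FastMathState &State;
  FastMathState Saved;

public:
  explicit FastMathStateGuard(FastMathState &S) : State(S), Saved(S) {}
  ~FastMathStateGuard() { State = Saved; } // Restore flags on scope exit.
};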
+ assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::FNeg || Opcode == Instruction::FSub || + Opcode == Instruction::FDiv || Opcode == Instruction::FRem || + Opcode == Instruction::FCmp) && + "this op can't take fast-math flags"); + FMF = FMFNew; +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -1196,8 +1210,10 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = "; getChainOp()->printAsOperand(O, SlotTracker); - O << " + reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) - << " ("; + O << " +"; + if (isa<FPMathOperator>(getUnderlyingInstr())) + O << getUnderlyingInstr()->getFastMathFlags(); + O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; getVecOp()->printAsOperand(O, SlotTracker); if (getCondOp()) { O << ", "; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 00ee31007cb7..810dd5030f95 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -59,6 +59,7 @@ class Value; class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPReplicateRecipe; class VPlanSlp; /// Returns a calculation for the total number of elements for a given \p VF. @@ -346,6 +347,10 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Holds recipes that may generate a poison value that is used after + /// vectorization, even when their operands are not poison. + SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes; }; /// VPUsers instance used by VPBlockBase to manage CondBit and the block @@ -789,6 +794,7 @@ public: private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; + FastMathFlags FMF; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. @@ -802,13 +808,6 @@ public: : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} - VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { - for (auto *I : Operands) - addOperand(I->getVPSingleValue()); - } - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} @@ -870,6 +869,9 @@ public: return true; } } + + /// Set the fast-math flags. + void setFastMathFlags(FastMathFlags FMFNew); }; /// VPWidenRecipe is a recipe for producing a copy of vector type its @@ -1511,7 +1513,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue { Instruction &Ingredient; // Whether the loaded-from / stored-to addresses are consecutive. 
@@ -1533,10 +1535,10 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, bool Consecutive, bool Reverse) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), + VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } @@ -1544,6 +1546,7 @@ public: VPValue *StoredValue, VPValue *Mask, bool Consecutive, bool Reverse) : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), + VPValue(VPValue::VPVMemoryInstructionSC, &Store, this), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); setMask(Mask); |