Diffstat (limited to 'contrib/llvm-project/llvm/lib')
25 files changed, 1221 insertions, 218 deletions
diff --git a/contrib/llvm-project/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm-project/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 74664098ce1d..33f122728d2a 100644 --- a/contrib/llvm-project/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1648,8 +1648,32 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,      }    SmallVector<Value *, 4> V1Srcs; +  // For a recursive phi, that recurses through a contant gep, we can perform +  // aliasing calculations using the other phi operands with an unknown size to +  // specify that an unknown number of elements after the initial value are +  // potentially accessed.    bool isRecursive = false; -  if (PV)  { +  auto CheckForRecPhi = [&](Value *PV) { +    if (!EnableRecPhiAnalysis) +      return false; +    if (GEPOperator *PVGEP = dyn_cast<GEPOperator>(PV)) { +      // Check whether the incoming value is a GEP that advances the pointer +      // result of this PHI node (e.g. in a loop). If this is the case, we +      // would recurse and always get a MayAlias. Handle this case specially +      // below. We need to ensure that the phi is inbounds and has a constant +      // positive operand so that we can check for alias with the initial value +      // and an unknown but positive size. +      if (PVGEP->getPointerOperand() == PN && PVGEP->isInBounds() && +          PVGEP->getNumIndices() == 1 && isa<ConstantInt>(PVGEP->idx_begin()) && +          !cast<ConstantInt>(PVGEP->idx_begin())->isNegative()) { +        isRecursive = true; +        return true; +      } +    } +    return false; +  }; + +  if (PV) {      // If we have PhiValues then use it to get the underlying phi values.      const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN);      // If we have more phi values than the search depth then return MayAlias @@ -1660,19 +1684,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,        return MayAlias;      // Add the values to V1Srcs      for (Value *PV1 : PhiValueSet) { -      if (EnableRecPhiAnalysis) { -        if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { -          // Check whether the incoming value is a GEP that advances the pointer -          // result of this PHI node (e.g. in a loop). If this is the case, we -          // would recurse and always get a MayAlias. Handle this case specially -          // below. -          if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && -              isa<ConstantInt>(PV1GEP->idx_begin())) { -            isRecursive = true; -            continue; -          } -        } -      } +      if (CheckForRecPhi(PV1)) +        continue;        V1Srcs.push_back(PV1);      }    } else { @@ -1687,18 +1700,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,          // and 'n' are the number of PHI sources.          return MayAlias; -      if (EnableRecPhiAnalysis) -        if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { -          // Check whether the incoming value is a GEP that advances the pointer -          // result of this PHI node (e.g. in a loop). If this is the case, we -          // would recurse and always get a MayAlias. Handle this case specially -          // below. 
-          if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && -              isa<ConstantInt>(PV1GEP->idx_begin())) { -            isRecursive = true; -            continue; -          } -        } +      if (CheckForRecPhi(PV1)) +        continue;        if (UniqueSrc.insert(PV1).second)          V1Srcs.push_back(PV1); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 6c5ef0255a08..204fb556d810 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -220,6 +220,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {          continue;        if (StackProtectorFI == (int)i)          continue; +      if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) +        continue;        switch (MFI.getObjectSSPLayout(i)) {        case MachineFrameInfo::SSPLK_None: @@ -254,6 +256,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {        continue;      if (ProtectedObjs.count(i))        continue; +    if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) +      continue;      AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);    } diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f14b3dba4f31..ec384d2a7c56 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11372,9 +11372,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {        // Stop if more than one members are non-undef.        if (NumDefs > 1)          break; +        VTs.push_back(EVT::getVectorVT(*DAG.getContext(),                                       VT.getVectorElementType(), -                                     X.getValueType().getVectorNumElements())); +                                     X.getValueType().getVectorElementCount()));      }      if (NumDefs == 0) @@ -18795,6 +18796,11 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {  static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {    EVT VT = N->getValueType(0);    EVT OpVT = N->getOperand(0).getValueType(); + +  // We currently can't generate an appropriate shuffle for a scalable vector. +  if (VT.isScalableVector()) +    return SDValue(); +    int NumElts = VT.getVectorNumElements();    int NumOpElts = OpVT.getVectorNumElements(); @@ -19055,11 +19061,14 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {      return V;    // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR -  // nodes often generate nop CONCAT_VECTOR nodes. -  // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that -  // place the incoming vectors at the exact same location. +  // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR +  // operands and look for a CONCAT operations that place the incoming vectors +  // at the exact same location. +  // +  // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.    
SDValue SingleSource = SDValue(); -  unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); +  unsigned PartNumElem = +      N->getOperand(0).getValueType().getVectorMinNumElements();    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {      SDValue Op = N->getOperand(i); @@ -19181,7 +19190,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {    // The binop must be a vector type, so we can extract some fraction of it.    EVT WideBVT = BinOp.getValueType(); -  if (!WideBVT.isVector()) +  // The optimisations below currently assume we are dealing with fixed length +  // vectors. It is possible to add support for scalable vectors, but at the +  // moment we've done no analysis to prove whether they are profitable or not. +  if (!WideBVT.isFixedLengthVector())      return SDValue();    EVT VT = Extract->getValueType(0); diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 414ba25ffd5f..c81d03cac81b 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2151,7 +2151,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {    EVT InVT = Lo.getValueType();    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), -                               InVT.getVectorNumElements()); +                               InVT.getVectorElementCount());    if (N->isStrictFPOpcode()) {      Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },  @@ -2197,13 +2197,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {    SDValue Idx = N->getOperand(1);    SDLoc dl(N);    SDValue Lo, Hi; + +  if (SubVT.isScalableVector() != +      N->getOperand(0).getValueType().isScalableVector()) +    report_fatal_error("Extracting a fixed-length vector from an illegal " +                       "scalable vector is not yet supported"); +    GetSplitVector(N->getOperand(0), Lo, Hi); -  uint64_t LoElts = Lo.getValueType().getVectorNumElements(); +  uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();    if (IdxVal < LoElts) { -    assert(IdxVal + SubVT.getVectorNumElements() <= LoElts && +    assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts &&             "Extracted subvector crosses vector split!");      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);    } else { @@ -2559,13 +2565,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {    SDValue InVec = N->getOperand(OpNo);    EVT InVT = InVec->getValueType(0);    EVT OutVT = N->getValueType(0); -  unsigned NumElements = OutVT.getVectorNumElements(); +  ElementCount NumElements = OutVT.getVectorElementCount();    bool IsFloat = OutVT.isFloatingPoint(); -  // Widening should have already made sure this is a power-two vector -  // if we're trying to split it at all. assert() that's true, just in case. -  assert(!(NumElements & 1) && "Splitting vector, but not in half!"); -    unsigned InElementSize = InVT.getScalarSizeInBits();    unsigned OutElementSize = OutVT.getScalarSizeInBits(); @@ -2595,6 +2597,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {    GetSplitVector(InVec, InLoVec, InHiVec);    // Truncate them to 1/2 the element size. 
+  // +  // This assumes the number of elements is a power of two; any vector that +  // isn't should be widened, not split.    EVT HalfElementVT = IsFloat ?      EVT::getFloatingPointVT(InElementSize/2) :      EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); @@ -3605,16 +3610,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {    EVT InVT = N->getOperand(0).getValueType();    EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));    SDLoc dl(N); -  unsigned WidenNumElts = WidenVT.getVectorNumElements(); -  unsigned NumInElts = InVT.getVectorNumElements();    unsigned NumOperands = N->getNumOperands();    bool InputWidened = false; // Indicates we need to widen the input.    if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { -    if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { +    unsigned WidenNumElts = WidenVT.getVectorMinNumElements(); +    unsigned NumInElts = InVT.getVectorMinNumElements(); +    if (WidenNumElts % NumInElts == 0) {        // Add undef vectors to widen to correct length. -      unsigned NumConcat = WidenVT.getVectorNumElements() / -                           InVT.getVectorNumElements(); +      unsigned NumConcat = WidenNumElts / NumInElts;        SDValue UndefVal = DAG.getUNDEF(InVT);        SmallVector<SDValue, 16> Ops(NumConcat);        for (unsigned i=0; i < NumOperands; ++i) @@ -3638,6 +3642,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {          return GetWidenedVector(N->getOperand(0));        if (NumOperands == 2) { +        assert(!WidenVT.isScalableVector() && +               "Cannot use vector shuffles to widen CONCAT_VECTOR result"); +        unsigned WidenNumElts = WidenVT.getVectorNumElements(); +        unsigned NumInElts = InVT.getVectorNumElements(); +          // Replace concat of two operands with a shuffle.          SmallVector<int, 16> MaskOps(WidenNumElts, -1);          for (unsigned i = 0; i < NumInElts; ++i) { @@ -3652,6 +3661,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {      }    } +  assert(!WidenVT.isScalableVector() && +         "Cannot use build vectors to widen CONCAT_VECTOR result"); +  unsigned WidenNumElts = WidenVT.getVectorNumElements(); +  unsigned NumInElts = InVT.getVectorNumElements(); +    // Fall back to use extracts and build vector.    EVT EltVT = WidenVT.getVectorElementType();    SmallVector<SDValue, 16> Ops(WidenNumElts); @@ -4913,7 +4927,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,    int LdWidth = LdVT.getSizeInBits();    int WidthDiff = WidenWidth - LdWidth; -  // Allow wider loads. +  // Allow wider loads if they are sufficiently aligned to avoid memory faults +  // and if the original load is simple.    unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();    // Find the vector type that can load from. @@ -4965,19 +4980,6 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,                        LD->getPointerInfo().getWithOffset(Offset),                        LD->getOriginalAlign(), MMOFlags, AAInfo);        LdChain.push_back(L.getValue(1)); -      if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) { -        // Later code assumes the vector loads produced will be mergeable, so we -        // must pad the final entry up to the previous width. Scalars are -        // combined separately. 
-        SmallVector<SDValue, 16> Loads; -        Loads.push_back(L); -        unsigned size = L->getValueSizeInBits(0); -        while (size < LdOp->getValueSizeInBits(0)) { -          Loads.push_back(DAG.getUNDEF(L->getValueType(0))); -          size += L->getValueSizeInBits(0); -        } -        L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); -      }      } else {        L = DAG.getLoad(NewVT, dl, Chain, BasePtr,                        LD->getPointerInfo().getWithOffset(Offset), @@ -5018,8 +5020,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,      EVT NewLdTy = LdOps[i].getValueType();      if (NewLdTy != LdTy) {        // Create a larger vector. +      unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits(); +      assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0); +      SmallVector<SDValue, 16> WidenOps(NumOps); +      unsigned j = 0; +      for (; j != End-Idx; ++j) +        WidenOps[j] = ConcatOps[Idx+j]; +      for (; j != NumOps; ++j) +        WidenOps[j] = DAG.getUNDEF(LdTy); +        ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, -                                     makeArrayRef(&ConcatOps[Idx], End - Idx)); +                                     WidenOps);        Idx = End - 1;        LdTy = NewLdTy;      } diff --git a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp index 4796ef531054..8e7bf1eb0169 100644 --- a/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -375,6 +375,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,    COFFSymbol *Local = nullptr;    if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {      Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; +    Sym->Section = nullptr;      COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);      if (!WeakDefault) { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index efa3fd5ca9ce..4789a9f02937 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1192,7 +1192,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,    // Process the SVE callee-saves to determine what space needs to be    // allocated. -  if (AFI->getSVECalleeSavedStackSize()) { +  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {      // Find callee save instructions in frame.      CalleeSavesBegin = MBBI;      assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1200,11 +1200,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,        ++MBBI;      CalleeSavesEnd = MBBI; -    int64_t OffsetToFirstCalleeSaveFromSP = -        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); -    StackOffset OffsetToCalleeSavesFromSP = -        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; -    AllocateBefore -= OffsetToCalleeSavesFromSP; +    AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};      AllocateAfter = SVEStackSize - AllocateBefore;    } @@ -1582,7 +1578,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,    // deallocated.    
StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;    MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; -  if (AFI->getSVECalleeSavedStackSize()) { +  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {      RestoreBegin = std::prev(RestoreEnd);;      while (IsSVECalleeSave(RestoreBegin) &&             RestoreBegin != MBB.begin()) @@ -1592,23 +1588,21 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,      assert(IsSVECalleeSave(RestoreBegin) &&             IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); -    int64_t OffsetToFirstCalleeSaveFromSP = -        MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); -    StackOffset OffsetToCalleeSavesFromSP = -        StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; -    DeallocateBefore = OffsetToCalleeSavesFromSP; -    DeallocateAfter = SVEStackSize - DeallocateBefore; +    StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8}; +    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; +    DeallocateAfter = CalleeSavedSizeAsOffset;    }    // Deallocate the SVE area.    if (SVEStackSize) {      if (AFI->isStackRealigned()) { -      if (AFI->getSVECalleeSavedStackSize()) -        // Set SP to start of SVE area, from which the callee-save reloads -        // can be done. The code below will deallocate the stack space +      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) +        // Set SP to start of SVE callee-save area from which they can +        // be reloaded. The code below will deallocate the stack space          // space by moving FP -> SP.          emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, -                        -SVEStackSize, TII, MachineInstr::FrameDestroy); +                        {-CalleeSavedSize, MVT::nxv1i8}, TII, +                        MachineInstr::FrameDestroy);      } else {        if (AFI->getSVECalleeSavedStackSize()) {          // Deallocate the non-SVE locals first before we can deallocate (and @@ -2595,25 +2589,23 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,                                                int &MinCSFrameIndex,                                                int &MaxCSFrameIndex,                                                bool AssignOffsets) { +#ifndef NDEBUG    // First process all fixed stack objects. -  int64_t Offset = 0;    for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) -    if (MFI.getStackID(I) == TargetStackID::SVEVector) { -      int64_t FixedOffset = -MFI.getObjectOffset(I); -      if (FixedOffset > Offset) -        Offset = FixedOffset; -    } +    assert(MFI.getStackID(I) != TargetStackID::SVEVector && +           "SVE vectors should never be passed on the stack by value, only by " +           "reference."); +#endif    auto Assign = [&MFI](int FI, int64_t Offset) {      LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");      MFI.setObjectOffset(FI, Offset);    }; +  int64_t Offset = 0; +    // Then process all callee saved slots.    if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { -    // Make sure to align the last callee save slot. -    MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); -      // Assign offsets to the callee save slots.      
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {        Offset += MFI.getObjectSize(I); @@ -2623,6 +2615,9 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,      }    } +  // Ensure that the Callee-save area is aligned to 16bytes. +  Offset = alignTo(Offset, Align(16U)); +    // Create a buffer of SVE objects to allocate and sort it.    SmallVector<int, 8> ObjectsToAllocate;    for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 9d0a6d9eaf25..444740cb50ab 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -105,6 +105,12 @@ public:      }    } +  bool isStackIdSafeForLocalArea(unsigned StackId) const override { +    // We don't support putting SVE objects into the pre-allocated local +    // frame block at the moment. +    return StackId != TargetStackID::SVEVector; +  } +  private:    bool shouldCombineCSRLocalStackBump(MachineFunction &MF,                                        uint64_t StackBumpBytes) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10c477853353..7799ebfbd68e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -245,7 +245,8 @@ public:                           unsigned SubRegIdx);    void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);    void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); -  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); +  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, +                            unsigned Opc_rr, unsigned Opc_ri);    bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);    /// SVE Reg+Imm addressing mode. @@ -262,14 +263,12 @@ public:    void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);    void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);    void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); -  template <unsigned Scale> -  void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, -                             const unsigned Opc_ri); -  template <unsigned Scale> +  void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale, +                             unsigned Opc_rr, unsigned Opc_ri);    std::tuple<unsigned, SDValue, SDValue> -  findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, -                           const unsigned Opc_ri, const SDValue &OldBase, -                           const SDValue &OldOffset); +  findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri, +                           const SDValue &OldBase, const SDValue &OldOffset, +                           unsigned Scale);    bool tryBitfieldExtractOp(SDNode *N);    bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -1414,12 +1413,12 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,  /// Optimize \param OldBase and \param OldOffset selecting the best addressing  /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the  /// new Base and an SDValue representing the new offset. 
-template <unsigned Scale>  std::tuple<unsigned, SDValue, SDValue> -AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, -                                              const unsigned Opc_ri, +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, +                                              unsigned Opc_ri,                                                const SDValue &OldBase, -                                              const SDValue &OldOffset) { +                                              const SDValue &OldOffset, +                                              unsigned Scale) {    SDValue NewBase = OldBase;    SDValue NewOffset = OldOffset;    // Detect a possible Reg+Imm addressing mode. @@ -1429,21 +1428,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,    // Detect a possible reg+reg addressing mode, but only if we haven't already    // detected a Reg+Imm one.    const bool IsRegReg = -      !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset); +      !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);    // Select the instruction.    return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);  }  void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, -                                               const unsigned Opc) { +                                               unsigned Scale, unsigned Opc_ri, +                                               unsigned Opc_rr) { +  assert(Scale < 4 && "Invalid scaling value.");    SDLoc DL(N);    EVT VT = N->getValueType(0);    SDValue Chain = N->getOperand(0); +  // Optimize addressing mode. +  SDValue Base, Offset; +  unsigned Opc; +  std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( +      N, Opc_rr, Opc_ri, N->getOperand(2), +      CurDAG->getTargetConstant(0, DL, MVT::i64), Scale); +    SDValue Ops[] = {N->getOperand(1), // Predicate -                   N->getOperand(2), // Memory operand -                   CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; +                   Base,             // Memory operand +                   Offset, Chain};    const EVT ResTys[] = {MVT::Untyped, MVT::Other}; @@ -1479,10 +1487,9 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,    ReplaceNode(N, St);  } -template <unsigned Scale>  void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, -                                                const unsigned Opc_rr, -                                                const unsigned Opc_ri) { +                                                unsigned Scale, unsigned Opc_rr, +                                                unsigned Opc_ri) {    SDLoc dl(N);    // Form a REG_SEQUENCE to force register allocation. @@ -1492,9 +1499,9 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,    // Optimize addressing mode.    
unsigned Opc;    SDValue Offset, Base; -  std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>( +  std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(        N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), -      CurDAG->getTargetConstant(0, dl, MVT::i64)); +      CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);    SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate                     Base,                               // address @@ -4085,63 +4092,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {      }      case Intrinsic::aarch64_sve_st2: {        if (VT == MVT::nxv16i8) { -        SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B, -                                           AArch64::ST2B_IMM); +        SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);          return;        } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                   (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -        SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H, -                                           AArch64::ST2H_IMM); +        SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);          return;        } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -        SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W, -                                           AArch64::ST2W_IMM); +        SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);          return;        } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -        SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D, -                                           AArch64::ST2D_IMM); +        SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);          return;        }        break;      }      case Intrinsic::aarch64_sve_st3: {        if (VT == MVT::nxv16i8) { -        SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B, -                                           AArch64::ST3B_IMM); +        SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);          return;        } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                   (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -        SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H, -                                           AArch64::ST3H_IMM); +        SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);          return;        } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -        SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W, -                                           AArch64::ST3W_IMM); +        SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);          return;        } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -        SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D, -                                           AArch64::ST3D_IMM); +        SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);          return;        }        break;      }      case Intrinsic::aarch64_sve_st4: {        if (VT == MVT::nxv16i8) { -        SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B, -                                           AArch64::ST4B_IMM); +        SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);          return;        } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                   (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -        SelectPredicatedStore</*Scale=*/1>(Node, 4, 
AArch64::ST4H, -                                           AArch64::ST4H_IMM); +        SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);          return;        } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -        SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W, -                                           AArch64::ST4W_IMM); +        SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);          return;        } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -        SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D, -                                           AArch64::ST4D_IMM); +        SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);          return;        }        break; @@ -4741,51 +4736,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {    }    case AArch64ISD::SVE_LD2_MERGE_ZERO: {      if (VT == MVT::nxv16i8) { -      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); +      SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);        return;      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                 (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); +      SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);        return;      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); +      SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W);        return;      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); +      SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D);        return;      }      break;    }    case AArch64ISD::SVE_LD3_MERGE_ZERO: {      if (VT == MVT::nxv16i8) { -      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); +      SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);        return;      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                 (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); +      SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);        return;      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); +      SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W);        return;      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); +      SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D);        return;      }      break;    }    case AArch64ISD::SVE_LD4_MERGE_ZERO: {      if (VT == MVT::nxv16i8) { -      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); +      SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);        return;      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||                 (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { -      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); +      SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);        return;      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { -      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); +      SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W);        return;      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { -      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); +      
SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D);        return;      }      break; @@ -4805,10 +4800,14 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,  /// When \p PredVT is a scalable vector predicate in the form  /// MVT::nx<M>xi1, it builds the correspondent scalable vector of -/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input +/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting +/// structured vectors (NumVec >1), the output data type is +/// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input  /// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid  /// EVT. -static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, +                                                unsigned NumVec) { +  assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors.");    if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)      return EVT(); @@ -4818,7 +4817,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) {    ElementCount EC = PredVT.getVectorElementCount();    EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); -  EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); +  EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); +    return MemVT;  } @@ -4842,6 +4842,15 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {      return cast<VTSDNode>(Root->getOperand(3))->getVT();    case AArch64ISD::ST1_PRED:      return cast<VTSDNode>(Root->getOperand(4))->getVT(); +  case AArch64ISD::SVE_LD2_MERGE_ZERO: +    return getPackedVectorTypeFromPredicateType( +        Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2); +  case AArch64ISD::SVE_LD3_MERGE_ZERO: +    return getPackedVectorTypeFromPredicateType( +        Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3); +  case AArch64ISD::SVE_LD4_MERGE_ZERO: +    return getPackedVectorTypeFromPredicateType( +        Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4);    default:      break;    } @@ -4857,7 +4866,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {    // We are using an SVE prefetch intrinsic. Type must be inferred    // from the width of the predicate.    
return getPackedVectorTypeFromPredicateType( -      Ctx, Root->getOperand(2)->getValueType(0)); +      Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);  }  /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 85db14ab66fe..1500da2fdfc7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -932,8 +932,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,          setOperationAction(ISD::SHL, VT, Custom);          setOperationAction(ISD::SRL, VT, Custom);          setOperationAction(ISD::SRA, VT, Custom); -        if (VT.getScalarType() == MVT::i1) +        if (VT.getScalarType() == MVT::i1) {            setOperationAction(ISD::SETCC, VT, Custom); +          setOperationAction(ISD::TRUNCATE, VT, Custom); +          setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); +        }        }      } @@ -8858,6 +8861,16 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,                                               SelectionDAG &DAG) const {    EVT VT = Op.getValueType(); +  if (VT.getScalarType() == MVT::i1) { +    // Lower i1 truncate to `(x & 1) != 0`. +    SDLoc dl(Op); +    EVT OpVT = Op.getOperand(0).getValueType(); +    SDValue Zero = DAG.getConstant(0, dl, OpVT); +    SDValue One = DAG.getConstant(1, dl, OpVT); +    SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); +    return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); +  } +    if (!VT.isVector() || VT.isScalableVector())      return Op; @@ -12288,6 +12301,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {                  "Unsupported opcode.");    SDLoc DL(N);    EVT VT = N->getValueType(0); +  if (VT == MVT::nxv8bf16 && +      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) +    return SDValue();    EVT LoadVT = VT;    if (VT.isFloatingPoint()) @@ -14909,6 +14925,11 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {      if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))        return true; +  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { +    if (isa<ScalableVectorType>(AI->getAllocatedType())) +      return true; +  } +    return false;  } diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 6df7970f4d82..4f4ba692c2db 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXForm<imm, [{  def SImmS16XForm : SDNodeXForm<imm, [{    return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);  }]>; +def SImmS32XForm : SDNodeXForm<imm, [{ +  return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64); +}]>;  // simm6sN predicate - True if the immediate is a multiple of N in the range  // [-32 * N, 31 * N]. 
@@ -546,7 +549,7 @@ def simm4s16 : Operand<i64>, ImmLeaf<i64,    let DecoderMethod = "DecodeSImm<4>";  }  def simm4s32 : Operand<i64>, ImmLeaf<i64, -[{ return Imm >=-256  && Imm <= 224 && (Imm % 32) == 0x0; }]> { +[{ return Imm >=-256  && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> {    let PrintMethod = "printImmScale<32>";    let ParserMatchClass = SImm4s32Operand;    let DecoderMethod = "DecodeSImm<4>"; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5139ae5ccaf1..08f80c9aa361 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2744,6 +2744,35 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,      return;    } +  // Copy a Z register pair by copying the individual sub-registers. +  if (AArch64::ZPR2RegClass.contains(DestReg) && +      AArch64::ZPR2RegClass.contains(SrcReg)) { +    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; +    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, +                     Indices); +    return; +  } + +  // Copy a Z register triple by copying the individual sub-registers. +  if (AArch64::ZPR3RegClass.contains(DestReg) && +      AArch64::ZPR3RegClass.contains(SrcReg)) { +    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, +                                       AArch64::zsub2}; +    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, +                     Indices); +    return; +  } + +  // Copy a Z register quad by copying the individual sub-registers. +  if (AArch64::ZPR4RegClass.contains(DestReg) && +      AArch64::ZPR4RegClass.contains(SrcReg)) { +    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, +                                       AArch64::zsub2, AArch64::zsub3}; +    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, +                     Indices); +    return; +  } +    if (AArch64::GPR64spRegClass.contains(DestReg) &&        (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {      if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 886158ca4490..83a488afc797 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -40,6 +40,14 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)    AArch64_MC::initLLVMToCVRegMapping(this);  } +static bool hasSVEArgsOrReturn(const MachineFunction *MF) { +  const Function &F = MF->getFunction(); +  return isa<ScalableVectorType>(F.getReturnType()) || +         any_of(F.args(), [](const Argument &Arg) { +           return isa<ScalableVectorType>(Arg.getType()); +         }); +} +  const MCPhysReg *  AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {    assert(MF && "Invalid MachineFunction pointer."); @@ -75,6 +83,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {      // This is for OSes other than Windows; Windows is a separate case further      // above.      
return CSR_AArch64_AAPCS_X18_SaveList; +  if (hasSVEArgsOrReturn(MF)) +    return CSR_AArch64_SVE_AAPCS_SaveList;    return CSR_AArch64_AAPCS_SaveList;  } @@ -343,6 +353,15 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {    if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {      if (needsStackRealignment(MF))        return true; + +    if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { +      const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); +      // Frames that have variable sized objects and scalable SVE objects, +      // should always use a basepointer. +      if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) +        return true; +    } +      // Conservatively estimate whether the negative offset from the frame      // pointer will be sufficient to reach. If a function has a smallish      // frame, it's less likely to have lots of spills and callee saved @@ -379,8 +398,15 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {    // (closer to SP).    //    // The beginning works most reliably if we have a frame pointer. +  // In the presence of any non-constant space between FP and locals, +  // (e.g. in case of stack realignment or a scalable SVE area), it is +  // better to use SP or BP.    const AArch64FrameLowering &TFI = *getFrameLowering(MF); -  return TFI.hasFP(MF); +  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); +  assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() || +          AFI->hasCalculatedStackSizeSVE()) && +         "Expected SVE area to be calculated by this point"); +  return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE();  }  bool AArch64RegisterInfo::requiresFrameIndexScavenging( diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 28a54e6f7d79..3449a8bd16d2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1109,6 +1109,28 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio    defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;    defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; +  // Extract lo/hi halves of legal predicate types. +  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), +            (ZIP1_PPP_S PPR:$Ps, (PFALSE))>; +  def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), +            (ZIP2_PPP_S PPR:$Ps, (PFALSE))>; +  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), +            (ZIP1_PPP_H PPR:$Ps, (PFALSE))>; +  def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), +            (ZIP2_PPP_H PPR:$Ps, (PFALSE))>; +  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), +            (ZIP1_PPP_B PPR:$Ps, (PFALSE))>; +  def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), +            (ZIP2_PPP_B PPR:$Ps, (PFALSE))>; + +  // Concatenate two predicates. 
+  def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), +            (UZP1_PPP_S $p1, $p2)>; +  def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), +            (UZP1_PPP_H $p1, $p2)>; +  def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), +            (UZP1_PPP_B $p1, $p2)>; +    defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;    defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;    defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td index a005d1e65abe..c56a65b9e212 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7718,9 +7718,13 @@ multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,                    (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;    // Base addressing mode -  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), -            (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; - +  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)), +            (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; +  let AddedComplexity = 2 in { +    // Reg + Imm addressing mode +    def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))), +              (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>; +  }  }  class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList, diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 11454841cab7..5c1a4cb16568 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9111,13 +9111,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {                       Op0.getOperand(1));  } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {    const SDValue *InputLoad = &Op;    if (InputLoad->getOpcode() == ISD::BITCAST)      InputLoad = &InputLoad->getOperand(0);    if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || -      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) +      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { +    IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;      InputLoad = &InputLoad->getOperand(0); +  }    if (InputLoad->getOpcode() != ISD::LOAD)      return nullptr;    LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); @@ -9289,7 +9291,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,    if (!BVNIsConstantSplat || SplatBitSize > 32) { -    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); +    bool IsPermutedLoad = false; +    const SDValue *InputLoad = +        getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);      // Handle load-and-splat patterns as we have instructions that will do this      // in one go.      if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9912,7 +9916,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,    // If this is a load-and-splat, we can do that with a single instruction    // in some cases. 
However if the load has multiple uses, we don't want to    // combine it because that will just produce multiple loads. -  const SDValue *InputLoad = getNormalLoadInput(V1); +  bool IsPermutedLoad = false; +  const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);    if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&        (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&        InputLoad->hasOneUse()) { @@ -9920,6 +9925,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,      int SplatIdx =        PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); +    // The splat index for permuted loads will be in the left half of the vector +    // which is strictly wider than the loaded value by 8 bytes. So we need to +    // adjust the splat index to point to the correct address in memory. +    if (IsPermutedLoad) { +      assert(isLittleEndian && "Unexpected permuted load on big endian target"); +      SplatIdx += IsFourByte ? 2 : 1; +      assert((SplatIdx < (IsFourByte ? 4 : 2)) && +             "Splat of a value outside of the loaded memory"); +    } +      LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);      // For 4-byte load-and-splat, we need Power9.      if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9929,10 +9944,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,        else          Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; -      // If we are loading a partial vector, it does not make sense to adjust -      // the base pointer. This happens with (splat (s_to_v_permuted (ld))). -      if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64)) -        Offset = 0;        SDValue BasePtr = LD->getBasePtr();        if (Offset != 0)          BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index a0ae05081adc..7570385e38e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -184,6 +184,330 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {    return false;  } +// Check that it is a SLOI (Shift Left Ones Immediate). 
We first check that +// it is the right node tree: +// +//  (OR (SHL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +//  VC1 == maskTrailingOnes<uint64_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) { +  MVT XLenVT = Subtarget->getXLenVT(); +  if (N.getOpcode() == ISD::OR) { +    SDValue Or = N; +    if (Or.getOperand(0).getOpcode() == ISD::SHL) { +      SDValue Shl = Or.getOperand(0); +      if (isa<ConstantSDNode>(Shl.getOperand(1)) && +          isa<ConstantSDNode>(Or.getOperand(1))) { +        if (XLenVT == MVT::i64) { +          uint64_t VC1 = Or.getConstantOperandVal(1); +          uint64_t VC2 = Shl.getConstantOperandVal(1); +          if (VC1 == maskTrailingOnes<uint64_t>(VC2)) { +            RS1 = Shl.getOperand(0); +            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                           Shl.getOperand(1).getValueType()); +            return true; +          } +        } +        if (XLenVT == MVT::i32) { +          uint32_t VC1 = Or.getConstantOperandVal(1); +          uint32_t VC2 = Shl.getConstantOperandVal(1); +          if (VC1 == maskTrailingOnes<uint32_t>(VC2)) { +            RS1 = Shl.getOperand(0); +            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                           Shl.getOperand(1).getValueType()); +            return true; +          } +        } +      } +    } +  } +  return false; +} + +// Check that it is a SROI (Shift Right Ones Immediate). We first check that +// it is the right node tree: +// +//  (OR (SRL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +//  VC1 == maskLeadingOnes<uint64_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) { +  MVT XLenVT = Subtarget->getXLenVT(); +  if (N.getOpcode() == ISD::OR) { +    SDValue Or = N; +    if (Or.getOperand(0).getOpcode() == ISD::SRL) { +      SDValue Srl = Or.getOperand(0); +      if (isa<ConstantSDNode>(Srl.getOperand(1)) && +          isa<ConstantSDNode>(Or.getOperand(1))) { +        if (XLenVT == MVT::i64) { +          uint64_t VC1 = Or.getConstantOperandVal(1); +          uint64_t VC2 = Srl.getConstantOperandVal(1); +          if (VC1 == maskLeadingOnes<uint64_t>(VC2)) { +            RS1 = Srl.getOperand(0); +            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                           Srl.getOperand(1).getValueType()); +            return true; +          } +        } +        if (XLenVT == MVT::i32) { +          uint32_t VC1 = Or.getConstantOperandVal(1); +          uint32_t VC2 = Srl.getConstantOperandVal(1); +          if (VC1 == maskLeadingOnes<uint32_t>(VC2)) { +            RS1 = Srl.getOperand(0); +            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                           Srl.getOperand(1).getValueType()); +            return true; +          } +        } +      } +    } +  } +  return false; +} + +// Check that it is a RORI (Rotate Right Immediate). We first check that +// it is the right node tree: +// +//  (ROTL RS1, VC) +// +// The compiler translates immediate rotations to the right given by the call +// to the rotateright32/rotateright64 intrinsics as rotations to the left. +// Since the rotation to the left can be easily emulated as a rotation to the +// right by negating the constant, there is no encoding for ROLI. 
+// We then select the immediate left rotations as RORI by the complementary +// constant: +// +//  Shamt == XLen - VC + +bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) { +  MVT XLenVT = Subtarget->getXLenVT(); +  if (N.getOpcode() == ISD::ROTL) { +    if (isa<ConstantSDNode>(N.getOperand(1))) { +      if (XLenVT == MVT::i64) { +        uint64_t VC = N.getConstantOperandVal(1); +        Shamt = CurDAG->getTargetConstant((64 - VC), SDLoc(N), +                                          N.getOperand(1).getValueType()); +        RS1 = N.getOperand(0); +        return true; +      } +      if (XLenVT == MVT::i32) { +        uint32_t VC = N.getConstantOperandVal(1); +        Shamt = CurDAG->getTargetConstant((32 - VC), SDLoc(N), +                                          N.getOperand(1).getValueType()); +        RS1 = N.getOperand(0); +        return true; +      } +    } +  } +  return false; +} + + +// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32 +// on RV64). +// SLLIUW is the same as SLLI except for the fact that it clears the bits +// XLEN-1:32 of the input RS1 before shifting. +// We first check that it is the right node tree: +// +//  (AND (SHL RS1, VC2), VC1) +// +// We check that VC2, the shamt is less than 32, otherwise the pattern is +// exactly the same as SLLI and we give priority to that. +// Eventually we check that that VC1, the mask used to clear the upper 32 bits +// of RS1, is correct: +// +//  VC1 == (0xFFFFFFFF << VC2) + +bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) { +  if (N.getOpcode() == ISD::AND && Subtarget->getXLenVT() == MVT::i64) { +    SDValue And = N; +    if (And.getOperand(0).getOpcode() == ISD::SHL) { +      SDValue Shl = And.getOperand(0); +      if (isa<ConstantSDNode>(Shl.getOperand(1)) && +          isa<ConstantSDNode>(And.getOperand(1))) { +        uint64_t VC1 = And.getConstantOperandVal(1); +        uint64_t VC2 = Shl.getConstantOperandVal(1); +        if (VC2 < 32 && VC1 == ((uint64_t)0xFFFFFFFF << VC2)) { +          RS1 = Shl.getOperand(0); +          Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                                            Shl.getOperand(1).getValueType()); +          return true; +        } +      } +    } +  } +  return false; +} + +// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64). 
+// We first check that it is the right node tree: +// +//  (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1)) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +//  VC1 == maskTrailingOnes<uint32_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) { +  if (Subtarget->getXLenVT() == MVT::i64 && +      N.getOpcode() == ISD::SIGN_EXTEND_INREG && +      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { +    if (N.getOperand(0).getOpcode() == ISD::OR) { +      SDValue Or = N.getOperand(0); +      if (Or.getOperand(0).getOpcode() == ISD::SHL) { +        SDValue Shl = Or.getOperand(0); +        if (isa<ConstantSDNode>(Shl.getOperand(1)) && +            isa<ConstantSDNode>(Or.getOperand(1))) { +          uint32_t VC1 = Or.getConstantOperandVal(1); +          uint32_t VC2 = Shl.getConstantOperandVal(1); +          if (VC1 == maskTrailingOnes<uint32_t>(VC2)) { +            RS1 = Shl.getOperand(0); +            Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                                              Shl.getOperand(1).getValueType()); +            return true; +          } +        } +      } +    } +  } +  return false; +} + +// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64). +// We first check that it is the right node tree: +// +//  (OR (SRL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +//  VC1 == maskLeadingOnes<uint32_t>(VC2) + +bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) { +  if (N.getOpcode() == ISD::OR && Subtarget->getXLenVT() == MVT::i64) { +    SDValue Or = N; +    if (Or.getOperand(0).getOpcode() == ISD::SRL) { +      SDValue Srl = Or.getOperand(0); +      if (isa<ConstantSDNode>(Srl.getOperand(1)) && +          isa<ConstantSDNode>(Or.getOperand(1))) { +        uint32_t VC1 = Or.getConstantOperandVal(1); +        uint32_t VC2 = Srl.getConstantOperandVal(1); +        if (VC1 == maskLeadingOnes<uint32_t>(VC2)) { +          RS1 = Srl.getOperand(0); +          Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), +                                            Srl.getOperand(1).getValueType()); +          return true; +        } +      } +    } +  } +  return false; +} + +// Check that it is a RORIW (i32 Right Rotate Immediate on RV64). +// We first check that it is the right node tree: +// +//  (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2), +//                         (SRL (AND (AssertSext RS2, i32), VC3), VC1))) +// +// Then we check that the constant operands respect these constraints: +// +// VC2 == 32 - VC1 +// VC3 == maskLeadingOnes<uint32_t>(VC2) +// +// where VC1 is the Shamt we need, VC2 is the complement of Shamt with respect +// to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones. 
+ +bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) { +  if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && +      Subtarget->getXLenVT() == MVT::i64 && +      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { +    if (N.getOperand(0).getOpcode() == ISD::OR) { +      SDValue Or = N.getOperand(0); +      if (Or.getOperand(0).getOpcode() == ISD::SHL && +          Or.getOperand(1).getOpcode() == ISD::SRL) { +        SDValue Shl = Or.getOperand(0); +        SDValue Srl = Or.getOperand(1); +        if (Srl.getOperand(0).getOpcode() == ISD::AND) { +          SDValue And = Srl.getOperand(0); +          if (isa<ConstantSDNode>(Srl.getOperand(1)) && +              isa<ConstantSDNode>(Shl.getOperand(1)) && +              isa<ConstantSDNode>(And.getOperand(1))) { +            uint32_t VC1 = Srl.getConstantOperandVal(1); +            uint32_t VC2 = Shl.getConstantOperandVal(1); +            uint32_t VC3 = And.getConstantOperandVal(1); +            if (VC2 == (32 - VC1) && +                VC3 == maskLeadingOnes<uint32_t>(VC2)) { +              RS1 = Shl.getOperand(0); +              Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N), +                                              Srl.getOperand(1).getValueType()); +              return true; +            } +          } +        } +      } +    } +  } +  return false; +} + +// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64). +// We first check that it is the right node tree: +// +//  (SIGN_EXTEND_INREG (OR (SHL (AssertSext RS1, i32), VC2), +//                         (SRL (AND (AssertSext RS2, i32), VC3), VC1))) +// +// Then we check that the constant operands respect these constraints: +// +// VC2 == 32 - VC1 +// VC3 == maskLeadingOnes<uint32_t>(VC2) +// +// where VC1 is the Shamt we need, VC2 is the complement of Shamt with respect +// to 32, and VC3 is a 32-bit mask of (32 - VC1) leading ones. + +bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, +                                    SDValue &Shamt) { +  if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && +      Subtarget->getXLenVT() == MVT::i64 && +      cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { +    if (N.getOperand(0).getOpcode() == ISD::OR) { +      SDValue Or = N.getOperand(0); +      if (Or.getOperand(0).getOpcode() == ISD::SHL && +          Or.getOperand(1).getOpcode() == ISD::SRL) { +        SDValue Shl = Or.getOperand(0); +        SDValue Srl = Or.getOperand(1); +        if (Srl.getOperand(0).getOpcode() == ISD::AND) { +          SDValue And = Srl.getOperand(0); +          if (isa<ConstantSDNode>(Srl.getOperand(1)) && +              isa<ConstantSDNode>(Shl.getOperand(1)) && +              isa<ConstantSDNode>(And.getOperand(1))) { +            uint32_t VC1 = Srl.getConstantOperandVal(1); +            uint32_t VC2 = Shl.getConstantOperandVal(1); +            uint32_t VC3 = And.getConstantOperandVal(1); +            if (VC2 == (32 - VC1) && +                VC3 == maskLeadingOnes<uint32_t>(VC2)) { +              RS1 = Shl.getOperand(0); +              RS2 = And.getOperand(0); +              Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N), +                                              Srl.getOperand(1).getValueType()); +              return true; +            } +          } +        } +      } +    } +  } +  return false; +} +  // Merge an ADDI into the offset of a load/store instruction where possible.  
// (load (addi base, off1), off2) -> (load base, off1+off2)  // (store val, (addi base, off1), off2) -> (store val, base, off1+off2) diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index dcf733ec3675..0ca12510a230 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -45,6 +45,15 @@ public:    bool SelectAddrFI(SDValue Addr, SDValue &Base); +  bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt); +  bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt); +  // Include the pieces autogenerated from the target description.  #include "RISCVGenDAGISel.inc" diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 91fc69b5bc10..03d9eefd59d0 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -149,12 +149,27 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);    setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); -  setOperationAction(ISD::ROTL, XLenVT, Expand); -  setOperationAction(ISD::ROTR, XLenVT, Expand); -  setOperationAction(ISD::BSWAP, XLenVT, Expand); -  setOperationAction(ISD::CTTZ, XLenVT, Expand); -  setOperationAction(ISD::CTLZ, XLenVT, Expand); -  setOperationAction(ISD::CTPOP, XLenVT, Expand); +  if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) { +    setOperationAction(ISD::ROTL, XLenVT, Expand); +    setOperationAction(ISD::ROTR, XLenVT, Expand); +  } + +  if (!Subtarget.hasStdExtZbp()) +    setOperationAction(ISD::BSWAP, XLenVT, Expand); + +  if (!Subtarget.hasStdExtZbb()) { +    setOperationAction(ISD::CTTZ, XLenVT, Expand); +    setOperationAction(ISD::CTLZ, XLenVT, Expand); +    setOperationAction(ISD::CTPOP, XLenVT, Expand); +  } + +  if (Subtarget.hasStdExtZbp()) +    setOperationAction(ISD::BITREVERSE, XLenVT, Legal); + +  if (Subtarget.hasStdExtZbt()) { +    setOperationAction(ISD::FSHL, XLenVT, Legal); +    setOperationAction(ISD::FSHR, XLenVT, Legal); +  }    ISD::CondCode FPCCToExtend[] = {        ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index 34a463626e29..afac509f743d 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -632,3 +632,432 @@ let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] i  def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0),                    (C_ZEXTW GPRC:$rs1)>;  } // Predicates = [HasStdExtZbproposedc, HasStdExtC, IsRV64] + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// +def SLOIPat   : 
ComplexPattern<XLenVT, 2, "SelectSLOI", [or]>; +def SROIPat   : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>; +def RORIPat   : ComplexPattern<XLenVT, 2, "SelectRORI", [rotl]>; +def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>; +def SLOIWPat  : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>; +def SROIWPat  : ComplexPattern<i64, 2, "SelectSROIW", [or]>; +def RORIWPat  : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>; +def FSRIWPat  : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>; + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>; +def : Pat<(or  GPR:$rs1, (not GPR:$rs2)), (ORN  GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] + +let Predicates = [HasStdExtZbb] in { +def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1), +          (SLO GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1), +          (SRO GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1), +          (SBCLR GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1), +          (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs] in +def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), +          (SBSET GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), +          (SBSET GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), +          (SBINV GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), +          (SBINV GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1), +          (SBEXT GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1), +          (SBEXT GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt), +          (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt), +          (SROI GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb] + +// There's no encoding for roli in the current version of the 'B' extension +// (v0.92) as it can be implemented with rori by negating the immediate. +// For this reason we pattern-match only against rori[w]. +let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt), +          (RORI GPR:$rs1, uimmlog2xlen:$shamt)>; + +// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are +// pattern-matched by simple andi, ori, and xori. 
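As a quick sanity check of the identities the SLOI/SROI/RORI selectors above rely on, here is a minimal standalone sketch (plain C++; trailingOnes/leadingOnes are local stand-ins for llvm::maskTrailingOnes/llvm::maskLeadingOnes, not the LLVM helpers themselves):

#include <cassert>
#include <cstdint>

// Local stand-ins for llvm::maskTrailingOnes<uint64_t> / llvm::maskLeadingOnes<uint64_t>.
static uint64_t trailingOnes(unsigned N) { return N == 0 ? 0 : ~0ULL >> (64 - N); }
static uint64_t leadingOnes(unsigned N) { return N == 0 ? 0 : ~0ULL << (64 - N); }

int main() {
  const uint64_t X = 0x0123456789ABCDEFULL;
  const unsigned C = 8;
  // SLOI: shift left and fill the vacated low bits with ones, so the OR
  // constant (VC1) must equal maskTrailingOnes(VC2).
  assert(((X << C) | trailingOnes(C)) == ((X << C) | 0xFF));
  // SROI: shift right and fill the vacated high bits with ones, so the OR
  // constant (VC1) must equal maskLeadingOnes(VC2).
  assert(((X >> C) | leadingOnes(C)) == ((X >> C) | 0xFF00000000000000ULL));
  // RORI: rotl(X, C) == rotr(X, XLen - C), which is why an ISD::ROTL by VC is
  // selected as RORI with shamt = XLen - VC.
  auto rotl = [](uint64_t V, unsigned S) { return (V << S) | (V >> (64 - S)); };
  auto rotr = [](uint64_t V, unsigned S) { return (V >> S) | (V << (64 - S)); };
  assert(rotl(X, C) == rotr(X, 64 - C));
  return 0;
}

With X standing for RS1 and C for the shamt, these are the same VC1/VC2 relationships checked by SelectSLOI and SelectSROI, and the complement used by SelectRORI, above.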
+let Predicates = [HasStdExtZbs] in +def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)), +          (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1), +              (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))), +          (GORCI GPR:$rs1, (i32 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1), +              (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))), +          (GORCI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1), +              (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))), +          (GORCI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1), +              (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))), +          (GORCI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1), +              (shl GPR:$rs1, (i32 16))), +          (GORCI GPR:$rs1, (i32 16))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)), +                   GPR:$rs1), +              (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))), +          (GORCI GPR:$rs1, (i64 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)), +                   GPR:$rs1), +              (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))), +          (GORCI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)), +                   GPR:$rs1), +              (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))), +          (GORCI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)), +                   GPR:$rs1), +              (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))), +          (GORCI GPR:$rs1, (i64 8))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)), +                   GPR:$rs1), +              (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))), +          (GORCI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (srl GPR:$rs1, (i64 32)), GPR:$rs1), +              (shl GPR:$rs1, (i64 32))), +          (GORCI GPR:$rs1, (i64 32))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)), +              (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))), +          (GREVI GPR:$rs1, (i32 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)), +              (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))), +          (GREVI GPR:$rs1, (i32 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)), +              (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))), +          (GREVI GPR:$rs1, (i32 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)), +              (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))), +          (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))), +          (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>; +} // Predicates = 
[HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)), +              (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))), +          (GREVI GPR:$rs1, (i64 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)), +              (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))), +          (GREVI GPR:$rs1, (i64 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0)), +              (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))), +          (GREVI GPR:$rs1, (i64 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)), +              (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))), +          (GREVI GPR:$rs1, (i64 8))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)), +              (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))), +          (GREVI GPR:$rs1, (i64 16))>; +def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))), +          (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbt] in { +def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)), +          (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1), +          (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3), +          (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3), +          (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), +          (FSRI GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt] + +let Predicates = [HasStdExtZbb] in { +def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>; +def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>; +def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>; + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN  GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2), +          (MIN  GPR:$rs1, GPR:$rs2)>; +def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX  GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2), +          (MAX  GPR:$rs1, GPR:$rs2)>; +def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2), +          (MINU  GPR:$rs1, GPR:$rs2)>; +def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), +          (MAXU  GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and 
GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))), +          (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))), +          (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))), +          (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))), +          (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00), +              (and GPR:$rs1, 0x00FF)), +          (PACKH GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)), +                  (and GPR:$rs1, (i32 0xFF0000FF))), +              (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))), +          (SHFLI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)), +                  (and GPR:$rs1, (i32 0xF00FF00F))), +              (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))), +          (SHFLI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)), +                  (and GPR:$rs1, (i32 0xC3C3C3C3))), +              (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))), +          (SHFLI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)), +                  (and GPR:$rs1, (i32 0x99999999))), +              (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))), +          (SHFLI GPR:$rs1, (i32 1))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)), +                  (and GPR:$rs1, (i64 0xFFFF00000000FFFF))), +              (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))), +          (SHFLI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)), +                  (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))), +              (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))), +          (SHFLI GPR:$rs1, (i64 8))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)), +                  (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))), +              (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))), +          (SHFLI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)), +                  (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))), +              (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))), +          (SHFLI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)), +                  (and GPR:$rs1, (i64 0x9999999999999999))), +              (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))), +          (SHFLI GPR:$rs1, (i64 1))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)), +          (ADDIWU GPR:$rs, simm12:$simm12)>; +def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt), +          (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), +          (ADDWU GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), +          (SUBWU GPR:$rs1, 
GPR:$rs2)>; +def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), +          (ADDUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), +          (SUBUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1), +          (SLOW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1), +          (SROW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), +              (riscv_srlw (assertsexti32 GPR:$rs1), +                          (sub (i64 0), (assertsexti32 GPR:$rs2)))), +          (ROLW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), +                          (sub (i64 0), (assertsexti32 GPR:$rs2))), +              (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))), +          (RORW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] + +let Predicates = [HasStdExtZbs, IsRV64] in { +def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1), +               (assertsexti32 GPR:$rs1)), +          (SBCLRW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)), +              (assertsexti32 GPR:$rs1)), +          (SBSETW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), +               (assertsexti32 GPR:$rs1)), +          (SBINVW GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), +               1), +          (SBEXTW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbs, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), +          (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt), +          (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt), +          (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>; + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)), +                              GPR:$rs1), +                          (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))), +                      i32), +          (GORCIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)), +                              GPR:$rs1), +                          (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))), +                      i32), +          (GORCIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)), +                              GPR:$rs1), +                          (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))), +                      i32), +          (GORCIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)), +                              GPR:$rs1), +                          (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))), +                      i32), +          (GORCIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)), +                              GPR:$rs1), +                          (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))), +                      i32), +          (GORCIW GPR:$rs1, (i64 
16))>; +def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)), +                              GPR:$rs1), +                          (shl GPR:$rs1, (i64 16))), i32), +          (GORCIW GPR:$rs1, (i64 16))>; + +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)), +                          (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))), +                      i32), +          (GREVIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)), +                          (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))), +                      i32), +          (GREVIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)), +                          (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))), +                      i32), +          (GREVIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)), +                          (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))), +                      i32), +          (GREVIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)), +                          (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32), +          (GREVIW GPR:$rs1, (i64 16))>; +def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>; +def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbt, IsRV64] in { +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), +                          (i64 0), +                          (i64 17), +                          (assertsexti32 GPR:$rs1), +                          (or (riscv_sllw (assertsexti32 GPR:$rs1), +                                          (and (assertsexti32 GPR:$rs3), 31)), +                              (riscv_srlw (assertsexti32 GPR:$rs2), +                                          (sub (i64 32), +                                               (assertsexti32 GPR:$rs3))))), +          (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), +                          (i64 0), +                          (i64 17), +                          (assertsexti32 GPR:$rs2), +                          (or (riscv_sllw (assertsexti32 GPR:$rs1), +                                          (sub (i64 32), +                                               (assertsexti32 GPR:$rs3))), +                              (riscv_srlw (assertsexti32 GPR:$rs2), +                                          (and (assertsexti32 GPR:$rs3), 31)))), +          (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), +          (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), +          (CLZW GPR:$rs1)>; +// We don't pattern-match CTZW here as it has the same pattern and result as +// RV64 CTZ +def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)), +                          (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)), +                      i32), +          (PACKW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (and (assertsexti32 
GPR:$rs2), 0xFFFFFFFFFFFF0000), +              (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000), +                   (i64 16))), +          (PACKUW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp index f8b6b7eb3aff..86aa85e965f6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30953,6 +30953,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {  //                           X86 Scheduler Hooks  //===----------------------------------------------------------------------===// +// Returns true if EFLAG is consumed after this iterator in the rest of the +// basic block or any successors of the basic block. +static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, +                              MachineBasicBlock *BB) { +  // Scan forward through BB for a use/def of EFLAGS. +  for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end(); +         miI != miE; ++miI) { +    const MachineInstr& mi = *miI; +    if (mi.readsRegister(X86::EFLAGS)) +      return true; +    // If we found a def, we can stop searching. +    if (mi.definesRegister(X86::EFLAGS)) +      return false; +  } + +  // If we hit the end of the block, check whether EFLAGS is live into a +  // successor. +  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), +                                        sEnd = BB->succ_end(); +       sItr != sEnd; ++sItr) { +    MachineBasicBlock* succ = *sItr; +    if (succ->isLiveIn(X86::EFLAGS)) +      return true; +  } + +  return false; +} +  /// Utility function to emit xbegin specifying the start of an RTM region.  static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,                                       const TargetInstrInfo *TII) { @@ -30985,6 +31013,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,    MF->insert(I, fallMBB);    MF->insert(I, sinkMBB); +  if (isEFLAGSLiveAfter(MI, MBB)) { +    mainMBB->addLiveIn(X86::EFLAGS); +    fallMBB->addLiveIn(X86::EFLAGS); +    sinkMBB->addLiveIn(X86::EFLAGS); +  } +    // Transfer the remainder of BB and its successor edges to sinkMBB.    sinkMBB->splice(sinkMBB->begin(), MBB,                    std::next(MachineBasicBlock::iterator(MI)), MBB->end()); @@ -31373,27 +31407,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(  static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,                                       MachineBasicBlock* BB,                                       const TargetRegisterInfo* TRI) { -  // Scan forward through BB for a use/def of EFLAGS. -  MachineBasicBlock::iterator miI(std::next(SelectItr)); -  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { -    const MachineInstr& mi = *miI; -    if (mi.readsRegister(X86::EFLAGS)) -      return false; -    if (mi.definesRegister(X86::EFLAGS)) -      break; // Should have kill-flag - update below. -  } - -  // If we hit the end of the block, check whether EFLAGS is live into a -  // successor. 
-  if (miI == BB->end()) { -    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), -                                          sEnd = BB->succ_end(); -         sItr != sEnd; ++sItr) { -      MachineBasicBlock* succ = *sItr; -      if (succ->isLiveIn(X86::EFLAGS)) -        return false; -    } -  } +  if (isEFLAGSLiveAfter(SelectItr, BB)) +    return false;    // We found a def, or hit the end of the basic block and EFLAGS wasn't live    // out. SelectMI should have a kill flag on EFLAGS. @@ -44349,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,  /// A horizontal-op B, for some already available A and B, and if so then LHS is  /// set to A, RHS to B, and the routine returns 'true'.  static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, -                              const X86Subtarget &Subtarget, -                              bool IsCommutative) { +                              const X86Subtarget &Subtarget, bool IsCommutative, +                              SmallVectorImpl<int> &PostShuffleMask) {    // If either operand is undef, bail out. The binop should be simplified.    if (LHS.isUndef() || RHS.isUndef())      return false; @@ -44443,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,        RMask.push_back(i);    } +  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split). +  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() && +      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) || +       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask))) +    return false; +    // If A and B occur in reverse order in RHS, then canonicalize by commuting    // RHS operands and shuffle mask.    if (A != C) { @@ -44453,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,    if (!(A == C && B == D))      return false; +  PostShuffleMask.clear(); +  PostShuffleMask.append(NumElts, SM_SentinelUndef); +    // LHS and RHS are now:    //   LHS = shuffle A, B, LMask    //   RHS = shuffle A, B, RMask @@ -44461,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,    // so we just repeat the inner loop if this is a 256-bit op.    unsigned Num128BitChunks = VT.getSizeInBits() / 128;    unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; +  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;    assert((NumEltsPer128BitChunk % 2 == 0) &&           "Vector type should have an even number of elements in each lane");    for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44472,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,            (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))          continue; +      // Check that successive odd/even elements are being operated on. If not, +      // this is not a horizontal operation. +      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && +          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) +        return false; + +      // Compute the post-shuffle mask index based on where the element +      // is stored in the HOP result, and where it needs to be moved to. +      int Base = LIdx & ~1u; +      int Index = ((Base % NumEltsPer128BitChunk) / 2) + +                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); +        // The  low half of the 128-bit result must choose from A.        
// The high half of the 128-bit result must choose from B,        // unless B is undef. In that case, we are always choosing from A. -      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; -      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - -      // Check that successive elements are being operated on. If not, this is -      // not a horizontal operation. -      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; -      if (!(LIdx == Index && RIdx == Index + 1) && -          !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) -        return false; +      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) +        Index += NumEltsPer64BitChunk; +      PostShuffleMask[i + j] = Index;      }    }    LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.    RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. -  if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) +  bool IsIdentityPostShuffle = +      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); +  if (IsIdentityPostShuffle) +    PostShuffleMask.clear(); + +  // Assume a SingleSource HOP if we only shuffle one input and don't need to +  // shuffle the result. +  if (!shouldUseHorizontalOp(LHS == RHS && +                                 (NumShuffles < 2 || !IsIdentityPostShuffle), +                             DAG, Subtarget))      return false;    LHS = DAG.getBitcast(VT, LHS); @@ -44509,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,    assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");    // Try to synthesize horizontal add/sub from adds/subs of shuffles. +  SmallVector<int, 8> PostShuffleMask;    if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && -      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) -    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); +      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { +    SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); +    if (!PostShuffleMask.empty()) +      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, +                                        DAG.getUNDEF(VT), PostShuffleMask); +    return HorizBinOp; +  }    // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. @@ -47605,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,    bool IsAdd = N->getOpcode() == ISD::ADD;    assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); +  SmallVector<int, 8> PostShuffleMask;    if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||         VT == MVT::v8i32) &&        Subtarget.hasSSSE3() && -      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { +      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {      auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,                                ArrayRef<SDValue> Ops) { -      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, -                         DL, Ops[0].getValueType(), Ops); +      return DAG.getNode(IsAdd ? 
X86ISD::HADD : X86ISD::HSUB, DL, +                         Ops[0].getValueType(), Ops);      }; -    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, -                            HOpBuilder); +    SDValue HorizBinOp = +        SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); +    if (!PostShuffleMask.empty()) +      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, +                                        DAG.getUNDEF(VT), PostShuffleMask); +    return HorizBinOp;    }    return SDValue(); diff --git a/contrib/llvm-project/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/contrib/llvm-project/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index c40901255424..cd39428b9c38 100644 --- a/contrib/llvm-project/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/contrib/llvm-project/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -191,9 +191,11 @@ static void appendFile(std::vector<NewArchiveMember> &Members,    file_magic Magic = identify_magic(MB.getBuffer());    if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && -      Magic != file_magic::archive && Magic != file_magic::windows_resource) { +      Magic != file_magic::archive && Magic != file_magic::windows_resource && +      Magic != file_magic::coff_import_library) {      llvm::errs() << MB.getBufferIdentifier() -                 << ": not a COFF object, bitcode, archive or resource file\n"; +                 << ": not a COFF object, bitcode, archive, import library or " +                    "resource file\n";      exit(1);    } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 7cfe4c8b5892..c7f2f4ec3ca1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,                                                             APInt DemandedElts,                                                             int DMaskIdx) { -  // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. -  if (DMaskIdx < 0 && -      II->getType()->getScalarSizeInBits() != 32 && -      DemandedElts.getActiveBits() == 3) -    return nullptr; -    auto *IIVTy = cast<VectorType>(II->getType());    unsigned VWidth = IIVTy->getNumElements();    if (VWidth == 1) @@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,    if (!NewNumElts)      return UndefValue::get(II->getType()); +  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are +  // fully supported. 
+  if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3) +    return nullptr; +    if (NewNumElts >= VWidth && DemandedElts.isMask()) {      if (DMaskIdx >= 0)        II->setArgOperand(DMaskIdx, Args[DMaskIdx]); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index b3254c10a0b2..17a5ec3f87fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1543,7 +1543,7 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {    Constant *C;    if (match(&Inst,              m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))), -                      m_Constant(C))) && +                      m_Constant(C))) && !isa<ConstantExpr>(C) &&        cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) {      assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() &&             "Shuffle should not change scalar type"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9d0500419a7f..2f379b7f6160 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1859,6 +1859,14 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {      return true;    } +  // If any of predecessors end with an indirect goto, we can't change its +  // destination. Same for CallBr. +  if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { +        return isa<IndirectBrInst>(Pred->getTerminator()) || +               isa<CallBrInst>(Pred->getTerminator()); +      })) +    return false; +    // Try to duplicate BB into PredBB.    return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);  } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc35aa4695f..f950d0d4eb2b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7397,8 +7397,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {        // Look for the next elements with the same type.        SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;        Type *EltTy = (*IncIt)->getType(); -      unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy) -                                          : MaxVecRegSize; + +      assert(EltTy->isSized() && +             "Instructions should all be sized at this point"); +      TypeSize EltTS = DL->getTypeSizeInBits(EltTy); +      if (EltTS.isScalable()) { +        // For now, just ignore vectorizing scalable types. +        ++IncIt; +        continue; +      } + +      unsigned EltSize = EltTS.getFixedSize();        unsigned MaxNumElts = MaxVecRegSize / EltSize;        if (MaxNumElts < 2) {          ++IncIt;  | 
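To make the PostShuffleMask computation in the isHorizontalBinOp change above more concrete, the following standalone sketch re-derives the mask for a hypothetical v4f32 case, assuming LMask = {0,4,2,6} and RMask = {1,5,3,7} (an interleaved add of A and B); it mirrors the Base/Index arithmetic of that hunk but is only an illustration, not the LLVM code:

#include <cassert>
#include <vector>

int main() {
  // Hypothetical v4f32 case: LHS = shuffle<0,4,2,6>(A,B), RHS = shuffle<1,5,3,7>(A,B).
  const int NumElts = 4, NumEltsPer128BitChunk = 4;
  const int NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
  const int LMask[] = {0, 4, 2, 6}, RMask[] = {1, 5, 3, 7};
  std::vector<int> PostShuffleMask(NumElts);
  for (int i = 0; i != NumElts; ++i) {
    const int LIdx = LMask[i], RIdx = RMask[i];
    // Successive even/odd elements must form a pair for a horizontal op.
    assert((RIdx & 1) == 1 && LIdx + 1 == RIdx);
    const int Base = LIdx & ~1;
    int Index = ((Base % NumEltsPer128BitChunk) / 2) +
                ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
    if (Base >= NumElts) // pairs taken from B land in the high half of the HOP
      Index += NumEltsPer64BitChunk;
    PostShuffleMask[i] = Index;
  }
  // HADDPS yields [a0+a1, a2+a3, b0+b1, b2+b3]; the interleaved add modelled
  // above therefore needs the post-shuffle mask {0, 2, 1, 3}.
  assert(PostShuffleMask == std::vector<int>({0, 2, 1, 3}));
  return 0;
}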

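Similarly, the constraints checked by SelectRORIW/SelectFSRIW above (VC2 == 32 - VC1 and VC3 == maskLeadingOnes<uint32_t>(VC2)) reduce to plain 32-bit identities. The sketch below uses a local leadingOnes32 helper as a stand-in for llvm::maskLeadingOnes<uint32_t> and only checks the arithmetic, not the DAG matching:

#include <cassert>
#include <cstdint>

// Stand-in for llvm::maskLeadingOnes<uint32_t>.
static uint32_t leadingOnes32(unsigned N) { return N == 0 ? 0 : ~0u << (32 - N); }

int main() {
  const uint32_t A = 0xDEADBEEF, B = 0x12345678;
  for (unsigned VC1 = 1; VC1 < 32; ++VC1) {
    const uint32_t VC2 = 32 - VC1;           // complement of the shamt
    const uint32_t VC3 = leadingOnes32(VC2); // mask of VC2 leading ones
    // (SHL A, VC2) | (SRL (AND B, VC3), VC1) is a 32-bit funnel shift right;
    // the AND only clears bits that the SRL would discard anyway.
    const uint32_t Fsr = (A << VC2) | ((B & VC3) >> VC1);
    assert(Fsr == ((A << (32 - VC1)) | (B >> VC1)));
    // With both inputs equal, the same tree is a plain rotate right (RORIW).
    const uint32_t Ror = (A << VC2) | ((A & VC3) >> VC1);
    assert(Ror == ((A >> VC1) | (A << (32 - VC1))));
  }
  return 0;
}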